Warn about duplicate URL contents.

This commit is contained in:
Bastian Kleineidam 2012-09-17 19:49:50 +02:00
parent 02a09dbb28
commit 4e59056ee7
9 changed files with 113 additions and 14 deletions

View file

@ -3,13 +3,14 @@
Features:
- checking: Allow specification of maximum checking time or maximum
number of checked URLs.
- checking: Send a HTTP Do-Not-Track header.
- checking: Check URL length. Print error on URL longer than 2000 characters,
warning for longer than 255 characters.
- checking: Warn about duplicate URL contents.
Changes:
- doc: Mention 7-zip to extract the .tar.xz under Windows.
Closes: SF bug #3564733
- checking: Send a HTTP Do-Not-Track header.
- checking: Check URL length. Print error on URL longer than 2000 characters,
warning for longer than 255 characters.
- logging: Print download and cache statistics in text output logger.
- logging: Print warning tag in text output logger. Makes warning filtering
easier.

71
linkcheck/cache/content.py vendored Normal file
View file

@ -0,0 +1,71 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2012 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Cache for content checksums.
"""
import hashlib
from ..lock import get_lock
from ..decorators import synchronized
_lock = get_lock("checksums")
class ChecksumInfo(object):
    """Store content checksums together with the URLs that produced them,
    tracking simple cache hit/miss statistics."""

    def __init__(self):
        """Initialize checksums and cache statistics."""
        # Maps checksum string -> list of URLs whose content hashed to it.
        self.checksums = {}
        # Cache statistics counters.
        self.misses = self.hits = 0

    def get_checksum_urls(self, url, checksum):
        """Look for and store checksum for URL.
        @param url: the URL for the checksum
        @ptype url: unicode
        @param checksum: the URL content checksum
        @ptype checksum: str
        @return: list of URLs matching the given checksum (except the given URL)
        @rtype: list of unicode
        """
        urls = self.checksums.get(checksum)
        if urls is None:
            # First time this checksum is seen: remember the URL, no duplicates.
            self.misses += 1
            self.checksums[checksum] = [url]
            return []
        self.hits += 1
        if url in urls:
            # URL already recorded; report only the other URLs.
            duplicates = [other for other in urls if other != url]
        else:
            # New URL for a known checksum: report current list, then record it.
            duplicates = urls[:]
            urls.append(url)
        return duplicates
# Module-level singleton holding all seen checksums.
_checksuminfo = ChecksumInfo()


@synchronized(_lock)
def get_checksum_urls(url, content):
    """See if given URL content is already stored under another URL.
    @param url: the URL for which the content is valid
    @ptype url: unicode
    @param content: the content to hash
    @ptype content: str
    @return: list of URLs with the same content (except the given URL)
    @rtype: list of unicode"""
    digest = hashlib.sha1(content).hexdigest()
    return _checksuminfo.get_checksum_urls(url, digest)

View file

@ -80,6 +80,7 @@ WARN_URL_EFFECTIVE_URL = "url-effective-url"
WARN_URL_ERROR_GETTING_CONTENT = "url-error-getting-content"
WARN_URL_ANCHOR_NOT_FOUND = "url-anchor-not-found"
WARN_URL_WARNREGEX_FOUND = "url-warnregex-found"
WARN_URL_CONTENT_DUPLICATE = "url-content-duplicate"
WARN_URL_CONTENT_SIZE_TOO_LARGE = "url-content-too-large"
WARN_URL_CONTENT_SIZE_ZERO = "url-content-size-zero"
WARN_URL_CONTENT_SIZE_UNEQUAL = "url-content-size-unequal"
@ -115,6 +116,7 @@ Warnings = {
WARN_URL_ANCHOR_NOT_FOUND: _("URL anchor was not found."),
WARN_URL_WARNREGEX_FOUND:
_("The warning regular expression was found in the URL contents."),
WARN_URL_CONTENT_DUPLICATE: _("The URL content is a duplicate of another URL."),
WARN_URL_CONTENT_SIZE_TOO_LARGE: _("The URL content size is too large."),
WARN_URL_CONTENT_SIZE_ZERO: _("The URL content size is zero."),
WARN_URL_CONTENT_SIZE_UNEQUAL: _("The URL content size and download size are unequal."),

View file

@ -220,7 +220,8 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
buf = StringIO()
def stor_data (s):
"""Helper method storing given data"""
self.aggregate.add_download_bytes(len(s))
urls = self.aggregate.add_download_data(self.cache_content_key, s)
self.warn_duplicate_content(urls)
# limit the download size
if (buf.tell() + len(s)) > self.MaxFilesizeBytes:
raise LinkCheckerError(_("FTP file size too large"))

View file

@ -695,7 +695,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
read_content()"""
data = response.read()
self._size = len(data)
self.aggregate.add_download_bytes(self._size)
urls = self.aggregate.add_download_data(self.cache_content_key, data)
self.warn_duplicate_content(urls)
encoding = headers.get_content_encoding(self.headers)
if encoding in _supported_encodings:
try:

View file

@ -41,6 +41,7 @@ from .const import (WARN_URL_EFFECTIVE_URL,
WARN_URL_CONTENT_SIZE_TOO_LARGE, WARN_URL_CONTENT_SIZE_ZERO,
WARN_URL_CONTENT_SIZE_UNEQUAL, WARN_URL_WHITESPACE,
WARN_URL_TOO_LONG, URL_MAX_LENGTH, URL_WARN_LENGTH,
WARN_URL_CONTENT_DUPLICATE,
ExcList, ExcSyntaxList, ExcNoCacheList)
# helper alias
@ -745,9 +746,19 @@ class UrlBase (object):
raise LinkCheckerError(_("File size too large"))
data = self.url_connection.read()
if not self.is_local():
self.aggregate.add_download_bytes(len(data))
urls = self.aggregate.add_download_data(self.cache_content_key, data)
self.warn_duplicate_content(urls)
return data, len(data)
def warn_duplicate_content(self, urls):
    """If given URL list is not empty, warn about duplicate URL content.
    @param urls: URLs with duplicate content
    @ptype urls: list of unicode
    """
    if not urls:
        return
    info = dict(urls=u",".join(urls), size=strformat.strsize(self.size))
    self.add_warning(
        _("Content with %(size)s is the same as in URLs (%(urls)s).") % info,
        tag=WARN_URL_CONTENT_DUPLICATE)
def check_content (self):
"""Check content data for warnings, syntax errors, viruses etc."""
if not (self.valid and self.can_get_content()):

View file

@ -21,7 +21,7 @@ import time
import threading
from .. import log, LOG_CHECK
from ..decorators import synchronized
from ..cache import urlqueue, addrinfo
from ..cache import urlqueue, addrinfo, content
from . import logger, status, checker, cleanup
@ -136,12 +136,17 @@ class Aggregate (object):
self.last_w3_call = time.time()
@synchronized(_download_lock)
def add_download_bytes(self, bytes):
"""Add given bytes to the number of downloaded bytes.
@param bytes: number of bytes downloaded
@ptype bytes: int
def add_download_data(self, url, data):
"""Add given downloaded data.
@param url: URL which data belongs to
@ptype url: unicode
@param data: downloaded data
@ptype data: string
@return: URLs with duplicate contents
@rtype: list of unicode
"""
self.downloaded_bytes += bytes
self.downloaded_bytes += len(data)
return content.get_checksum_urls(url, data)
def gather_statistics(self):
"""Gather download and cache statistics and send them to the

View file

@ -123,12 +123,14 @@ url /?d=directory&p=page1
cache key http://localhost:%(port)d/?d=directory&p=page1
real url http://localhost:%(port)d/?d=directory&p=page1
name should not be cached
warning Content with 1KB is the same as in URLs (http://localhost:%(port)d/?d=directory&p=page).
valid
url /?quoted=ü
cache key http://localhost:%(port)d/?quoted=%%C3%%BC
real url http://localhost:%(port)d/?quoted=%%C3%%BC
name html entities
warning Content with 1KB is the same as in URLs (http://localhost:%(port)d/?d=directory&p=page,http://localhost:%(port)d/?d=directory&p=page1).
valid
url clsid:12345-67890

View file

@ -152,9 +152,14 @@ class TestMail (LinkCheckTest):
self.mail_error(u"mailto:@")
self.mail_error(u"mailto:@example.org")
self.mail_error(u"mailto:a@")
self.mail_error(u"mailto:%s@%s" % (u"a"*60, u"b"*200))
url_too_long = "URL length %d is longer than 255."
url = u"mailto:%s@%s" % (u"a"*60, u"b"*200)
warning = url_too_long % len(url)
self.mail_error(url, warning=warning)
url = u"mailto:a@%s" % (u"a"*256)
warning = url_too_long % len(url)
self.mail_error(url, warning=warning)
self.mail_error(u"mailto:%s@example.org" % (u"a"*65))
self.mail_error(u"mailto:a@%s" % (u"a"*256))
self.mail_error(u'mailto:a@%s.com' % (u"a"*64))
# local part quoted
self.mail_error(u'mailto:"a""@example.com', cache_key=u'mailto:a')