diff --git a/doc/changelog.txt b/doc/changelog.txt index cd1d3e7c..4a7343ee 100644 --- a/doc/changelog.txt +++ b/doc/changelog.txt @@ -3,13 +3,14 @@ Features: - checking: Allow specification of maximum checking time or maximum number of checked URLs. +- checking: Send a HTTP Do-Not-Track header. +- checking: Check URL length. Print error on URL longer than 2000 characters, + warning for longer than 255 characters. +- checking: Warn about duplicate URL contents. Changes: - doc: Mention 7-zip to extract the .tar.xz under Windows. Closes: SF bug #3564733 -- checking: Send a HTTP Do-Not-Track header. -- checking: Check URL length. Print error on URL longer than 2000 characters, - warning for longer than 255 characters. - logging: Print download and cache statistics in text output logger. - logging: Print warning tag in text output logger. Makes warning filtering more easy. diff --git a/linkcheck/cache/content.py b/linkcheck/cache/content.py new file mode 100644 index 00000000..55fc3110 --- /dev/null +++ b/linkcheck/cache/content.py @@ -0,0 +1,71 @@ +# -*- coding: iso-8859-1 -*- +# Copyright (C) 2012 Bastian Kleineidam +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +""" +Cache for content checksums. 
+""" +import hashlib +from ..lock import get_lock +from ..decorators import synchronized + +_lock = get_lock("checksums") + +class ChecksumInfo(object): + + def __init__(self): + """Initialize checksums and cache statistics.""" + # {hash -> [URL]} + self.checksums = {} + self.misses = self.hits = 0 + + def get_checksum_urls(self, url, checksum): + """Look for and store checksum for URL. + @param url: the URL for the checksum + @ptype url: unicode + @param checksum: the URL content checksum + @ptype checksum: str + @return: list of URLs matching the given checksum (except the given URL) + @rtype: list of unicode + """ + if checksum in self.checksums: + self.hits += 1 + urls = self.checksums[checksum] + if url in urls: + res = [x for x in urls if x != url] + else: + res = urls[:] + urls.append(url) + else: + self.misses += 1 + res = [] + self.checksums[checksum] = [url] + return res + + +_checksuminfo = ChecksumInfo() + +@synchronized(_lock) +def get_checksum_urls(url, content): + """See if given URL content is already stored under another URL. 
+ @param url: the URL for which the content is valid + @ptype url: unicode + @param content: the content to hash + @ptype content: str + @return: list of URLs with the same content (except the given URL) + @rtype: list of unicode""" + checksum = hashlib.sha1(content).hexdigest() + return _checksuminfo.get_checksum_urls(url, checksum) + diff --git a/linkcheck/checker/const.py b/linkcheck/checker/const.py index 9600f6d4..81557c59 100644 --- a/linkcheck/checker/const.py +++ b/linkcheck/checker/const.py @@ -80,6 +80,7 @@ WARN_URL_EFFECTIVE_URL = "url-effective-url" WARN_URL_ERROR_GETTING_CONTENT = "url-error-getting-content" WARN_URL_ANCHOR_NOT_FOUND = "url-anchor-not-found" WARN_URL_WARNREGEX_FOUND = "url-warnregex-found" +WARN_URL_CONTENT_DUPLICATE = "url-content-duplicate" WARN_URL_CONTENT_SIZE_TOO_LARGE = "url-content-too-large" WARN_URL_CONTENT_SIZE_ZERO = "url-content-size-zero" WARN_URL_CONTENT_SIZE_UNEQUAL = "url-content-size-unequal" @@ -115,6 +116,7 @@ Warnings = { WARN_URL_ANCHOR_NOT_FOUND: _("URL anchor was not found."), WARN_URL_WARNREGEX_FOUND: _("The warning regular expression was found in the URL contents."), + WARN_URL_CONTENT_DUPLICATE: _("The URL content is a duplicate of another URL."), WARN_URL_CONTENT_SIZE_TOO_LARGE: _("The URL content size is too large."), WARN_URL_CONTENT_SIZE_ZERO: _("The URL content size is zero."), WARN_URL_CONTENT_SIZE_UNEQUAL: _("The URL content size and download size are unequal."), diff --git a/linkcheck/checker/ftpurl.py b/linkcheck/checker/ftpurl.py index 9a32f621..fcb42c3e 100644 --- a/linkcheck/checker/ftpurl.py +++ b/linkcheck/checker/ftpurl.py @@ -220,7 +220,8 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): buf = StringIO() def stor_data (s): """Helper method storing given data""" - self.aggregate.add_download_bytes(len(s)) + urls = self.aggregate.add_download_data(self.cache_content_key, s) + self.warn_duplicate_content(urls) # limit the download size if (buf.tell() + len(s)) > 
self.MaxFilesizeBytes: raise LinkCheckerError(_("FTP file size too large")) diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index 5a2a6dc8..68d0637d 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -695,7 +695,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): read_content()""" data = response.read() self._size = len(data) - self.aggregate.add_download_bytes(self._size) + urls = self.aggregate.add_download_data(self.cache_content_key, data) + self.warn_duplicate_content(urls) encoding = headers.get_content_encoding(self.headers) if encoding in _supported_encodings: try: diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index f44fc869..d9003b0b 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -41,6 +41,7 @@ from .const import (WARN_URL_EFFECTIVE_URL, WARN_URL_CONTENT_SIZE_TOO_LARGE, WARN_URL_CONTENT_SIZE_ZERO, WARN_URL_CONTENT_SIZE_UNEQUAL, WARN_URL_WHITESPACE, WARN_URL_TOO_LONG, URL_MAX_LENGTH, URL_WARN_LENGTH, + WARN_URL_CONTENT_DUPLICATE, ExcList, ExcSyntaxList, ExcNoCacheList) # helper alias @@ -745,9 +746,19 @@ class UrlBase (object): raise LinkCheckerError(_("File size too large")) data = self.url_connection.read() if not self.is_local(): - self.aggregate.add_download_bytes(len(data)) + urls = self.aggregate.add_download_data(self.cache_content_key, data) + self.warn_duplicate_content(urls) return data, len(data) + def warn_duplicate_content(self, urls): + """If given URL list is not empty, warn about duplicate URL content. 
+ @param urls: URLs with duplicate content + @ptype urls: list of unicode + """ + if urls: + args = dict(urls=u",".join(urls), size=strformat.strsize(self.size)) + self.add_warning(_("Content with %(size)s is the same as in URLs (%(urls)s).") % args, tag=WARN_URL_CONTENT_DUPLICATE) + def check_content (self): """Check content data for warnings, syntax errors, viruses etc.""" if not (self.valid and self.can_get_content()): diff --git a/linkcheck/director/aggregator.py b/linkcheck/director/aggregator.py index cdb5d615..ad7e223e 100644 --- a/linkcheck/director/aggregator.py +++ b/linkcheck/director/aggregator.py @@ -21,7 +21,7 @@ import time import threading from .. import log, LOG_CHECK from ..decorators import synchronized -from ..cache import urlqueue, addrinfo +from ..cache import urlqueue, addrinfo, content from . import logger, status, checker, cleanup @@ -136,12 +136,17 @@ class Aggregate (object): self.last_w3_call = time.time() @synchronized(_download_lock) - def add_download_bytes(self, bytes): - """Add gibven bytes to number of downloaded bytes. - @param bytes: number of bytes downloaded - @ptype bytes: int + def add_download_data(self, url, data): + """Add given downloaded data. 
+ @param url: URL which data belongs to + @ptype url: unicode + @param data: downloaded data + @ptype data: string + @return: URLs with duplicate contents + @rtype: list of unicode """ - self.downloaded_bytes += bytes + self.downloaded_bytes += len(data) + return content.get_checksum_urls(url, data) def gather_statistics(self): """Gather download and cache statistics and send them to the diff --git a/tests/checker/data/http.html.result b/tests/checker/data/http.html.result index b91a0e2c..8b80ce5c 100644 --- a/tests/checker/data/http.html.result +++ b/tests/checker/data/http.html.result @@ -123,12 +123,14 @@ url /?d=directory&p=page1 cache key http://localhost:%(port)d/?d=directory&p=page1 real url http://localhost:%(port)d/?d=directory&p=page1 name should not be cached +warning Content with 1KB is the same as in URLs (http://localhost:%(port)d/?d=directory&p=page). valid url /?quoted=ü cache key http://localhost:%(port)d/?quoted=%%C3%%BC real url http://localhost:%(port)d/?quoted=%%C3%%BC name html entities +warning Content with 1KB is the same as in URLs (http://localhost:%(port)d/?d=directory&p=page,http://localhost:%(port)d/?d=directory&p=page1). valid url clsid:12345-67890 diff --git a/tests/checker/test_mail.py b/tests/checker/test_mail.py index 3d99599c..3b253068 100644 --- a/tests/checker/test_mail.py +++ b/tests/checker/test_mail.py @@ -152,9 +152,14 @@ class TestMail (LinkCheckTest): self.mail_error(u"mailto:@") self.mail_error(u"mailto:@example.org") self.mail_error(u"mailto:a@") - self.mail_error(u"mailto:%s@%s" % (u"a"*60, u"b"*200)) + url_too_long = "URL length %d is longer than 255." 
+ url = u"mailto:%s@%s" % (u"a"*60, u"b"*200) + warning = url_too_long % len(url) + self.mail_error(url, warning=warning) + url = u"mailto:a@%s" % (u"a"*256) + warning = url_too_long % len(url) + self.mail_error(url, warning=warning) self.mail_error(u"mailto:%s@example.org" % (u"a"*65)) - self.mail_error(u"mailto:a@%s" % (u"a"*256)) self.mail_error(u'mailto:a@%s.com' % (u"a"*64)) # local part quoted self.mail_error(u'mailto:"a""@example.com', cache_key=u'mailto:a')