Warn about duplicate URL contents.

This commit is contained in:
Bastian Kleineidam 2012-09-17 19:49:50 +02:00
parent 02a09dbb28
commit 4e59056ee7
9 changed files with 113 additions and 14 deletions

View file

@ -3,13 +3,14 @@
Features:
- checking: Allow specification of maximum checking time or maximum
number of checked URLs.
- checking: Send a HTTP Do-Not-Track header.
- checking: Check URL length. Print error on URL longer than 2000 characters,
warning for longer than 255 characters.
- checking: Warn about duplicate URL contents.
Changes:
- doc: Mention 7-zip to extract the .tar.xz under Windows.
Closes: SF bug #3564733
- checking: Send a HTTP Do-Not-Track header.
- checking: Check URL length. Print error on URL longer than 2000 characters,
warning for longer than 255 characters.
- logging: Print download and cache statistics in text output logger.
- logging: Print warning tag in text output logger. Makes warning filtering
easier.

71
linkcheck/cache/content.py vendored Normal file
View file

@ -0,0 +1,71 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2012 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Cache for content checksums.
"""
import hashlib
from ..lock import get_lock
from ..decorators import synchronized
_lock = get_lock("checksums")
class ChecksumInfo(object):
    """Store content checksums together with the URLs that produced them,
    tracking simple cache hit/miss statistics."""

    def __init__(self):
        """Initialize checksums and cache statistics."""
        # Maps checksum string -> list of URLs whose content hashed to it.
        self.checksums = {}
        # Cache statistics counters.
        self.misses = self.hits = 0

    def get_checksum_urls(self, url, checksum):
        """Look for and store checksum for URL.
        @param url: the URL for the checksum
        @ptype url: unicode
        @param checksum: the URL content checksum
        @ptype checksum: str
        @return: list of URLs matching the given checksum (except the given URL)
        @rtype: list of unicode
        """
        urls = self.checksums.get(checksum)
        if urls is None:
            # First time this checksum is seen: remember the URL, no duplicates.
            self.misses += 1
            self.checksums[checksum] = [url]
            return []
        self.hits += 1
        if url in urls:
            # URL already recorded; report only the other URLs.
            duplicates = [other for other in urls if other != url]
        else:
            # New URL for a known checksum: report current list, then record it.
            duplicates = urls[:]
            urls.append(url)
        return duplicates
# Module-level singleton holding all seen checksums.
_checksuminfo = ChecksumInfo()


@synchronized(_lock)
def get_checksum_urls(url, content):
    """See if given URL content is already stored under another URL.
    @param url: the URL for which the content is valid
    @ptype url: unicode
    @param content: the content to hash
    @ptype content: str
    @return: list of URLs with the same content (except the given URL)
    @rtype: list of unicode"""
    digest = hashlib.sha1(content).hexdigest()
    return _checksuminfo.get_checksum_urls(url, digest)

View file

@ -80,6 +80,7 @@ WARN_URL_EFFECTIVE_URL = "url-effective-url"
WARN_URL_ERROR_GETTING_CONTENT = "url-error-getting-content"
WARN_URL_ANCHOR_NOT_FOUND = "url-anchor-not-found"
WARN_URL_WARNREGEX_FOUND = "url-warnregex-found"
WARN_URL_CONTENT_DUPLICATE = "url-content-duplicate"
WARN_URL_CONTENT_SIZE_TOO_LARGE = "url-content-too-large"
WARN_URL_CONTENT_SIZE_ZERO = "url-content-size-zero"
WARN_URL_CONTENT_SIZE_UNEQUAL = "url-content-size-unequal"
@ -115,6 +116,7 @@ Warnings = {
WARN_URL_ANCHOR_NOT_FOUND: _("URL anchor was not found."),
WARN_URL_WARNREGEX_FOUND:
_("The warning regular expression was found in the URL contents."),
WARN_URL_CONTENT_DUPLICATE: _("The URL content is a duplicate of another URL."),
WARN_URL_CONTENT_SIZE_TOO_LARGE: _("The URL content size is too large."),
WARN_URL_CONTENT_SIZE_ZERO: _("The URL content size is zero."),
WARN_URL_CONTENT_SIZE_UNEQUAL: _("The URL content size and download size are unequal."),

View file

@ -220,7 +220,8 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
buf = StringIO()
def stor_data (s):
"""Helper method storing given data"""
self.aggregate.add_download_bytes(len(s))
urls = self.aggregate.add_download_data(self.cache_content_key, s)
self.warn_duplicate_content(urls)
# limit the download size
if (buf.tell() + len(s)) > self.MaxFilesizeBytes:
raise LinkCheckerError(_("FTP file size too large"))

View file

@ -695,7 +695,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
read_content()"""
data = response.read()
self._size = len(data)
self.aggregate.add_download_bytes(self._size)
urls = self.aggregate.add_download_data(self.cache_content_key, data)
self.warn_duplicate_content(urls)
encoding = headers.get_content_encoding(self.headers)
if encoding in _supported_encodings:
try:

View file

@ -41,6 +41,7 @@ from .const import (WARN_URL_EFFECTIVE_URL,
WARN_URL_CONTENT_SIZE_TOO_LARGE, WARN_URL_CONTENT_SIZE_ZERO,
WARN_URL_CONTENT_SIZE_UNEQUAL, WARN_URL_WHITESPACE,
WARN_URL_TOO_LONG, URL_MAX_LENGTH, URL_WARN_LENGTH,
WARN_URL_CONTENT_DUPLICATE,
ExcList, ExcSyntaxList, ExcNoCacheList)
# helper alias
@ -745,9 +746,19 @@ class UrlBase (object):
raise LinkCheckerError(_("File size too large"))
data = self.url_connection.read()
if not self.is_local():
self.aggregate.add_download_bytes(len(data))
urls = self.aggregate.add_download_data(self.cache_content_key, data)
self.warn_duplicate_content(urls)
return data, len(data)
def warn_duplicate_content(self, urls):
    """If given URL list is not empty, warn about duplicate URL content.
    @param urls: URLs with duplicate content
    @ptype urls: list of unicode
    """
    if not urls:
        return
    info = dict(urls=u",".join(urls), size=strformat.strsize(self.size))
    self.add_warning(
        _("Content with %(size)s is the same as in URLs (%(urls)s).") % info,
        tag=WARN_URL_CONTENT_DUPLICATE)
def check_content (self):
"""Check content data for warnings, syntax errors, viruses etc."""
if not (self.valid and self.can_get_content()):

View file

@ -21,7 +21,7 @@ import time
import threading
from .. import log, LOG_CHECK
from ..decorators import synchronized
from ..cache import urlqueue, addrinfo
from ..cache import urlqueue, addrinfo, content
from . import logger, status, checker, cleanup
@ -136,12 +136,17 @@ class Aggregate (object):
self.last_w3_call = time.time()
@synchronized(_download_lock)
def add_download_bytes(self, bytes):
"""Add given bytes to the number of downloaded bytes.
@param bytes: number of bytes downloaded
@ptype bytes: int
def add_download_data(self, url, data):
"""Add given downloaded data.
@param url: URL which data belongs to
@ptype url: unicode
@param data: downloaded data
@ptype data: string
@return: URLs with duplicate contents
@rtype: list of unicode
"""
self.downloaded_bytes += bytes
self.downloaded_bytes += len(data)
return content.get_checksum_urls(url, data)
def gather_statistics(self):
"""Gather download and cache statistics and send them to the

View file

@ -123,12 +123,14 @@ url /?d=directory&p=page1
cache key http://localhost:%(port)d/?d=directory&p=page1
real url http://localhost:%(port)d/?d=directory&p=page1
name should not be cached
warning Content with 1KB is the same as in URLs (http://localhost:%(port)d/?d=directory&p=page).
valid
url /?quoted=ü
cache key http://localhost:%(port)d/?quoted=%%C3%%BC
real url http://localhost:%(port)d/?quoted=%%C3%%BC
name html entities
warning Content with 1KB is the same as in URLs (http://localhost:%(port)d/?d=directory&p=page,http://localhost:%(port)d/?d=directory&p=page1).
valid
url clsid:12345-67890

View file

@ -152,9 +152,14 @@ class TestMail (LinkCheckTest):
self.mail_error(u"mailto:@")
self.mail_error(u"mailto:@example.org")
self.mail_error(u"mailto:a@")
self.mail_error(u"mailto:%s@%s" % (u"a"*60, u"b"*200))
url_too_long = "URL length %d is longer than 255."
url = u"mailto:%s@%s" % (u"a"*60, u"b"*200)
warning = url_too_long % len(url)
self.mail_error(url, warning=warning)
url = u"mailto:a@%s" % (u"a"*256)
warning = url_too_long % len(url)
self.mail_error(url, warning=warning)
self.mail_error(u"mailto:%s@example.org" % (u"a"*65))
self.mail_error(u"mailto:a@%s" % (u"a"*256))
self.mail_error(u'mailto:a@%s.com' % (u"a"*64))
# local part quoted
self.mail_error(u'mailto:"a""@example.com', cache_key=u'mailto:a')