mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-11 18:10:58 +00:00
Warn about duplicate URL contents.
This commit is contained in:
parent
02a09dbb28
commit
4e59056ee7
9 changed files with 113 additions and 14 deletions
|
|
@ -3,13 +3,14 @@
|
|||
Features:
|
||||
- checking: Allow specification of maximum checking time or maximum
|
||||
number of checked URLs.
|
||||
- checking: Send a HTTP Do-Not-Track header.
|
||||
- checking: Check URL length. Print error on URL longer than 2000 characters,
|
||||
warning for longer than 255 characters.
|
||||
- checking: Warn about duplicate URL contents.
|
||||
|
||||
Changes:
|
||||
- doc: Mention 7-zip to extract the .tar.xz under Windows.
|
||||
Closes: SF bug #3564733
|
||||
- checking: Send a HTTP Do-Not-Track header.
|
||||
- checking: Check URL length. Print error on URL longer than 2000 characters,
|
||||
warning for longer than 255 characters.
|
||||
- logging: Print download and cache statistics in text output logger.
|
||||
- logging: Print warning tag in text output logger. Makes warning filtering
|
||||
more easy.
|
||||
|
|
|
|||
71
linkcheck/cache/content.py
vendored
Normal file
71
linkcheck/cache/content.py
vendored
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2012 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
Cache for content checksums.
|
||||
"""
|
||||
import hashlib
|
||||
from ..lock import get_lock
|
||||
from ..decorators import synchronized
|
||||
|
||||
_lock = get_lock("checksums")
|
||||
|
||||
class ChecksumInfo(object):
    """In-memory map from content checksums to the URLs that produced them,
    with simple hit/miss cache statistics."""

    def __init__(self):
        """Set up an empty checksum map and zeroed cache statistics."""
        # {hash -> [URL]}
        self.checksums = {}
        self.misses = self.hits = 0

    def get_checksum_urls(self, url, checksum):
        """Look for and store checksum for URL.

        @param url: the URL for the checksum
        @ptype url: unicode
        @param checksum: the URL content checksum
        @ptype checksum: str
        @return: list of URLs matching the given checksum (except the given URL)
        @rtype: list of unicode
        """
        known = self.checksums.get(checksum)
        if known is None:
            # First time this content is seen: start a new URL list.
            self.misses += 1
            self.checksums[checksum] = [url]
            return []
        self.hits += 1
        if url in known:
            # URL already registered; report only the other URLs.
            return [other for other in known if other != url]
        # New URL for known content: report existing URLs, then register it.
        duplicates = known[:]
        known.append(url)
        return duplicates
|
||||
|
||||
|
||||
# Shared module-level cache instance; all access goes through the
# synchronized wrapper below, serialized on _lock.
_checksuminfo = ChecksumInfo()


@synchronized(_lock)
def get_checksum_urls(url, content):
    """See if given URL content is already stored under another URL.

    The content is hashed with SHA-1 and looked up in (and added to)
    the module-level ChecksumInfo cache.

    @param url: the URL for which the content is valid
    @ptype url: unicode
    @param content: the content to hash
    @ptype content: str
    @return: list of URLs with the same content (except the given URL)
    @rtype: list of unicode
    """
    checksum = hashlib.sha1(content).hexdigest()
    return _checksuminfo.get_checksum_urls(url, checksum)
|
||||
|
||||
|
|
@ -80,6 +80,7 @@ WARN_URL_EFFECTIVE_URL = "url-effective-url"
|
|||
WARN_URL_ERROR_GETTING_CONTENT = "url-error-getting-content"
|
||||
WARN_URL_ANCHOR_NOT_FOUND = "url-anchor-not-found"
|
||||
WARN_URL_WARNREGEX_FOUND = "url-warnregex-found"
|
||||
WARN_URL_CONTENT_DUPLICATE = "url-content-duplicate"
|
||||
WARN_URL_CONTENT_SIZE_TOO_LARGE = "url-content-too-large"
|
||||
WARN_URL_CONTENT_SIZE_ZERO = "url-content-size-zero"
|
||||
WARN_URL_CONTENT_SIZE_UNEQUAL = "url-content-size-unequal"
|
||||
|
|
@ -115,6 +116,7 @@ Warnings = {
|
|||
WARN_URL_ANCHOR_NOT_FOUND: _("URL anchor was not found."),
|
||||
WARN_URL_WARNREGEX_FOUND:
|
||||
_("The warning regular expression was found in the URL contents."),
|
||||
WARN_URL_CONTENT_DUPLICATE: _("The URL content is a duplicate of another URL."),
|
||||
WARN_URL_CONTENT_SIZE_TOO_LARGE: _("The URL content size is too large."),
|
||||
WARN_URL_CONTENT_SIZE_ZERO: _("The URL content size is zero."),
|
||||
WARN_URL_CONTENT_SIZE_UNEQUAL: _("The URL content size and download size are unequal."),
|
||||
|
|
|
|||
|
|
@ -220,7 +220,8 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
buf = StringIO()
|
||||
def stor_data (s):
|
||||
"""Helper method storing given data"""
|
||||
self.aggregate.add_download_bytes(len(s))
|
||||
urls = self.aggregate.add_download_data(self.cache_content_key, s)
|
||||
self.warn_duplicate_content(urls)
|
||||
# limit the download size
|
||||
if (buf.tell() + len(s)) > self.MaxFilesizeBytes:
|
||||
raise LinkCheckerError(_("FTP file size too large"))
|
||||
|
|
|
|||
|
|
@ -695,7 +695,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
read_content()"""
|
||||
data = response.read()
|
||||
self._size = len(data)
|
||||
self.aggregate.add_download_bytes(self._size)
|
||||
urls = self.aggregate.add_download_data(self.cache_content_key, data)
|
||||
self.warn_duplicate_content(urls)
|
||||
encoding = headers.get_content_encoding(self.headers)
|
||||
if encoding in _supported_encodings:
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -41,6 +41,7 @@ from .const import (WARN_URL_EFFECTIVE_URL,
|
|||
WARN_URL_CONTENT_SIZE_TOO_LARGE, WARN_URL_CONTENT_SIZE_ZERO,
|
||||
WARN_URL_CONTENT_SIZE_UNEQUAL, WARN_URL_WHITESPACE,
|
||||
WARN_URL_TOO_LONG, URL_MAX_LENGTH, URL_WARN_LENGTH,
|
||||
WARN_URL_CONTENT_DUPLICATE,
|
||||
ExcList, ExcSyntaxList, ExcNoCacheList)
|
||||
|
||||
# helper alias
|
||||
|
|
@ -745,9 +746,19 @@ class UrlBase (object):
|
|||
raise LinkCheckerError(_("File size too large"))
|
||||
data = self.url_connection.read()
|
||||
if not self.is_local():
|
||||
self.aggregate.add_download_bytes(len(data))
|
||||
urls = self.aggregate.add_download_data(self.cache_content_key, data)
|
||||
self.warn_duplicate_content(urls)
|
||||
return data, len(data)
|
||||
|
||||
def warn_duplicate_content(self, urls):
|
||||
"""If given URL list is not empty, warn about duplicate URL content.
|
||||
@param urls: URLs with duplicate content
|
||||
@ptype urls: list of unicode
|
||||
"""
|
||||
if urls:
|
||||
args = dict(urls=u",".join(urls), size=strformat.strsize(self.size))
|
||||
self.add_warning(_("Content with %(size)s is the same as in URLs (%(urls)s).") % args, tag=WARN_URL_CONTENT_DUPLICATE)
|
||||
|
||||
def check_content (self):
|
||||
"""Check content data for warnings, syntax errors, viruses etc."""
|
||||
if not (self.valid and self.can_get_content()):
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ import time
|
|||
import threading
|
||||
from .. import log, LOG_CHECK
|
||||
from ..decorators import synchronized
|
||||
from ..cache import urlqueue, addrinfo
|
||||
from ..cache import urlqueue, addrinfo, content
|
||||
from . import logger, status, checker, cleanup
|
||||
|
||||
|
||||
|
|
@ -136,12 +136,17 @@ class Aggregate (object):
|
|||
self.last_w3_call = time.time()
|
||||
|
||||
@synchronized(_download_lock)
|
||||
def add_download_bytes(self, bytes):
|
||||
"""Add given bytes to number of downloaded bytes.
|
||||
@param bytes: number of bytes downloaded
|
||||
@ptype bytes: int
|
||||
def add_download_data(self, url, data):
|
||||
"""Add given downloaded data.
|
||||
@param url: URL which data belongs to
|
||||
@ptype url: unicode
|
||||
@param data: downloaded data
|
||||
@ptype data: string
|
||||
@return: URLs with duplicate contents
|
||||
@rtype: list of unicode
|
||||
"""
|
||||
self.downloaded_bytes += bytes
|
||||
self.downloaded_bytes += len(data)
|
||||
return content.get_checksum_urls(url, data)
|
||||
|
||||
def gather_statistics(self):
|
||||
"""Gather download and cache statistics and send them to the
|
||||
|
|
|
|||
|
|
@ -123,12 +123,14 @@ url /?d=directory&p=page1
|
|||
cache key http://localhost:%(port)d/?d=directory&p=page1
|
||||
real url http://localhost:%(port)d/?d=directory&p=page1
|
||||
name should not be cached
|
||||
warning Content with 1KB is the same as in URLs (http://localhost:%(port)d/?d=directory&p=page).
|
||||
valid
|
||||
|
||||
url /?quoted=ü
|
||||
cache key http://localhost:%(port)d/?quoted=%%C3%%BC
|
||||
real url http://localhost:%(port)d/?quoted=%%C3%%BC
|
||||
name html entities
|
||||
warning Content with 1KB is the same as in URLs (http://localhost:%(port)d/?d=directory&p=page,http://localhost:%(port)d/?d=directory&p=page1).
|
||||
valid
|
||||
|
||||
url clsid:12345-67890
|
||||
|
|
|
|||
|
|
@ -152,9 +152,14 @@ class TestMail (LinkCheckTest):
|
|||
self.mail_error(u"mailto:@")
|
||||
self.mail_error(u"mailto:@example.org")
|
||||
self.mail_error(u"mailto:a@")
|
||||
self.mail_error(u"mailto:%s@%s" % (u"a"*60, u"b"*200))
|
||||
url_too_long = "URL length %d is longer than 255."
|
||||
url = u"mailto:%s@%s" % (u"a"*60, u"b"*200)
|
||||
warning = url_too_long % len(url)
|
||||
self.mail_error(url, warning=warning)
|
||||
url = u"mailto:a@%s" % (u"a"*256)
|
||||
warning = url_too_long % len(url)
|
||||
self.mail_error(url, warning=warning)
|
||||
self.mail_error(u"mailto:%s@example.org" % (u"a"*65))
|
||||
self.mail_error(u"mailto:a@%s" % (u"a"*256))
|
||||
self.mail_error(u'mailto:a@%s.com' % (u"a"*64))
|
||||
# local part quoted
|
||||
self.mail_error(u'mailto:"a""@example.com', cache_key=u'mailto:a')
|
||||
|
|
|
|||
Loading…
Reference in a new issue