diff --git a/doc/changelog.txt b/doc/changelog.txt index 43bbd644..cd1d3e7c 100644 --- a/doc/changelog.txt +++ b/doc/changelog.txt @@ -8,6 +8,8 @@ Changes: - doc: Mention 7-zip to extract the .tar.xz under Windows. Closes: SF bug #3564733 - checking: Send a HTTP Do-Not-Track header. +- checking: Check URL length. Print error on URL longer than 2000 characters, + warning for longer than 255 characters. - logging: Print download and cache statistics in text output logger. - logging: Print warning tag in text output logger. Makes warning filtering more easy. diff --git a/linkcheck/checker/const.py b/linkcheck/checker/const.py index db9acaed..9600f6d4 100644 --- a/linkcheck/checker/const.py +++ b/linkcheck/checker/const.py @@ -71,6 +71,11 @@ except ImportError: ExcList = ExcCacheList + ExcNoCacheList +# URL length limits in characters: longer than MAX is an error, longer than WARN a warning +URL_MAX_LENGTH = 2000 +URL_WARN_LENGTH = 255 + +# warning tags WARN_URL_EFFECTIVE_URL = "url-effective-url" WARN_URL_ERROR_GETTING_CONTENT = "url-error-getting-content" WARN_URL_ANCHOR_NOT_FOUND = "url-anchor-not-found" @@ -79,6 +84,7 @@ WARN_URL_CONTENT_SIZE_TOO_LARGE = "url-content-too-large" WARN_URL_CONTENT_SIZE_ZERO = "url-content-size-zero" WARN_URL_CONTENT_SIZE_UNEQUAL = "url-content-size-unequal" WARN_URL_OBFUSCATED_IP = "url-obfuscated-ip" +WARN_URL_TOO_LONG = "url-too-long" WARN_URL_WHITESPACE = "url-whitespace" WARN_FILE_MISSING_SLASH = "file-missing-slash" WARN_FILE_SYSTEM_PATH = "file-system-path" @@ -112,6 +118,7 @@ Warnings = { WARN_URL_CONTENT_SIZE_TOO_LARGE: _("The URL content size is too large."), WARN_URL_CONTENT_SIZE_ZERO: _("The URL content size is zero."), WARN_URL_CONTENT_SIZE_UNEQUAL: _("The URL content size and download size are unequal."), + WARN_URL_TOO_LONG: _("The URL is longer than the recommended size."), WARN_URL_WHITESPACE: _("The URL contains leading or trailing whitespace."), WARN_FILE_MISSING_SLASH: _("The file: URL is missing a trailing slash."), WARN_FILE_SYSTEM_PATH: diff --git a/linkcheck/checker/urlbase.py 
b/linkcheck/checker/urlbase.py index 7b631f9c..f44fc869 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -40,6 +40,7 @@ from .const import (WARN_URL_EFFECTIVE_URL, WARN_URL_ANCHOR_NOT_FOUND, WARN_URL_WARNREGEX_FOUND, WARN_URL_CONTENT_SIZE_TOO_LARGE, WARN_URL_CONTENT_SIZE_ZERO, WARN_URL_CONTENT_SIZE_UNEQUAL, WARN_URL_WHITESPACE, + WARN_URL_TOO_LONG, URL_MAX_LENGTH, URL_WARN_LENGTH, ExcList, ExcSyntaxList, ExcNoCacheList) # helper alias @@ -389,19 +390,29 @@ class UrlBase (object): return try: self.build_url() - # check url warnings - effectiveurl = urlutil.urlunsplit(self.urlparts) - if self.url != effectiveurl: - self.add_warning(_("Effective URL %(url)r.") % - {"url": effectiveurl}, - tag=WARN_URL_EFFECTIVE_URL) - self.url = effectiveurl + self.check_url_warnings() except tuple(ExcSyntaxList), msg: self.set_result(unicode_safe(msg), valid=False) else: self.set_cache_keys() self.set_extern(self.url) + def check_url_warnings(self): + """Warn if self.url differs from the URL rebuilt from self.urlparts and adopt the rebuilt URL; then mark the URL invalid if longer than URL_MAX_LENGTH, or warn if longer than URL_WARN_LENGTH.""" + effectiveurl = urlutil.urlunsplit(self.urlparts) + if self.url != effectiveurl: + self.add_warning(_("Effective URL %(url)r.") % + {"url": effectiveurl}, + tag=WARN_URL_EFFECTIVE_URL) + self.url = effectiveurl + if len(self.url) > URL_MAX_LENGTH: + args = dict(len=len(self.url), max=URL_MAX_LENGTH) + self.set_result(_("URL length %(len)d is longer than maximum of %(max)d.") % args, valid=False) + elif len(self.url) > URL_WARN_LENGTH: + args = dict(len=len(self.url), warn=URL_WARN_LENGTH) + self.add_warning(_("URL length %(len)d is longer than %(warn)d.") % args, + tag=WARN_URL_TOO_LONG) + def build_url (self): """ Construct self.url and self.urlparts out of the given base diff --git a/tests/checker/test_urllen.py b/tests/checker/test_urllen.py new file mode 100644 index 00000000..e19dcef5 --- /dev/null +++ b/tests/checker/test_urllen.py @@ -0,0 +1,53 @@ +# -*- coding: iso-8859-1 -*- +# Copyright (C) 2012 Bastian Kleineidam +# +# This program is free software; you can 
redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +""" +Test URL length checks. +""" +from . import LinkCheckTest + + +class TestURLLength(LinkCheckTest): + """ + Test the URL length thresholds: a URL longer than 255 characters must produce a warning, one longer than 2000 characters an error. + """ + + def test_url_warn(self): + url = u"http://www.example.org/%s" % (u"a" * 256) + attrs = self.get_attrs(url=url) + attrs['nurl'] = u"http://www.iana.org/domains/example/" + resultlines = [ + u"url %(url)s" % attrs, + u"cache key %(url)s" % attrs, + u"real url %(nurl)s" % attrs, + u"info Redirected to `%(nurl)s'." % attrs, + u"warning URL length 279 is longer than 255.", + u"valid", + ] + self.direct(url, resultlines) + + def test_url_error(self): + url = u"http://www.example.org/%s" % ("a" * 2000) + attrs = self.get_attrs(url=url) + attrs['nurl'] = self.norm(url) + resultlines = [ + u"url %(nurl)s" % attrs, + u"cache key %(nurl)s" % attrs, + u"real url %(nurl)s" % attrs, + u"error", + ] + self.direct(url, resultlines) +