Warn about too long URLs.

This commit is contained in:
Bastian Kleineidam 2012-09-17 16:00:23 +02:00
parent 03667a4ec9
commit cb71f483a5
4 changed files with 80 additions and 7 deletions

View file

@ -8,6 +8,8 @@ Changes:
- doc: Mention 7-zip to extract the .tar.xz under Windows.
Closes: SF bug #3564733
- checking: Send a HTTP Do-Not-Track header.
- checking: Check URL length. Print an error for URLs longer than 2000
characters and a warning for URLs longer than 255 characters.
- logging: Print download and cache statistics in text output logger.
- logging: Print the warning tag in the text output logger. This makes
filtering warnings easier.

View file

@ -71,6 +71,11 @@ except ImportError:
ExcList = ExcCacheList + ExcNoCacheList
# some constants
# Limits applied to the length of a checked URL (compared against len(url)).
# A URL longer than URL_MAX_LENGTH is reported as an invalid result.
URL_MAX_LENGTH = 2000
# A URL longer than URL_WARN_LENGTH (but within URL_MAX_LENGTH) only gets a
# "url-too-long" warning.
URL_WARN_LENGTH = 255
# the warnings
WARN_URL_EFFECTIVE_URL = "url-effective-url"
WARN_URL_ERROR_GETTING_CONTENT = "url-error-getting-content"
WARN_URL_ANCHOR_NOT_FOUND = "url-anchor-not-found"
@ -79,6 +84,7 @@ WARN_URL_CONTENT_SIZE_TOO_LARGE = "url-content-too-large"
WARN_URL_CONTENT_SIZE_ZERO = "url-content-size-zero"
WARN_URL_CONTENT_SIZE_UNEQUAL = "url-content-size-unequal"
WARN_URL_OBFUSCATED_IP = "url-obfuscated-ip"
WARN_URL_TOO_LONG = "url-too-long"
WARN_URL_WHITESPACE = "url-whitespace"
WARN_FILE_MISSING_SLASH = "file-missing-slash"
WARN_FILE_SYSTEM_PATH = "file-system-path"
@ -112,6 +118,7 @@ Warnings = {
WARN_URL_CONTENT_SIZE_TOO_LARGE: _("The URL content size is too large."),
WARN_URL_CONTENT_SIZE_ZERO: _("The URL content size is zero."),
WARN_URL_CONTENT_SIZE_UNEQUAL: _("The URL content size and download size are unequal."),
WARN_URL_TOO_LONG: _("The URL is longer than the recommended size."),
WARN_URL_WHITESPACE: _("The URL contains leading or trailing whitespace."),
WARN_FILE_MISSING_SLASH: _("The file: URL is missing a trailing slash."),
WARN_FILE_SYSTEM_PATH:

View file

@ -40,6 +40,7 @@ from .const import (WARN_URL_EFFECTIVE_URL,
WARN_URL_ANCHOR_NOT_FOUND, WARN_URL_WARNREGEX_FOUND,
WARN_URL_CONTENT_SIZE_TOO_LARGE, WARN_URL_CONTENT_SIZE_ZERO,
WARN_URL_CONTENT_SIZE_UNEQUAL, WARN_URL_WHITESPACE,
WARN_URL_TOO_LONG, URL_MAX_LENGTH, URL_WARN_LENGTH,
ExcList, ExcSyntaxList, ExcNoCacheList)
# helper alias
@ -389,19 +390,29 @@ class UrlBase (object):
return
try:
self.build_url()
# check url warnings
effectiveurl = urlutil.urlunsplit(self.urlparts)
if self.url != effectiveurl:
self.add_warning(_("Effective URL %(url)r.") %
{"url": effectiveurl},
tag=WARN_URL_EFFECTIVE_URL)
self.url = effectiveurl
self.check_url_warnings()
except tuple(ExcSyntaxList), msg:
self.set_result(unicode_safe(msg), valid=False)
else:
self.set_cache_keys()
self.set_extern(self.url)
def check_url_warnings(self):
    """
    Check the URL name and length.

    Rebuilds the URL from self.urlparts; when it differs from self.url
    an "effective URL" warning is added and self.url is replaced by the
    rebuilt value. Afterwards the length of self.url is compared with
    the configured limits: above URL_MAX_LENGTH the result is set to
    invalid, above URL_WARN_LENGTH only a warning is added.
    """
    rebuilt = urlutil.urlunsplit(self.urlparts)
    if self.url != rebuilt:
        notice = _("Effective URL %(url)r.") % {"url": rebuilt}
        self.add_warning(notice, tag=WARN_URL_EFFECTIVE_URL)
        self.url = rebuilt
    # Measure once, after self.url may have been replaced above.
    url_length = len(self.url)
    if url_length > URL_MAX_LENGTH:
        details = {"len": url_length, "max": URL_MAX_LENGTH}
        message = _("URL length %(len)d is longer than maximum of %(max)d.") % details
        self.set_result(message, valid=False)
        return
    if url_length > URL_WARN_LENGTH:
        details = {"len": url_length, "warn": URL_WARN_LENGTH}
        message = _("URL length %(len)d is longer than %(warn)d.") % details
        self.add_warning(message, tag=WARN_URL_TOO_LONG)
def build_url (self):
"""
Construct self.url and self.urlparts out of the given base

View file

@ -0,0 +1,53 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2012 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Test URL length checks.
"""
from . import LinkCheckTest
class TestURLLength(LinkCheckTest):
    """
    Test URL lengths.
    """

    def test_url_warn(self):
        """
        A URL longer than the warning length is still reported as valid,
        but with a warning that states its length.
        """
        url = u"http://www.example.org/%s" % (u"a" * 256)
        attrs = self.get_attrs(url=url)
        attrs['nurl'] = u"http://www.iana.org/domains/example/"
        resultlines = [
            u"url %(url)s" % attrs,
            u"cache key %(url)s" % attrs,
            u"real url %(nurl)s" % attrs,
            u"info Redirected to `%(nurl)s'." % attrs,
            # The reported length is derived from the URL built above
            # (279 characters) instead of being hard-coded.
            u"warning URL length %d is longer than 255." % len(url),
            u"valid",
        ]
        self.direct(url, resultlines)

    def test_url_error(self):
        """
        A URL longer than the maximum length is reported as an error.
        """
        # Unicode literal for the path, consistent with test_url_warn.
        url = u"http://www.example.org/%s" % (u"a" * 2000)
        attrs = self.get_attrs(url=url)
        attrs['nurl'] = self.norm(url)
        resultlines = [
            u"url %(nurl)s" % attrs,
            u"cache key %(nurl)s" % attrs,
            u"real url %(nurl)s" % attrs,
            u"error",
        ]
        self.direct(url, resultlines)