Strip leading and trailing whitespace from URLs.

This commit is contained in:
Bastian Kleineidam 2011-03-07 12:33:09 +01:00
parent 633206342b
commit 420c21c2de
6 changed files with 91 additions and 51 deletions

View file

@ -7,6 +7,9 @@ Fixes:
Changes:
- checking: Use HTTP GET requests to work around buggy IIS servers
sending false positive status codes for HEAD requests.
- checking: Strip leading and trailing whitespace from URLs and print
a warning instead of having errors.
Closes: SF bug #3196918
6.4 "The Sunset Limited" (released 20.2.2011)

View file

@ -71,12 +71,16 @@ def get_url_from (base_url, recursion_level, aggregate,
"""
if base_url is not None:
base_url = strformat.unicode_safe(base_url)
# left strip for detection of URL scheme
base_url_stripped = base_url.lstrip()
else:
base_url_stripped = base_url
if parent_url is not None:
parent_url = strformat.unicode_safe(parent_url)
if base_ref is not None:
base_ref = strformat.unicode_safe(base_ref)
name = strformat.unicode_safe(name)
url = absolute_url(base_url, base_ref, parent_url).lower()
url = absolute_url(base_url_stripped, base_ref, parent_url).lower()
if not (url or name):
# use filename as base url, with slash as path separator
name = base_url.replace("\\", "/")

View file

@ -79,6 +79,7 @@ WARN_URL_CONTENT_SIZE_TOO_LARGE = "url-content-too-large"
WARN_URL_CONTENT_SIZE_ZERO = "url-content-size-zero"
WARN_URL_CONTENT_SIZE_UNEQUAL = "url-content-size-unequal"
WARN_URL_OBFUSCATED_IP = "url-obfuscated-ip"
WARN_URL_WHITESPACE = "url-whitespace"
WARN_FILE_MISSING_SLASH = "file-missing-slash"
WARN_FILE_SYSTEM_PATH = "file-system-path"
WARN_FTP_MISSING_SLASH = "ftp-missing-slash"
@ -109,6 +110,7 @@ Warnings = {
WARN_URL_CONTENT_SIZE_TOO_LARGE: _("The URL content size is too large."),
WARN_URL_CONTENT_SIZE_ZERO: _("The URL content size is zero."),
WARN_URL_CONTENT_SIZE_UNEQUAL: _("The URL content size and download size are unequal."),
WARN_URL_WHITESPACE: _("The URL contains leading or trailing whitespace."),
WARN_FILE_MISSING_SLASH: _("The file: URL is missing a trailing slash."),
WARN_FILE_SYSTEM_PATH:
_("The file: path is not the same as the system specific path."),

View file

@ -39,7 +39,8 @@ from .const import (WARN_URL_EFFECTIVE_URL,
WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
WARN_URL_ANCHOR_NOT_FOUND, WARN_URL_WARNREGEX_FOUND,
WARN_URL_CONTENT_SIZE_TOO_LARGE, WARN_URL_CONTENT_SIZE_ZERO,
WARN_URL_CONTENT_SIZE_UNEQUAL, ExcList, ExcSyntaxList, ExcNoCacheList)
WARN_URL_CONTENT_SIZE_UNEQUAL, WARN_URL_WHITESPACE,
ExcList, ExcSyntaxList, ExcNoCacheList)
# helper alias
unicode_safe = strformat.unicode_safe
@ -105,9 +106,9 @@ class UrlBase (object):
@param name: name of url or empty
@param url_encoding: encoding of URL or None
"""
self.reset()
self.init(base_ref, base_url, parent_url, recursion_level,
aggregate, line, column, name, url_encoding)
self.reset()
self.check_syntax()
def init (self, base_ref, base_url, parent_url, recursion_level,
@ -116,8 +117,10 @@ class UrlBase (object):
Initialize internal data.
"""
self.base_ref = base_ref
# note that self.base_url must not be modified
self.base_url = base_url
self.base_url = base_url.strip() if base_url else base_url
if self.base_url != base_url:
self.add_warning(_("Leading or trailing whitespace in URL `%(url)s'.") %
{"url": base_url}, tag=WARN_URL_WHITESPACE)
self.parent_url = parent_url
self.recursion_level = recursion_level
self.aggregate = aggregate

View file

@ -38,52 +38,6 @@ class TestError (LinkCheckTest):
]
self.direct(url, resultlines)
def test_leading_whitespace (self):
# Leading whitespace
url = u" http://www.heise.de/"
attrs = self.get_attrs(url=url)
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
resultlines = [
u"url file://%(curdir)s/%(url)s" % attrs,
u"cache key %(nurl)s" % attrs,
u"real url %(nurl)s" % attrs,
u"name %(url)s" % attrs,
u"error",
]
self.direct(url, resultlines)
url = u"\nhttp://www.heise.de/"
attrs = self.get_attrs(url=url)
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
resultlines = [
u"url file://%(curdir)s/%(url)s" % attrs,
u"cache key %(nurl)s" % attrs,
u"real url %(nurl)s" % attrs,
u"name %(url)s" % attrs,
u"error",
]
self.direct(url, resultlines)
def test_trailing_whitespace (self):
# Trailing whitespace
url = u"http://www.heise.de/ "
nurl = self.norm(url)
resultlines = [
u"url %s" % url,
u"cache key %s" % nurl,
u"real url %s" % nurl,
u"error",
]
self.direct(url, resultlines)
url = u"http://www.heise.de/\n"
nurl = self.norm(url)
resultlines = [
u"url %s" % url,
u"cache key %s" % nurl,
u"real url %s" % nurl,
u"error",
]
self.direct(url, resultlines)
def test_invalid1 (self):
# invalid scheme chars
url = u"äöü?:"

View file

@ -0,0 +1,74 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2010 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Test whitespace handling.
"""
from . import LinkCheckTest
class TestWhitespace (LinkCheckTest):
    """
    Check URLs that carry leading or trailing whitespace.

    Each case expects the reported url, cache key and real url to be the
    normalized form of the stripped URL, followed by a whitespace warning
    that quotes the URL as given, and a "valid" result.
    """

    def test_leading_whitespace (self):
        """
        Check URLs prefixed with a space or a newline.
        """
        for url in (u" http://www.heise.de/", u"\nhttp://www.heise.de/"):
            attrs = self.get_attrs(url=url)
            # expected output shows the normalized, stripped URL
            attrs['nurl'] = self.norm(url.strip())
            expected = [
                u"url %(nurl)s" % attrs,
                u"cache key %(nurl)s" % attrs,
                u"real url %(nurl)s" % attrs,
                u"warning Leading or trailing whitespace in URL `%(url)s'." % attrs,
                u"valid",
            ]
            self.direct(url, expected)

    def test_trailing_whitespace (self):
        """
        Check URLs suffixed with a space or a newline.
        """
        for url in (u"http://www.heise.de/ ", u"http://www.heise.de/\n"):
            normalized = self.norm(url.strip())
            expected = [
                u"url %s" % normalized,
                u"cache key %s" % normalized,
                u"real url %s" % normalized,
                # the warning quotes the URL as given, whitespace included
                u"warning Leading or trailing whitespace in URL `%s'." % url,
                u"valid",
            ]
            self.direct(url, expected)