diff --git a/doc/changelog.txt b/doc/changelog.txt index 91e33ee5..4c5e7ca8 100644 --- a/doc/changelog.txt +++ b/doc/changelog.txt @@ -7,6 +7,9 @@ Fixes: Changes: - checking: Use HTTP GET requests to work around buggy IIS servers sending false positive status codes for HEAD requests. +- checking: Strip leading and trailing whitespace from URLs and print + a warning instead of having errors. + Closes: SF bug #3196918 6.4 "The Sunset Limited" (released 20.2.2011) diff --git a/linkcheck/checker/__init__.py b/linkcheck/checker/__init__.py index 7f86d2e9..e5f0d3ef 100644 --- a/linkcheck/checker/__init__.py +++ b/linkcheck/checker/__init__.py @@ -71,12 +71,16 @@ def get_url_from (base_url, recursion_level, aggregate, """ if base_url is not None: base_url = strformat.unicode_safe(base_url) + # left strip for detection of URL scheme + base_url_stripped = base_url.lstrip() + else: + base_url_stripped = base_url if parent_url is not None: parent_url = strformat.unicode_safe(parent_url) if base_ref is not None: base_ref = strformat.unicode_safe(base_ref) name = strformat.unicode_safe(name) - url = absolute_url(base_url, base_ref, parent_url).lower() + url = absolute_url(base_url_stripped, base_ref, parent_url).lower() if not (url or name): # use filename as base url, with slash as path seperator name = base_url.replace("\\", "/") diff --git a/linkcheck/checker/const.py b/linkcheck/checker/const.py index ec654077..9850e782 100644 --- a/linkcheck/checker/const.py +++ b/linkcheck/checker/const.py @@ -79,6 +79,7 @@ WARN_URL_CONTENT_SIZE_TOO_LARGE = "url-content-too-large" WARN_URL_CONTENT_SIZE_ZERO = "url-content-size-zero" WARN_URL_CONTENT_SIZE_UNEQUAL = "url-content-size-unequal" WARN_URL_OBFUSCATED_IP = "url-obfuscated-ip" +WARN_URL_WHITESPACE = "url-whitespace" WARN_FILE_MISSING_SLASH = "file-missing-slash" WARN_FILE_SYSTEM_PATH = "file-system-path" WARN_FTP_MISSING_SLASH = "ftp-missing-slash" @@ -109,6 +110,7 @@ Warnings = { WARN_URL_CONTENT_SIZE_TOO_LARGE: _("The URL content size is too large."), WARN_URL_CONTENT_SIZE_ZERO: _("The URL content size is zero."), WARN_URL_CONTENT_SIZE_UNEQUAL: _("The URL content size and download size are unequal."), + WARN_URL_WHITESPACE: _("The URL contains leading or trailing whitespace."), WARN_FILE_MISSING_SLASH: _("The file: URL is missing a trailing slash."), WARN_FILE_SYSTEM_PATH: _("The file: path is not the same as the system specific path."), diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index 689d7120..44a13680 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -39,7 +39,8 @@ from .const import (WARN_URL_EFFECTIVE_URL, WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP, WARN_URL_ANCHOR_NOT_FOUND, WARN_URL_WARNREGEX_FOUND, WARN_URL_CONTENT_SIZE_TOO_LARGE, WARN_URL_CONTENT_SIZE_ZERO, - WARN_URL_CONTENT_SIZE_UNEQUAL, ExcList, ExcSyntaxList, ExcNoCacheList) + WARN_URL_CONTENT_SIZE_UNEQUAL, WARN_URL_WHITESPACE, + ExcList, ExcSyntaxList, ExcNoCacheList) # helper alias unicode_safe = strformat.unicode_safe @@ -105,9 +106,9 @@ class UrlBase (object): @param name: name of url or empty @param url_encoding: encoding of URL or None """ + self.reset() self.init(base_ref, base_url, parent_url, recursion_level, aggregate, line, column, name, url_encoding) - self.reset() self.check_syntax() def init (self, base_ref, base_url, parent_url, recursion_level, @@ -116,8 +117,10 @@ class UrlBase (object): Initialize internal data. """ self.base_ref = base_ref - # note that self.base_url must not be modified - self.base_url = base_url + self.base_url = base_url.strip() if base_url else base_url + if self.base_url != base_url: + self.add_warning(_("Leading or trailing whitespace in URL `%(url)s'.") % + {"url": base_url}, tag=WARN_URL_WHITESPACE) self.parent_url = parent_url self.recursion_level = recursion_level self.aggregate = aggregate diff --git a/tests/checker/test_error.py b/tests/checker/test_error.py index d3b8bb9a..4570ef31 100644 --- a/tests/checker/test_error.py +++ b/tests/checker/test_error.py @@ -38,52 +38,6 @@ class TestError (LinkCheckTest): ] self.direct(url, resultlines) - def test_leading_whitespace (self): - # Leading whitespace - url = u" http://www.heise.de/" - attrs = self.get_attrs(url=url) - attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs) - resultlines = [ - u"url file://%(curdir)s/%(url)s" % attrs, - u"cache key %(nurl)s" % attrs, - u"real url %(nurl)s" % attrs, - u"name %(url)s" % attrs, - u"error", - ] - self.direct(url, resultlines) - url = u"\nhttp://www.heise.de/" - attrs = self.get_attrs(url=url) - attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs) - resultlines = [ - u"url file://%(curdir)s/%(url)s" % attrs, - u"cache key %(nurl)s" % attrs, - u"real url %(nurl)s" % attrs, - u"name %(url)s" % attrs, - u"error", - ] - self.direct(url, resultlines) - - def test_trailing_whitespace (self): - # Trailing whitespace - url = u"http://www.heise.de/ " - nurl = self.norm(url) - resultlines = [ - u"url %s" % url, - u"cache key %s" % nurl, - u"real url %s" % nurl, - u"error", - ] - self.direct(url, resultlines) - url = u"http://www.heise.de/\n" - nurl = self.norm(url) - resultlines = [ - u"url %s" % url, - u"cache key %s" % nurl, - u"real url %s" % nurl, - u"error", - ] - self.direct(url, resultlines) - def test_invalid1 (self): # invalid scheme chars url = u"äöü?:" diff --git a/tests/checker/test_whitespace.py b/tests/checker/test_whitespace.py new file mode 100644 index 00000000..bbae0099 --- /dev/null +++ b/tests/checker/test_whitespace.py @@ -0,0 +1,74 @@ +# -*- coding: iso-8859-1 -*- +# Copyright (C) 2004-2010 Bastian Kleineidam +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +""" +Test whitespace handling. +""" +from . import LinkCheckTest + + +class TestWhitespace (LinkCheckTest): + """ + Test whitespace in URLs. + """ + + def test_leading_whitespace (self): + # Leading whitespace + url = u" http://www.heise.de/" + attrs = self.get_attrs(url=url) + attrs['nurl'] = self.norm(url.strip()) + resultlines = [ + u"url %(nurl)s" % attrs, + u"cache key %(nurl)s" % attrs, + u"real url %(nurl)s" % attrs, + u"warning Leading or trailing whitespace in URL `%(url)s'." % attrs, + u"valid", + ] + self.direct(url, resultlines) + url = u"\nhttp://www.heise.de/" + attrs = self.get_attrs(url=url) + attrs['nurl'] = self.norm(url.strip()) + resultlines = [ + u"url %(nurl)s" % attrs, + u"cache key %(nurl)s" % attrs, + u"real url %(nurl)s" % attrs, + u"warning Leading or trailing whitespace in URL `%(url)s'." % attrs, + u"valid", + ] + self.direct(url, resultlines) + + def test_trailing_whitespace (self): + # Trailing whitespace + url = u"http://www.heise.de/ " + nurl = self.norm(url.strip()) + resultlines = [ + u"url %s" % nurl, + u"cache key %s" % nurl, + u"real url %s" % nurl, + u"warning Leading or trailing whitespace in URL `%s'." % url, + u"valid", + ] + self.direct(url, resultlines) + url = u"http://www.heise.de/\n" + nurl = self.norm(url.strip()) + resultlines = [ + u"url %s" % nurl, + u"cache key %s" % nurl, + u"real url %s" % nurl, + u"warning Leading or trailing whitespace in URL `%s'." % url, + u"valid", + ] + self.direct(url, resultlines)