mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-18 03:21:07 +00:00
Strip leading and trailing whitespace from URLs.
This commit is contained in:
parent
633206342b
commit
420c21c2de
6 changed files with 91 additions and 51 deletions
|
|
@ -7,6 +7,9 @@ Fixes:
|
|||
Changes:
|
||||
- checking: Use HTTP GET requests to work around buggy IIS servers
|
||||
sending false positive status codes for HEAD requests.
|
||||
- checking: Strip leading and trailing whitespace from URLs and print
|
||||
a warning instead of having errors.
|
||||
Closes: SF bug #3196918
|
||||
|
||||
|
||||
6.4 "The Sunset Limited" (released 20.2.2011)
|
||||
|
|
|
|||
|
|
@ -71,12 +71,16 @@ def get_url_from (base_url, recursion_level, aggregate,
|
|||
"""
|
||||
if base_url is not None:
|
||||
base_url = strformat.unicode_safe(base_url)
|
||||
# left strip for detection of URL scheme
|
||||
base_url_stripped = base_url.lstrip()
|
||||
else:
|
||||
base_url_stripped = base_url
|
||||
if parent_url is not None:
|
||||
parent_url = strformat.unicode_safe(parent_url)
|
||||
if base_ref is not None:
|
||||
base_ref = strformat.unicode_safe(base_ref)
|
||||
name = strformat.unicode_safe(name)
|
||||
url = absolute_url(base_url, base_ref, parent_url).lower()
|
||||
url = absolute_url(base_url_stripped, base_ref, parent_url).lower()
|
||||
if not (url or name):
|
||||
# use filename as base url, with slash as path separator
|
||||
name = base_url.replace("\\", "/")
|
||||
|
|
|
|||
|
|
@ -79,6 +79,7 @@ WARN_URL_CONTENT_SIZE_TOO_LARGE = "url-content-too-large"
|
|||
WARN_URL_CONTENT_SIZE_ZERO = "url-content-size-zero"
|
||||
WARN_URL_CONTENT_SIZE_UNEQUAL = "url-content-size-unequal"
|
||||
WARN_URL_OBFUSCATED_IP = "url-obfuscated-ip"
|
||||
WARN_URL_WHITESPACE = "url-whitespace"
|
||||
WARN_FILE_MISSING_SLASH = "file-missing-slash"
|
||||
WARN_FILE_SYSTEM_PATH = "file-system-path"
|
||||
WARN_FTP_MISSING_SLASH = "ftp-missing-slash"
|
||||
|
|
@ -109,6 +110,7 @@ Warnings = {
|
|||
WARN_URL_CONTENT_SIZE_TOO_LARGE: _("The URL content size is too large."),
|
||||
WARN_URL_CONTENT_SIZE_ZERO: _("The URL content size is zero."),
|
||||
WARN_URL_CONTENT_SIZE_UNEQUAL: _("The URL content size and download size are unequal."),
|
||||
WARN_URL_WHITESPACE: _("The URL contains leading or trailing whitespace."),
|
||||
WARN_FILE_MISSING_SLASH: _("The file: URL is missing a trailing slash."),
|
||||
WARN_FILE_SYSTEM_PATH:
|
||||
_("The file: path is not the same as the system specific path."),
|
||||
|
|
|
|||
|
|
@ -39,7 +39,8 @@ from .const import (WARN_URL_EFFECTIVE_URL,
|
|||
WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
|
||||
WARN_URL_ANCHOR_NOT_FOUND, WARN_URL_WARNREGEX_FOUND,
|
||||
WARN_URL_CONTENT_SIZE_TOO_LARGE, WARN_URL_CONTENT_SIZE_ZERO,
|
||||
WARN_URL_CONTENT_SIZE_UNEQUAL, ExcList, ExcSyntaxList, ExcNoCacheList)
|
||||
WARN_URL_CONTENT_SIZE_UNEQUAL, WARN_URL_WHITESPACE,
|
||||
ExcList, ExcSyntaxList, ExcNoCacheList)
|
||||
|
||||
# helper alias
|
||||
unicode_safe = strformat.unicode_safe
|
||||
|
|
@ -105,9 +106,9 @@ class UrlBase (object):
|
|||
@param name: name of url or empty
|
||||
@param url_encoding: encoding of URL or None
|
||||
"""
|
||||
self.reset()
|
||||
self.init(base_ref, base_url, parent_url, recursion_level,
|
||||
aggregate, line, column, name, url_encoding)
|
||||
self.reset()
|
||||
self.check_syntax()
|
||||
|
||||
def init (self, base_ref, base_url, parent_url, recursion_level,
|
||||
|
|
@ -116,8 +117,10 @@ class UrlBase (object):
|
|||
Initialize internal data.
|
||||
"""
|
||||
self.base_ref = base_ref
|
||||
# note that self.base_url must not be modified
|
||||
self.base_url = base_url
|
||||
self.base_url = base_url.strip() if base_url else base_url
|
||||
if self.base_url != base_url:
|
||||
self.add_warning(_("Leading or trailing whitespace in URL `%(url)s'.") %
|
||||
{"url": base_url}, tag=WARN_URL_WHITESPACE)
|
||||
self.parent_url = parent_url
|
||||
self.recursion_level = recursion_level
|
||||
self.aggregate = aggregate
|
||||
|
|
|
|||
|
|
@ -38,52 +38,6 @@ class TestError (LinkCheckTest):
|
|||
]
|
||||
self.direct(url, resultlines)
|
||||
|
||||
def test_leading_whitespace (self):
|
||||
# Leading whitespace
|
||||
url = u" http://www.heise.de/"
|
||||
attrs = self.get_attrs(url=url)
|
||||
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
|
||||
resultlines = [
|
||||
u"url file://%(curdir)s/%(url)s" % attrs,
|
||||
u"cache key %(nurl)s" % attrs,
|
||||
u"real url %(nurl)s" % attrs,
|
||||
u"name %(url)s" % attrs,
|
||||
u"error",
|
||||
]
|
||||
self.direct(url, resultlines)
|
||||
url = u"\nhttp://www.heise.de/"
|
||||
attrs = self.get_attrs(url=url)
|
||||
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
|
||||
resultlines = [
|
||||
u"url file://%(curdir)s/%(url)s" % attrs,
|
||||
u"cache key %(nurl)s" % attrs,
|
||||
u"real url %(nurl)s" % attrs,
|
||||
u"name %(url)s" % attrs,
|
||||
u"error",
|
||||
]
|
||||
self.direct(url, resultlines)
|
||||
|
||||
def test_trailing_whitespace (self):
|
||||
# Trailing whitespace
|
||||
url = u"http://www.heise.de/ "
|
||||
nurl = self.norm(url)
|
||||
resultlines = [
|
||||
u"url %s" % url,
|
||||
u"cache key %s" % nurl,
|
||||
u"real url %s" % nurl,
|
||||
u"error",
|
||||
]
|
||||
self.direct(url, resultlines)
|
||||
url = u"http://www.heise.de/\n"
|
||||
nurl = self.norm(url)
|
||||
resultlines = [
|
||||
u"url %s" % url,
|
||||
u"cache key %s" % nurl,
|
||||
u"real url %s" % nurl,
|
||||
u"error",
|
||||
]
|
||||
self.direct(url, resultlines)
|
||||
|
||||
def test_invalid1 (self):
|
||||
# invalid scheme chars
|
||||
url = u"äöü?:"
|
||||
|
|
|
|||
74
tests/checker/test_whitespace.py
Normal file
74
tests/checker/test_whitespace.py
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2004-2010 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
Test whitespace handling.
|
||||
"""
|
||||
from . import LinkCheckTest
|
||||
|
||||
|
||||
class TestWhitespace (LinkCheckTest):
    """
    Check URLs that carry whitespace before or after the actual address.

    Each case expects the result to be reported for the stripped URL,
    together with a whitespace warning and a "valid" status.
    """

    def test_leading_whitespace (self):
        # Whitespace in front of the URL: once a blank, once a newline.
        for url in (u" http://www.heise.de/", u"\nhttp://www.heise.de/"):
            attrs = self.get_attrs(url=url)
            # The expected URL is the normalized form of the stripped input.
            attrs['nurl'] = self.norm(url.strip())
            expected = [
                u"url %(nurl)s" % attrs,
                u"cache key %(nurl)s" % attrs,
                u"real url %(nurl)s" % attrs,
                u"warning Leading or trailing whitespace in URL `%(url)s'." % attrs,
                u"valid",
            ]
            self.direct(url, expected)

    def test_trailing_whitespace (self):
        # Whitespace behind the URL: once a blank, once a newline.
        for url in (u"http://www.heise.de/ ", u"http://www.heise.de/\n"):
            # The expected URL is the normalized form of the stripped input.
            nurl = self.norm(url.strip())
            expected = [
                u"url %s" % nurl,
                u"cache key %s" % nurl,
                u"real url %s" % nurl,
                u"warning Leading or trailing whitespace in URL `%s'." % url,
                u"valid",
            ]
            self.direct(url, expected)
|
||||
Loading…
Reference in a new issue