From bee8023540fdd1bd241efcea8638e2238dde331c Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Mon, 22 Feb 2010 01:06:19 +0100 Subject: [PATCH] Fixed URL encoding --- linkcheck/url.py | 5 ++++- tests/checker/data/http.html.result | 8 ++++---- tests/checker/data/misc.html | 3 --- tests/checker/data/misc.html.result | 6 ------ tests/checker/test_error.py | 18 ++++++++++++------ tests/checker/test_http.py | 1 - 6 files changed, 20 insertions(+), 21 deletions(-) diff --git a/linkcheck/url.py b/linkcheck/url.py index ee7463ba..ac69fa83 100644 --- a/linkcheck/url.py +++ b/linkcheck/url.py @@ -279,6 +279,9 @@ def url_norm (url, encoding=None): url = url.encode('ascii') except UnicodeEncodeError: pass + encode_unicode = True + else: + encode_unicode = False urlparts = list(urlparse.urlsplit(url)) # scheme urlparts[0] = urllib.unquote(urlparts[0]).lower() @@ -312,7 +315,7 @@ def url_norm (url, encoding=None): if url.endswith('#') and not urlparts[4]: # re-append trailing empty fragment res += '#' - if isinstance(url, unicode): + if encode_unicode: res = unicode(res) return (res, is_idn) diff --git a/tests/checker/data/http.html.result b/tests/checker/data/http.html.result index e85fef1f..58bd2bb4 100644 --- a/tests/checker/data/http.html.result +++ b/tests/checker/data/http.html.result @@ -121,11 +121,11 @@ real url http://localhost:8001/?d=directory&p=page1 name should not be cached valid -url http://localhost:8001/?quoted=ü -cache key http://localhost:8001/?quoted=%%FC -real url http://localhost:8001/?quoted=%%FC +url http://localhost:8001/?quoted=ü +cache key http://localhost:8001/?quoted=%%C3%%BC +real url http://localhost:8001/?quoted=%%C3%%BC name html entities -warning Base URL is not properly normed. Normed URL is http://localhost:8001/?quoted=%%FC. +warning Base URL is not properly normed. Normed URL is http://localhost:8001/?quoted=%%C3%%BC. valid url clsid:12345 diff --git a/tests/checker/data/misc.html b/tests/checker/data/misc.html index df7c1a54..59417caa 100644 --- a/tests/checker/data/misc.html +++ b/tests/checker/data/misc.html @@ -3,9 +3,6 @@ - -blubb - diff --git a/tests/checker/data/misc.html.result b/tests/checker/data/misc.html.result index 5fc4a2fc..feec03f3 100644 --- a/tests/checker/data/misc.html.result +++ b/tests/checker/data/misc.html.result @@ -19,12 +19,6 @@ cache key file://%(curdir)s/%(datadir)s/favicon.ico real url file://%(curdir)s/%(datadir)s/favicon.ico valid -url http://imadööfus.org%%0D%%3Cfont%%20face=%%22Verdana,%%20Arial,%%20Helvetica,%%20sans-serif%%22%%20size=%%222%%22%%3E%%3Chttp://www.imadoofus.org%%3E%%20%%0D%%20%%20%%20%%20%%20%%20%%20%%20%%20%%20%%20%%20%%20%%20 -cache key None -real url -name blubb -error - url cache key None real url diff --git a/tests/checker/test_error.py b/tests/checker/test_error.py index 95cda92e..c0d74155 100644 --- a/tests/checker/test_error.py +++ b/tests/checker/test_error.py @@ -31,9 +31,10 @@ class TestError (LinkCheckTest): attrs = self.get_attrs(url=url) attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs) resultlines = [ - u"url %(nurl)s" % attrs, + u"url file://%(curdir)s/%(url)s" % attrs, u"cache key %(nurl)s" % attrs, u"real url %(nurl)s" % attrs, + u"warning Base URL is not properly normed. Normed URL is %(nurl)s." % attrs, u"error", ] self.direct(url, resultlines) @@ -44,10 +45,11 @@ class TestError (LinkCheckTest): attrs = self.get_attrs(url=url) attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs) resultlines = [ - u"url %(nurl)s" % attrs, + u"url file://%(curdir)s/%(url)s" % attrs, u"cache key %(nurl)s" % attrs, u"real url %(nurl)s" % attrs, u"name %(url)s" % attrs, + u"warning Base URL is not properly normed. Normed URL is %(nurl)s." % attrs, u"error", ] self.direct(url, resultlines) @@ -55,10 +57,11 @@ class TestError (LinkCheckTest): attrs = self.get_attrs(url=url) attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs) resultlines = [ - u"url %(nurl)s" % attrs, + u"url file://%(curdir)s/%(url)s" % attrs, u"cache key %(nurl)s" % attrs, u"real url %(nurl)s" % attrs, u"name %(url)s" % attrs, + u"warning Base URL is not properly normed. Normed URL is %(nurl)s." % attrs, u"error", ] self.direct(url, resultlines) @@ -92,10 +95,11 @@ class TestError (LinkCheckTest): attrs = self.get_attrs(url=url) attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs) resultlines = [ - u"url %(nurl)s" % attrs, + u"url file://%(curdir)s/%(url)s" % attrs, u"cache key %(nurl)s" % attrs, u"real url %(nurl)s" % attrs, u"name %(url)s" % attrs, + u"warning Base URL is not properly normed. Normed URL is %(nurl)s." % attrs, u"error", ] self.direct(url, resultlines) @@ -106,10 +110,11 @@ class TestError (LinkCheckTest): attrs = self.get_attrs(url=url) attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs) resultlines = [ - u"url %(nurl)s" % attrs, + u"url file://%(curdir)s/%(url)s" % attrs, u"cache key %(nurl)s" % attrs, u"real url %(nurl)s" % attrs, u"name %(url)s" % attrs, + u"warning Base URL is not properly normed. Normed URL is %(nurl)s." % attrs, u"error", ] self.direct(url, resultlines) @@ -120,10 +125,11 @@ class TestError (LinkCheckTest): attrs = self.get_attrs(url=url) attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs) resultlines = [ - u"url %(nurl)s" % attrs, + u"url file://%(curdir)s/%(url)s" % attrs, u"cache key %(nurl)s" % attrs, u"real url %(nurl)s" % attrs, u"name %(url)s" % attrs, + u"warning Base URL is not properly normed. Normed URL is %(nurl)s." % attrs, u"error", ] self.direct(url, resultlines) diff --git a/tests/checker/test_http.py b/tests/checker/test_http.py index 488c86f0..8e0dd04c 100644 --- a/tests/checker/test_http.py +++ b/tests/checker/test_http.py @@ -133,7 +133,6 @@ class TestHttp (httpserver.HttpServerTest): u"url http://www.example.org/", u"cache key http://www.example.org/", u"real url http://www.example.org/", - u"warning Access denied by robots.txt, checked only syntax.", u"valid", ] self.direct(url, resultlines, recursionlevel=1)