Fixed URL encoding

This commit is contained in:
Bastian Kleineidam 2010-02-22 01:06:19 +01:00
parent 9bc4772ba4
commit bee8023540
6 changed files with 20 additions and 21 deletions

View file

@ -279,6 +279,9 @@ def url_norm (url, encoding=None):
url = url.encode('ascii')
except UnicodeEncodeError:
pass
encode_unicode = True
else:
encode_unicode = False
urlparts = list(urlparse.urlsplit(url))
# scheme
urlparts[0] = urllib.unquote(urlparts[0]).lower()
@ -312,7 +315,7 @@ def url_norm (url, encoding=None):
if url.endswith('#') and not urlparts[4]:
# re-append trailing empty fragment
res += '#'
if isinstance(url, unicode):
if encode_unicode:
res = unicode(res)
return (res, is_idn)

View file

@ -121,11 +121,11 @@ real url http://localhost:8001/?d=directory&p=page1
name should not be cached
valid
url http://localhost:8001/?quoted=ü
cache key http://localhost:8001/?quoted=%%FC
real url http://localhost:8001/?quoted=%%FC
url http://localhost:8001/?quoted=ü
cache key http://localhost:8001/?quoted=%%C3%%BC
real url http://localhost:8001/?quoted=%%C3%%BC
name html entities
warning Base URL is not properly normed. Normed URL is http://localhost:8001/?quoted=%%FC.
warning Base URL is not properly normed. Normed URL is http://localhost:8001/?quoted=%%C3%%BC.
valid
url clsid:12345

View file

@ -3,9 +3,6 @@
<meta rel="SHORTCUT ICON" href="favicon.ico">
<meta rel="ICON" href="favicon.ico">
<!-- unparsable domain name -->
<a href="http://imadööfus.org%0D%3Cfont%20face=%22Verdana,%20Arial,%20Helvetica,%20sans-serif%22%20size=%222%22%3E%3Chttp://www.imadoofus.org%3E%20%0D%20%20%20%20%20%20%20%20%20%20%20%20%20%20">blubb</a>
<!-- empty tag -->
<tr background>

View file

@ -19,12 +19,6 @@ cache key file://%(curdir)s/%(datadir)s/favicon.ico
real url file://%(curdir)s/%(datadir)s/favicon.ico
valid
url http://imadööfus.org%%0D%%3Cfont%%20face=%%22Verdana,%%20Arial,%%20Helvetica,%%20sans-serif%%22%%20size=%%222%%22%%3E%%3Chttp://www.imadoofus.org%%3E%%20%%0D%%20%%20%%20%%20%%20%%20%%20%%20%%20%%20%%20%%20%%20%%20
cache key None
real url
name blubb
error
url
cache key None
real url

View file

@ -31,9 +31,10 @@ class TestError (LinkCheckTest):
attrs = self.get_attrs(url=url)
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
resultlines = [
u"url %(nurl)s" % attrs,
u"url file://%(curdir)s/%(url)s" % attrs,
u"cache key %(nurl)s" % attrs,
u"real url %(nurl)s" % attrs,
u"warning Base URL is not properly normed. Normed URL is %(nurl)s." % attrs,
u"error",
]
self.direct(url, resultlines)
@ -44,10 +45,11 @@ class TestError (LinkCheckTest):
attrs = self.get_attrs(url=url)
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
resultlines = [
u"url %(nurl)s" % attrs,
u"url file://%(curdir)s/%(url)s" % attrs,
u"cache key %(nurl)s" % attrs,
u"real url %(nurl)s" % attrs,
u"name %(url)s" % attrs,
u"warning Base URL is not properly normed. Normed URL is %(nurl)s." % attrs,
u"error",
]
self.direct(url, resultlines)
@ -55,10 +57,11 @@ class TestError (LinkCheckTest):
attrs = self.get_attrs(url=url)
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
resultlines = [
u"url %(nurl)s" % attrs,
u"url file://%(curdir)s/%(url)s" % attrs,
u"cache key %(nurl)s" % attrs,
u"real url %(nurl)s" % attrs,
u"name %(url)s" % attrs,
u"warning Base URL is not properly normed. Normed URL is %(nurl)s." % attrs,
u"error",
]
self.direct(url, resultlines)
@ -92,10 +95,11 @@ class TestError (LinkCheckTest):
attrs = self.get_attrs(url=url)
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
resultlines = [
u"url %(nurl)s" % attrs,
u"url file://%(curdir)s/%(url)s" % attrs,
u"cache key %(nurl)s" % attrs,
u"real url %(nurl)s" % attrs,
u"name %(url)s" % attrs,
u"warning Base URL is not properly normed. Normed URL is %(nurl)s." % attrs,
u"error",
]
self.direct(url, resultlines)
@ -106,10 +110,11 @@ class TestError (LinkCheckTest):
attrs = self.get_attrs(url=url)
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
resultlines = [
u"url %(nurl)s" % attrs,
u"url file://%(curdir)s/%(url)s" % attrs,
u"cache key %(nurl)s" % attrs,
u"real url %(nurl)s" % attrs,
u"name %(url)s" % attrs,
u"warning Base URL is not properly normed. Normed URL is %(nurl)s." % attrs,
u"error",
]
self.direct(url, resultlines)
@ -120,10 +125,11 @@ class TestError (LinkCheckTest):
attrs = self.get_attrs(url=url)
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
resultlines = [
u"url %(nurl)s" % attrs,
u"url file://%(curdir)s/%(url)s" % attrs,
u"cache key %(nurl)s" % attrs,
u"real url %(nurl)s" % attrs,
u"name %(url)s" % attrs,
u"warning Base URL is not properly normed. Normed URL is %(nurl)s." % attrs,
u"error",
]
self.direct(url, resultlines)

View file

@ -133,7 +133,6 @@ class TestHttp (httpserver.HttpServerTest):
u"url http://www.example.org/",
u"cache key http://www.example.org/",
u"real url http://www.example.org/",
u"warning Access denied by robots.txt, checked only syntax.",
u"valid",
]
self.direct(url, resultlines, recursionlevel=1)