fix url part quoting

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2853 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2005-10-11 14:08:14 +00:00
parent 2da64736ba
commit 8953766ccd
2 changed files with 28 additions and 13 deletions

View file

@ -361,6 +361,11 @@ class TestUrl (unittest.TestCase):
nurl = url
self.assertEqual(url_norm(url), nurl)
def test_norm_invalid (self):
    """
    Check that non-ASCII characters in a URL are percent-quoted
    as their ISO-8859-1 byte values by url_norm().
    """
    self.assertEqual(url_norm(u"äöü?:"), u"%E4%F6%FC?:")
def test_fixing (self):
"""
Test url fix method.

View file

@ -276,9 +276,9 @@ def url_parse_query (query):
append = ""
l = []
for k, v, sep in parse_qsl(query, True):
k = urllib.quote(k, '/-:,;')
k = url_quote_part(k, '/-:,;')
if v:
v = urllib.quote(v, '/-:,;')
v = url_quote_part(v, '/-:,;')
l.append("%s=%s%s" % (k, v, sep))
elif v is None:
l.append("%s%s" % (k, sep))
@ -319,9 +319,9 @@ def url_norm (url):
# fix redundant path parts
urlparts[2] = collapse_segments(urlparts[2])
# quote parts again
urlparts[0] = urllib.quote(urlparts[0]) # scheme
urlparts[1] = urllib.quote(urlparts[1], '@:') # host
urlparts[2] = urllib.quote(urlparts[2], _nopathquote_chars) # path
urlparts[0] = url_quote_part(urlparts[0]) # scheme
urlparts[1] = url_quote_part(urlparts[1], '@:') # host
urlparts[2] = url_quote_part(urlparts[2], _nopathquote_chars) # path
res = urlparse.urlunsplit(urlparts)
if url.endswith('#') and not urlparts[4]:
# re-append trailing empty fragment
@ -381,29 +381,39 @@ def url_quote (url):
if not url_is_absolute(url):
return document_quote(url)
urlparts = list(urlparse.urlsplit(url))
urlparts[0] = urllib.quote(urlparts[0]) # scheme
urlparts[1] = urllib.quote(urlparts[1], ':') # host
urlparts[2] = urllib.quote(urlparts[2], '/=,') # path
urlparts[3] = urllib.quote(urlparts[3], '&=,') # query
urlparts[0] = url_quote_part(urlparts[0]) # scheme
urlparts[1] = url_quote_part(urlparts[1], ':') # host
urlparts[2] = url_quote_part(urlparts[2], '/=,') # path
urlparts[3] = url_quote_part(urlparts[3], '&=,') # query
l = []
for k, v, sep in parse_qsl(urlparts[3], True): # query
k = urllib.quote(k, '/-:,')
k = url_quote_part(k, '/-:,')
if v:
v = urllib.quote(v, '/-:,')
v = url_quote_part(v, '/-:,')
l.append("%s=%s%s" % (k, v, sep))
else:
l.append("%s%s" % (k, sep))
urlparts[3] = ''.join(l)
urlparts[4] = urllib.quote(urlparts[4]) # anchor
urlparts[4] = url_quote_part(urlparts[4]) # anchor
return urlparse.urlunsplit(urlparts)
def url_quote_part (s, safechars='/'):
    """
    Wrap urllib.quote() to support unicode strings. A unicode string
    is first converted to ISO-8859-1, invalid characters are ignored.
    After that urllib.quote() is called.
    """
    # Byte strings can be quoted directly.
    if not isinstance(s, unicode):
        return urllib.quote(s, safechars)
    # Unicode: encode to Latin-1 first, dropping unencodable characters.
    return urllib.quote(s.encode("iso-8859-1", "ignore"), safechars)
def document_quote (document):
    """
    Quote the document part (everything before '?') of the given URL,
    leaving any query string untouched.

    @param document: URL path, optionally with a query string appended
    @return: document with its path part quoted via url_quote_part()
    """
    doc, query = urllib.splitquery(document)
    # Note: the redundant pre-quoting with urllib.quote() was removed;
    # url_quote_part() handles both str and unicode input.
    doc = url_quote_part(doc, '/=,')
    if query:
        return "%s?%s" % (doc, query)
    return doc