From b150ff6ea81e32a0bc12eccee133fa63c4dfc158 Mon Sep 17 00:00:00 2001 From: calvin Date: Sun, 14 Nov 2004 11:12:16 +0000 Subject: [PATCH] handle leading query exclam after host without path git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1981 e7d03fd6-7b0d-0410-9947-9c21f3af8025 --- linkcheck/url.py | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/linkcheck/url.py b/linkcheck/url.py index cd1ac6a6..a509a327 100644 --- a/linkcheck/url.py +++ b/linkcheck/url.py @@ -169,16 +169,8 @@ def idna_encode (host): return host, False -def url_norm (url): - """Fix and normalize URL which must be quoted. Supports unicode - hostnames (IDNA encoding) according to RFC 3490. - - @return (normed url, idna flag) - """ - urlparts = list(urlparse.urlsplit(url)) - # scheme - urlparts[0] = urllib.unquote(urlparts[0]).lower() - # host +def url_fix_host (urlparts): + """Unquote and fix hostname. Returns is_idn.""" urlparts[1], is_idn = idna_encode(urllib.unquote(urlparts[1]).lower()) # a leading backslash in path causes urlsplit() to add the # path components up to the first slash to host @@ -193,7 +185,13 @@ def url_norm (url): urlparts[2] = "%s%s" % (comps, urllib.unquote(urlparts[2])) urlparts[1] = urlparts[1][:i] else: - urlparts[2] = urllib.unquote(urlparts[2]) # path + # a leading ? in path causes urlsplit() to add the query to the + # host name + i = urlparts[1].find("?") + if i != -1: + urlparts[1], urlparts[3] = urlparts[1].split('?', 1) + # path + urlparts[2] = urllib.unquote(urlparts[2]) if urlparts[1]: userpass, host = urllib.splituser(urlparts[1]) if userpass: @@ -210,8 +208,22 @@ def url_norm (url): urlparts[1] = userpass+host else: urlparts[1] = "%s%s:%d" % (userpass, host, port) + + +def url_norm (url): + """Fix and normalize URL which must be quoted. Supports unicode + hostnames (IDNA encoding) according to RFC 3490. + + @return (normed url, idna flag) + """ + urlparts = list(urlparse.urlsplit(url)) + # scheme + urlparts[0] = urllib.unquote(urlparts[0]).lower() + # host (with path or query side effects) + is_idn = url_fix_host(urlparts) + # query l = [] - for k, v in parse_qsl(urlparts[3], True): # query + for k, v in parse_qsl(urlparts[3], True): k = urllib.quote(k, '/-:,') if v: v = urllib.quote(v, '/-:,')