Handle a leading query question mark ("?") directly after the host when the URL has no path

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1981 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2004-11-14 11:12:16 +00:00
parent 265f942ba1
commit b150ff6ea8

View file

@ -169,16 +169,8 @@ def idna_encode (host):
return host, False
def url_norm (url):
"""Fix and normalize URL which must be quoted. Supports unicode
hostnames (IDNA encoding) according to RFC 3490.
@return (normed url, idna flag)
"""
urlparts = list(urlparse.urlsplit(url))
# scheme
urlparts[0] = urllib.unquote(urlparts[0]).lower()
# host
def url_fix_host (urlparts):
"""Unquote and fix hostname. Returns is_idn."""
urlparts[1], is_idn = idna_encode(urllib.unquote(urlparts[1]).lower())
# a leading backslash in path causes urlsplit() to add the
# path components up to the first slash to host
@ -193,7 +185,13 @@ def url_norm (url):
urlparts[2] = "%s%s" % (comps, urllib.unquote(urlparts[2]))
urlparts[1] = urlparts[1][:i]
else:
urlparts[2] = urllib.unquote(urlparts[2]) # path
# a leading ? in path causes urlsplit() to add the query to the
# host name
i = urlparts[1].find("?")
if i != -1:
urlparts[1], urlparts[3] = urlparts[1].split('?', 1)
# path
urlparts[2] = urllib.unquote(urlparts[2])
if urlparts[1]:
userpass, host = urllib.splituser(urlparts[1])
if userpass:
@ -210,8 +208,22 @@ def url_norm (url):
urlparts[1] = userpass+host
else:
urlparts[1] = "%s%s:%d" % (userpass, host, port)
def url_norm (url):
"""Fix and normalize URL which must be quoted. Supports unicode
hostnames (IDNA encoding) according to RFC 3490.
@return (normed url, idna flag)
"""
urlparts = list(urlparse.urlsplit(url))
# scheme
urlparts[0] = urllib.unquote(urlparts[0]).lower()
# host (with path or query side effects)
is_idn = url_fix_host(urlparts)
# query
l = []
for k, v in parse_qsl(urlparts[3], True): # query
for k, v in parse_qsl(urlparts[3], True):
k = urllib.quote(k, '/-:,')
if v:
v = urllib.quote(v, '/-:,')