mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-17 06:20:27 +00:00
Python3: decode parts before submitting them to urllib.quote()
This commit is contained in:
parent
1c2e6c465e
commit
a6643034fb
1 changed files with 17 additions and 10 deletions
|
|
@ -92,6 +92,16 @@ is_safe_query = re.compile("(?i)^%s$" % _safe_query_pattern).match
|
|||
is_safe_fragment = re.compile("(?i)^%s$" % _safe_fragment_pattern).match
|
||||
|
||||
|
||||
def decode_for_unquote(part):
    """Return *part* as a text string suitable for the unquote function.

    On Python 3 a bytes value is decoded as UTF-8 (undecodable bytes are
    replaced); anything already textual is passed through unchanged, which
    also covers Python 2 strings.
    """
    if isinstance(part, (str, str_text)):
        # Already text (or a Python 2 native string): nothing to do.
        return part
    # Python 3: we probably got bytes
    return part.decode("utf-8", "replace")
|
||||
|
||||
# snatched form urlparse.py
|
||||
def splitparams (path):
|
||||
"""Split off parameter part from path.
|
||||
|
|
@ -192,9 +202,9 @@ def idna_encode (host):
|
|||
def url_fix_host (urlparts):
|
||||
"""Unquote and fix hostname. Returns is_idn."""
|
||||
if not urlparts[1]:
|
||||
urlparts[2] = parse.unquote(urlparts[2])
|
||||
urlparts[2] = parse.unquote(decode_for_unquote(urlparts[2]))
|
||||
return False
|
||||
userpass, netloc = parse.splituser(urlparts[1])
|
||||
userpass, netloc = parse.splituser(decode_for_unquote(urlparts[1]))
|
||||
if userpass:
|
||||
userpass = parse.unquote(userpass)
|
||||
netloc, is_idn = idna_encode(parse.unquote(netloc).lower())
|
||||
|
|
@ -208,7 +218,7 @@ def url_fix_host (urlparts):
|
|||
if not urlparts[2] or urlparts[2] == '/':
|
||||
urlparts[2] = comps
|
||||
else:
|
||||
urlparts[2] = "%s%s" % (comps, parse.unquote(urlparts[2]))
|
||||
urlparts[2] = "%s%s" % (comps, parse.unquote(decode_for_unquote(urlparts[2])))
|
||||
netloc = netloc[:i]
|
||||
else:
|
||||
# a leading ? in path causes urlsplit() to add the query to the
|
||||
|
|
@ -217,7 +227,7 @@ def url_fix_host (urlparts):
|
|||
if i != -1:
|
||||
netloc, urlparts[3] = netloc.split('?', 1)
|
||||
# path
|
||||
urlparts[2] = parse.unquote(urlparts[2])
|
||||
urlparts[2] = parse.unquote(decode_for_unquote(urlparts[2]))
|
||||
if userpass:
|
||||
# append AT for easy concatenation
|
||||
userpass += "@"
|
||||
|
|
@ -259,12 +269,9 @@ def url_fix_wayback_query(path):
|
|||
|
||||
def url_parse_query (query, encoding=None):
|
||||
"""Parse and re-join the given CGI query."""
|
||||
if isinstance(query, str_text):
|
||||
if encoding is None:
|
||||
encoding = url_encoding
|
||||
query = query.encode(encoding, 'ignore')
|
||||
# if ? is in the query, split it off, seen at msdn.microsoft.com
|
||||
append = ""
|
||||
query = decode_for_unquote(query)
|
||||
while '?' in query:
|
||||
query, rest = query.rsplit('?', 1)
|
||||
append = '?'+url_parse_query(rest)+append
|
||||
|
|
@ -314,7 +321,7 @@ def url_norm (url, encoding=None):
|
|||
encode_unicode = False
|
||||
urlparts = list(urlparse.urlsplit(url))
|
||||
# scheme
|
||||
urlparts[0] = parse.unquote(urlparts[0]).lower()
|
||||
urlparts[0] = parse.unquote(decode_for_unquote(urlparts[0])).lower()
|
||||
# mailto: urlsplit is broken
|
||||
if urlparts[0] == 'mailto':
|
||||
url_fix_mailto_urlsplit(urlparts)
|
||||
|
|
@ -334,7 +341,7 @@ def url_norm (url, encoding=None):
|
|||
# fix redundant path parts
|
||||
urlparts[2] = collapse_segments(urlparts[2])
|
||||
# anchor
|
||||
urlparts[4] = parse.unquote(urlparts[4])
|
||||
urlparts[4] = parse.unquote(decode_for_unquote(urlparts[4]))
|
||||
# quote parts again
|
||||
urlparts[0] = url_quote_part(urlparts[0], encoding=encoding) # scheme
|
||||
urlparts[1] = url_quote_part(urlparts[1], safechars='@:', encoding=encoding) # host
|
||||
|
|
|
|||
Loading…
Reference in a new issue