Python3: decode parts before submitting them to urllib.quote()

2026-05-07 22:24:45 +00:00 · 2018-01-05 19:36:52 +01:00 · 2018-01-05 19:36:52 +01:00 · a6643034fb
commit a6643034fb
parent 1c2e6c465e
1 changed files with 17 additions and 10 deletions
--- a/linkcheck/url.py
+++ b/linkcheck/url.py
@ -92,6 +92,16 @@ is_safe_query = re.compile("(?i)^%s$" % _safe_query_pattern).match
 is_safe_fragment = re.compile("(?i)^%s$" % _safe_fragment_pattern).match


+def decode_for_unquote(part):
+    """
+    Return part as text for unquote(): str/str_text is returned as-is,
+    anything else is decoded as UTF-8 with invalid bytes replaced.
+    """
+    if not isinstance(part, (str, str_text)):
+        # Python 3: not str/str_text, so presumably bytes; decode leniently
+        part = part.decode("utf-8", "replace")
+    return part
+
 # snatched from urlparse.py
 def splitparams (path):
    """Split off parameter part from path.
@ -192,9 +202,9 @@ def idna_encode (host):
 def url_fix_host (urlparts):
    """Unquote and fix hostname. Returns is_idn."""
    if not urlparts[1]:
-        urlparts[2] = parse.unquote(urlparts[2])
+        urlparts[2] = parse.unquote(decode_for_unquote(urlparts[2]))
        return False
-    userpass, netloc = parse.splituser(urlparts[1])
+    userpass, netloc = parse.splituser(decode_for_unquote(urlparts[1]))
    if userpass:
        userpass = parse.unquote(userpass)
    netloc, is_idn = idna_encode(parse.unquote(netloc).lower())
@ -208,7 +218,7 @@ def url_fix_host (urlparts):
        if not urlparts[2] or urlparts[2] == '/':
            urlparts[2] = comps
        else:
-            urlparts[2] = "%s%s" % (comps, parse.unquote(urlparts[2]))
+            urlparts[2] = "%s%s" % (comps, parse.unquote(decode_for_unquote(urlparts[2])))
        netloc = netloc[:i]
    else:
        # a leading ? in path causes urlsplit() to add the query to the
@ -217,7 +227,7 @@ def url_fix_host (urlparts):
        if i != -1:
            netloc, urlparts[3] = netloc.split('?', 1)
        # path
-        urlparts[2] = parse.unquote(urlparts[2])
+        urlparts[2] = parse.unquote(decode_for_unquote(urlparts[2]))
    if userpass:
        # append AT for easy concatenation
        userpass += "@"
@ -259,12 +269,9 @@ def url_fix_wayback_query(path):

 def url_parse_query (query, encoding=None):
    """Parse and re-join the given CGI query."""
-    if isinstance(query, str_text):
-        if encoding is None:
-            encoding = url_encoding
-        query = query.encode(encoding, 'ignore')
    # if ? is in the query, split it off, seen at msdn.microsoft.com
    append = ""
+    query = decode_for_unquote(query)
    while '?' in query:
        query, rest = query.rsplit('?', 1)
        append = '?'+url_parse_query(rest)+append
@ -314,7 +321,7 @@ def url_norm (url, encoding=None):
        encode_unicode = False
    urlparts = list(urlparse.urlsplit(url))
    # scheme
-    urlparts[0] = parse.unquote(urlparts[0]).lower()
+    urlparts[0] = parse.unquote(decode_for_unquote(urlparts[0])).lower()
    # mailto: urlsplit is broken
    if urlparts[0] == 'mailto':
        url_fix_mailto_urlsplit(urlparts)
@ -334,7 +341,7 @@ def url_norm (url, encoding=None):
            # fix redundant path parts
            urlparts[2] = collapse_segments(urlparts[2])
    # anchor
-    urlparts[4] = parse.unquote(urlparts[4])
+    urlparts[4] = parse.unquote(decode_for_unquote(urlparts[4]))
    # quote parts again
    urlparts[0] = url_quote_part(urlparts[0], encoding=encoding) # scheme
    urlparts[1] = url_quote_part(urlparts[1], safechars='@:', encoding=encoding) # host