diff --git a/linkcheck/bookmarks/chromium.py b/linkcheck/bookmarks/chromium.py index e825d55f..9e871b71 100644 --- a/linkcheck/bookmarks/chromium.py +++ b/linkcheck/bookmarks/chromium.py @@ -20,8 +20,6 @@ import sys import json from xdg.BaseDirectory import xdg_config_home -from .. import url as urlutil - # Windows filename encoding nt_filename_encoding="mbcs" @@ -71,7 +69,7 @@ def parse_bookmark_data (data): Return iterator for bookmarks of the form (url, name). Bookmarks are not sorted. """ - for url, name in parse_bookmark_json(json.loads(urlutil.decode_for_unquote(data))): + for url, name in parse_bookmark_json(json.loads(data)): yield url, name diff --git a/linkcheck/checker/__init__.py b/linkcheck/checker/__init__.py index 3e6591ea..75ca1820 100644 --- a/linkcheck/checker/__init__.py +++ b/linkcheck/checker/__init__.py @@ -58,11 +58,11 @@ def absolute_url (base_url, base_ref, parent_url): @param parent_url: url of parent document @type parent_url: string or None """ - if base_url and urlutil.url_is_absolute(urlutil.decode_for_unquote(base_url)): + if base_url and urlutil.url_is_absolute(base_url): return base_url - elif base_ref and urlutil.url_is_absolute(urlutil.decode_for_unquote(base_ref)): + elif base_ref and urlutil.url_is_absolute(base_ref): return base_ref - elif parent_url and urlutil.url_is_absolute(urlutil.decode_for_unquote(parent_url)): + elif parent_url and urlutil.url_is_absolute(parent_url): return parent_url return u"" diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py index 10055f4e..76bb4462 100644 --- a/linkcheck/checker/fileurl.py +++ b/linkcheck/checker/fileurl.py @@ -20,7 +20,6 @@ Handle local file: links. import re import os -import sys try: import urlparse except ImportError: @@ -84,7 +83,7 @@ def get_os_filename (path): """Return filesystem path for given URL path.""" if os.name == 'nt': path = prepare_urlpath_for_nt(path) - res = urlrequest.url2pathname(urlutil.decode_for_unquote(fileutil.pathencode(path))) + res = urlrequest.url2pathname(fileutil.pathencode(path)) if os.name == 'nt' and res.endswith(':') and len(res) == 2: # Work around http://bugs.python.org/issue11474 res += os.sep @@ -193,10 +192,7 @@ class FileUrl (urlbase.UrlBase): if self.is_directory(): self.set_result(_("directory")) else: - if sys.version_info.major < 3: - url = fileutil.pathencode(self.url) - else: - url = self.url + url = fileutil.pathencode(self.url) self.url_connection = urlopen(url) self.check_case_sensitivity() diff --git a/linkcheck/checker/mailtourl.py b/linkcheck/checker/mailtourl.py index 50ab2ae7..27ef52b3 100644 --- a/linkcheck/checker/mailtourl.py +++ b/linkcheck/checker/mailtourl.py @@ -158,7 +158,7 @@ class MailtoUrl (urlbase.UrlBase): {"addr": mail}, valid=False, overwrite=False) return # note: be sure to use rsplit since "@" can occur in local part - local, domain = urlutil.decode_for_unquote(mail).rsplit("@", 1) + local, domain = mail.rsplit("@", 1) if not local: self.set_result(_("Missing local part of mail address `%(addr)s'.") % \ {"addr": mail}, valid=False, overwrite=False) @@ -247,7 +247,7 @@ class MailtoUrl (urlbase.UrlBase): from dns.exception import DNSException log.debug(LOG_CHECK, "checking mail address %r", mail) mail = strformat.ascii_safe(mail) - username, domain = urlutil.decode_for_unquote(mail).rsplit('@', 1) + username, domain = mail.rsplit('@', 1) log.debug(LOG_CHECK, "looking up MX mailhost %r", domain) try: answers = resolver.query(domain, 'MX') diff --git a/linkcheck/htmlutil/linkname.py b/linkcheck/htmlutil/linkname.py index 50384a27..9cb65dfa 100644 --- a/linkcheck/htmlutil/linkname.py +++ b/linkcheck/htmlutil/linkname.py @@ -19,7 +19,7 @@ Parse names of title tags and link types. """ import re -from .. import HtmlParser, strformat, url as urlutil +from .. import HtmlParser, strformat imgtag_re = re.compile(r"(?i)\s+alt\s*=\s*"+\ @@ -52,7 +52,6 @@ def image_name (txt): def href_name (txt): """Return the name part of the first name link in txt.""" name = u"" - txt = urlutil.decode_for_unquote(txt) endtag = a_end_search(txt) if not endtag: return name diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py index 32c7c1c2..3d97be37 100644 --- a/linkcheck/htmlutil/linkparse.py +++ b/linkcheck/htmlutil/linkparse.py @@ -93,7 +93,6 @@ c_comment_re = re.compile(r"/\*.*?\*/", re.DOTALL) def strip_c_comments (text): """Remove C/CSS-style comments from text. Note that this method also deliberately removes comments inside of strings.""" - text = urlutil.decode_for_unquote(text) return c_comment_re.sub('', text) diff --git a/linkcheck/url.py b/linkcheck/url.py index ff40646c..9c68da04 100644 --- a/linkcheck/url.py +++ b/linkcheck/url.py @@ -92,16 +92,6 @@ is_safe_query = re.compile("(?i)^%s$" % _safe_query_pattern).match is_safe_fragment = re.compile("(?i)^%s$" % _safe_fragment_pattern).match -def decode_for_unquote(part): - """ - Decode string for unquote function - To string in Python 3, leave it in Python 2 - """ - if not isinstance(part, (str, str_text)): - # Python 3: we probably got bytes - part = part.decode("utf-8", "replace") - return part - # snatched form urlparse.py def splitparams (path): """Split off parameter part from path. @@ -202,9 +192,9 @@ def idna_encode (host): def url_fix_host (urlparts): """Unquote and fix hostname. Returns is_idn.""" if not urlparts[1]: - urlparts[2] = parse.unquote(decode_for_unquote(urlparts[2])) + urlparts[2] = parse.unquote(urlparts[2]) return False - userpass, netloc = parse.splituser(decode_for_unquote(urlparts[1])) + userpass, netloc = parse.splituser(urlparts[1]) if userpass: userpass = parse.unquote(userpass) netloc, is_idn = idna_encode(parse.unquote(netloc).lower()) @@ -218,7 +208,7 @@ def url_fix_host (urlparts): if not urlparts[2] or urlparts[2] == '/': urlparts[2] = comps else: - urlparts[2] = "%s%s" % (comps, parse.unquote(decode_for_unquote(urlparts[2]))) + urlparts[2] = "%s%s" % (comps, parse.unquote(urlparts[2])) netloc = netloc[:i] else: # a leading ? in path causes urlsplit() to add the query to the @@ -227,7 +217,7 @@ def url_fix_host (urlparts): if i != -1: netloc, urlparts[3] = netloc.split('?', 1) # path - urlparts[2] = parse.unquote(decode_for_unquote(urlparts[2])) + urlparts[2] = parse.unquote(urlparts[2]) if userpass: # append AT for easy concatenation userpass += "@" @@ -272,7 +262,6 @@ def url_parse_query (query, encoding=None): """Parse and re-join the given CGI query.""" # if ? is in the query, split it off, seen at msdn.microsoft.com append = "" - query = decode_for_unquote(query) while '?' in query: query, rest = query.rsplit('?', 1) append = '?'+url_parse_query(rest)+append @@ -322,7 +311,7 @@ def url_norm (url, encoding=None): encode_unicode = False urlparts = list(urlparse.urlsplit(url)) # scheme - urlparts[0] = parse.unquote(decode_for_unquote(urlparts[0])).lower() + urlparts[0] = parse.unquote(urlparts[0]).lower() # mailto: urlsplit is broken if urlparts[0] == 'mailto': url_fix_mailto_urlsplit(urlparts) @@ -342,7 +331,7 @@ def url_norm (url, encoding=None): # fix redundant path parts urlparts[2] = collapse_segments(urlparts[2]) # anchor - urlparts[4] = parse.unquote(decode_for_unquote(urlparts[4])) + urlparts[4] = parse.unquote(urlparts[4]) # quote parts again urlparts[0] = url_quote_part(urlparts[0], encoding=encoding) # scheme urlparts[1] = url_quote_part(urlparts[1], safechars='@:', encoding=encoding) # host @@ -351,7 +340,7 @@ def url_norm (url, encoding=None): urlparts[2] = url_fix_wayback_query(urlparts[2]) # unencode colon in http[s]:// in wayback path urlparts[4] = url_quote_part(urlparts[4], safechars="!$&'()*+,-./;=?@_~", encoding=encoding) # anchor res = urlunsplit(urlparts) - if decode_for_unquote(url).endswith('#') and not urlparts[4]: + if url.endswith('#') and not urlparts[4]: # re-append trailing empty fragment res += '#' if encode_unicode: