diff --git a/linkcheck/checker/__init__.py b/linkcheck/checker/__init__.py index 5c0f4cc0..b3c2164e 100644 --- a/linkcheck/checker/__init__.py +++ b/linkcheck/checker/__init__.py @@ -21,7 +21,7 @@ import os import html import urllib.parse -from .. import strformat, url as urlutil, log, LOG_CHECK +from .. import url as urlutil, log, LOG_CHECK MAX_FILESIZE = 1024 * 1024 * 10 # 10MB @@ -103,16 +103,10 @@ def get_url_from( @type extern: tuple(int, int) or None """ if base_url is not None: - base_url = strformat.unicode_safe(base_url) # left strip for detection of URL scheme base_url_stripped = base_url.lstrip() else: base_url_stripped = base_url - if parent_url is not None: - parent_url = strformat.unicode_safe(parent_url) - if base_ref is not None: - base_ref = strformat.unicode_safe(base_ref) - name = strformat.unicode_safe(name) url = absolute_url(base_url_stripped, base_ref, parent_url).lower() if ":" in url: scheme = url.split(":", 1)[0].lower() diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index bb67b4d4..c8b6d149 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -16,6 +16,9 @@ """ Handle http links. """ + +import urllib.parse + import requests # The validity of SSL certs is ignored to be able @@ -34,7 +37,6 @@ import re from .. 
import ( log, LOG_CHECK, - strformat, mimeutil, url as urlutil, LinkCheckerError, @@ -50,9 +52,6 @@ from requests.sessions import REDIRECT_STATI HEADER_ENCODING = "iso-8859-1" HTTP_SCHEMAS = ('http://', 'https://') -# helper alias -unicode_safe = strformat.unicode_safe - # match for robots meta element content attribute nofollow_re = re.compile(r"\bnofollow\b", re.IGNORECASE) @@ -273,7 +272,7 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): # Reset extern and recalculate self.extern = None self.set_extern(newurl) - self.urlparts = strformat.url_unicode_split(newurl) + self.urlparts = list(urllib.parse.urlsplit(newurl)) self.build_url_parts() self.url_connection = response self.headers = response.headers @@ -286,15 +285,15 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): self.aggregate.plugin_manager.run_connection_plugins(self) def getheader(self, name, default=None): - """Get decoded header value. + """Get header value. - @return: decoded header value or default of not found - @rtype: unicode or type of default + @return: header value or default if not found + @rtype: str """ value = self.headers.get(name) if value is None: return default - return unicode_safe(value, encoding=HEADER_ENCODING) + return value def check_response(self): """Check final result and log it.""" @@ -353,7 +352,7 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): value = self.headers['Refresh'].strip() mo = refresh_re.match(value) if mo: - url = unicode_safe(mo.group("url")) + url = mo.group("url") name = "Refresh: header" self.add_url(url, name=name) if 'Content-Location' in self.headers: diff --git a/linkcheck/checker/internpaturl.py b/linkcheck/checker/internpaturl.py index ceeeced0..5e54392d 100644 --- a/linkcheck/checker/internpaturl.py +++ b/linkcheck/checker/internpaturl.py @@ -16,15 +16,18 @@ """ Intern URL pattern support. """ + import re +import urllib.parse + from . import urlbase, absolute_url -from .. 
import strformat, url as urlutil +from .. import url as urlutil def get_intern_pattern(url): """Return intern pattern for given URL. Redirections to the same domain with or without "www." prepended are allowed.""" - parts = strformat.url_unicode_split(url) + parts = urllib.parse.urlsplit(url) scheme = parts[0].lower() domain = parts[1].lower() domain, is_idn = urlutil.idna_encode(domain) diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index 8a77994b..c1a1f6b8 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -53,9 +53,6 @@ from .const import ( ) from ..url import url_fix_wayback_query -# helper alias -unicode_safe = strformat.unicode_safe - # schemes that are invalid with an empty hostname scheme_requires_host = ("ftp", "http", "telnet") @@ -389,7 +386,7 @@ class UrlBase: self.build_url() self.check_url_warnings() except tuple(ExcSyntaxList) as msg: - self.set_result(unicode_safe(msg), valid=False) + self.set_result(str(msg), valid=False) else: self.set_cache_url() @@ -440,7 +437,7 @@ class UrlBase: urlparts[2] = url_fix_wayback_query(urlparts[2]) self.url = urlutil.urlunsplit(urlparts) # split into (modifiable) list - self.urlparts = strformat.url_unicode_split(self.url) + self.urlparts = list(urllib.parse.urlsplit(self.url)) self.build_url_parts() # and unsplit again self.url = urlutil.urlunsplit(self.urlparts) @@ -529,7 +526,7 @@ class UrlBase: 'host': self.host, 'msg': value, } - self.set_result(unicode_safe(value), valid=False) + self.set_result(value, valid=False) def check_content(self): """Check content of URL. @@ -776,7 +773,7 @@ class UrlBase: """ Return serialized url check data as unicode string. 
""" - return unicode_safe(sep).join( + return sep.join( [ "%s link" % self.scheme, "base_url=%r" % self.base_url, diff --git a/linkcheck/logger/csvlog.py b/linkcheck/logger/csvlog.py index 76cfc493..943b3aa5 100644 --- a/linkcheck/logger/csvlog.py +++ b/linkcheck/logger/csvlog.py @@ -21,7 +21,6 @@ from io import StringIO import os from . import _Logger -from .. import strformat Columns = ( "urlname", @@ -134,7 +133,7 @@ class CSVLogger(_Logger): row.append(url_data.level) if self.has_part("modified"): row.append(self.format_modified(url_data.modified)) - self.writerow(map(strformat.unicode_safe, row)) + self.writerow(row) self.flush() def writerow(self, row): diff --git a/linkcheck/plugins/locationinfo.py b/linkcheck/plugins/locationinfo.py index b7045720..502ae090 100644 --- a/linkcheck/plugins/locationinfo.py +++ b/linkcheck/plugins/locationinfo.py @@ -22,7 +22,6 @@ import sys import socket from ..lock import get_lock from ..decorators import synchronized -from ..strformat import unicode_safe from .. import log, LOG_PLUGIN @@ -112,9 +111,9 @@ def get_location(host): return None value = "" if record and record.get("city"): - value += unicode_safe(record["city"]) + value += record["city"] if record and record.get("country_name"): if value: value += ", " - value += unicode_safe(record["country_name"]) + value += record["country_name"] return value diff --git a/linkcheck/plugins/parsepdf.py b/linkcheck/plugins/parsepdf.py index bdff0f87..8659fdfd 100755 --- a/linkcheck/plugins/parsepdf.py +++ b/linkcheck/plugins/parsepdf.py @@ -30,7 +30,7 @@ except ImportError: has_pdflib = False else: has_pdflib = True -from .. import log, LOG_PLUGIN, strformat +from .. 
import log, LOG_PLUGIN def search_url(obj, url_data, pageno, seen_objs): @@ -44,14 +44,7 @@ def search_url(obj, url_data, pageno, seen_objs): if isinstance(obj, dict): for key, value in obj.items(): if key == 'URI': - if isinstance(value, str): - url = value - else: - # URIs should be 7bit ASCII encoded, but be safe and encode - # to unicode - # XXX this does not use an optional specified base URL - url = strformat.unicode_safe(value) - url_data.add_url(url, page=pageno) + url_data.add_url(value, page=pageno) else: search_url(value, url_data, pageno, seen_objs) elif isinstance(obj, list): diff --git a/linkcheck/strformat.py b/linkcheck/strformat.py index 923cba7d..62338042 100644 --- a/linkcheck/strformat.py +++ b/linkcheck/strformat.py @@ -26,10 +26,8 @@ import re import textwrap import os import time -import urllib.parse import locale import pydoc -from . import i18n # some handy time constants SECONDS_PER_MINUTE = 60 @@ -37,25 +35,6 @@ SECONDS_PER_HOUR = 60 * SECONDS_PER_MINUTE SECONDS_PER_DAY = 24 * SECONDS_PER_HOUR -def unicode_safe(s, encoding=i18n.default_encoding, errors='replace'): - """Get unicode string without raising encoding errors. Unknown - characters of the given encoding will be ignored. - - @param s: the string to be decoded - @type s: any object except None - @return: if s is already unicode, return s unchanged; else return - decoded unicode string of str(s) - @rtype: unicode - """ - assert s is not None, "argument to unicode_safe was None" - if isinstance(s, str): - # s is already unicode, nothing to do - return s - elif isinstance(s, bytes): - return s.decode(encoding, errors) - return str(s) - - def ascii_safe(s): """Get ASCII string without raising encoding errors. Unknown characters of the given encoding will be ignored. 
@@ -70,11 +49,6 @@ def ascii_safe(s): return s -def url_unicode_split(url): - """Like urllib.parse.urlsplit(), but always returning unicode parts.""" - return [unicode_safe(s) for s in urllib.parse.urlsplit(url)] - - def unquote(s, matching=False): """Remove leading and ending single and double quotes. The quotes need to match if matching is True. Only one quote from each diff --git a/tests/test_strformat.py b/tests/test_strformat.py index d1c7d1a5..bfbc08e0 100644 --- a/tests/test_strformat.py +++ b/tests/test_strformat.py @@ -126,10 +126,6 @@ class TestStrFormat(unittest.TestCase): duration(60 * 60 * 24 * 365 + 60 * 60 * 24 + 2), "1 year, 1 day" ) - def test_unicode_safe(self): - unicode_safe = linkcheck.strformat.unicode_safe - self.assertEqual(unicode_safe("a"), "a") - def test_ascii_safe(self): ascii_safe = linkcheck.strformat.ascii_safe self.assertEqual(ascii_safe("a"), "a")