Remove strformat.unicode_safe() and strformat.url_unicode_split()

All strings support Unicode in Python 3.
This commit is contained in:
Chris Mayo 2020-07-07 17:25:28 +01:00
parent 4cb5b6f2fa
commit d91a328224
9 changed files with 24 additions and 70 deletions

View file

@@ -21,7 +21,7 @@ import os
import html
import urllib.parse
from .. import strformat, url as urlutil, log, LOG_CHECK
from .. import url as urlutil, log, LOG_CHECK
MAX_FILESIZE = 1024 * 1024 * 10 # 10MB
@@ -103,16 +103,10 @@ def get_url_from(
@type extern: tuple(int, int) or None
"""
if base_url is not None:
base_url = strformat.unicode_safe(base_url)
# left strip for detection of URL scheme
base_url_stripped = base_url.lstrip()
else:
base_url_stripped = base_url
if parent_url is not None:
parent_url = strformat.unicode_safe(parent_url)
if base_ref is not None:
base_ref = strformat.unicode_safe(base_ref)
name = strformat.unicode_safe(name)
url = absolute_url(base_url_stripped, base_ref, parent_url).lower()
if ":" in url:
scheme = url.split(":", 1)[0].lower()

View file

@@ -16,6 +16,9 @@
"""
Handle http links.
"""
import urllib.parse
import requests
# The validity of SSL certs is ignored to be able
@@ -34,7 +37,6 @@ import re
from .. import (
log,
LOG_CHECK,
strformat,
mimeutil,
url as urlutil,
LinkCheckerError,
@@ -50,9 +52,6 @@ from requests.sessions import REDIRECT_STATI
HEADER_ENCODING = "iso-8859-1"
HTTP_SCHEMAS = ('http://', 'https://')
# helper alias
unicode_safe = strformat.unicode_safe
# match for robots meta element content attribute
nofollow_re = re.compile(r"\bnofollow\b", re.IGNORECASE)
@@ -273,7 +272,7 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
# Reset extern and recalculate
self.extern = None
self.set_extern(newurl)
self.urlparts = strformat.url_unicode_split(newurl)
self.urlparts = urllib.parse.urlsplit(newurl)
self.build_url_parts()
self.url_connection = response
self.headers = response.headers
@@ -286,15 +285,15 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
self.aggregate.plugin_manager.run_connection_plugins(self)
def getheader(self, name, default=None):
"""Get decoded header value.
"""Get header value.
@return: decoded header value or default if not found
@rtype: unicode or type of default
@return: header value or default if not found
@rtype: str
"""
value = self.headers.get(name)
if value is None:
return default
return unicode_safe(value, encoding=HEADER_ENCODING)
return value
def check_response(self):
"""Check final result and log it."""
@@ -353,7 +352,7 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
value = self.headers['Refresh'].strip()
mo = refresh_re.match(value)
if mo:
url = unicode_safe(mo.group("url"))
url = mo.group("url")
name = "Refresh: header"
self.add_url(url, name=name)
if 'Content-Location' in self.headers:

View file

@@ -16,15 +16,18 @@
"""
Intern URL pattern support.
"""
import re
import urllib.parse
from . import urlbase, absolute_url
from .. import strformat, url as urlutil
from .. import url as urlutil
def get_intern_pattern(url):
"""Return intern pattern for given URL. Redirections to the same
domain with or without "www." prepended are allowed."""
parts = strformat.url_unicode_split(url)
parts = urllib.parse.urlsplit(url)
scheme = parts[0].lower()
domain = parts[1].lower()
domain, is_idn = urlutil.idna_encode(domain)

View file

@@ -53,9 +53,6 @@ from .const import (
)
from ..url import url_fix_wayback_query
# helper alias
unicode_safe = strformat.unicode_safe
# schemes that are invalid with an empty hostname
scheme_requires_host = ("ftp", "http", "telnet")
@@ -389,7 +386,7 @@ class UrlBase:
self.build_url()
self.check_url_warnings()
except tuple(ExcSyntaxList) as msg:
self.set_result(unicode_safe(msg), valid=False)
self.set_result(msg, valid=False)
else:
self.set_cache_url()
@@ -440,7 +437,7 @@ class UrlBase:
urlparts[2] = url_fix_wayback_query(urlparts[2])
self.url = urlutil.urlunsplit(urlparts)
# split into (modifiable) list
self.urlparts = strformat.url_unicode_split(self.url)
self.urlparts = urllib.parse.urlsplit(self.url)
self.build_url_parts()
# and unsplit again
self.url = urlutil.urlunsplit(self.urlparts)
@@ -529,7 +526,7 @@ class UrlBase:
'host': self.host,
'msg': value,
}
self.set_result(unicode_safe(value), valid=False)
self.set_result(value, valid=False)
def check_content(self):
"""Check content of URL.
@@ -776,7 +773,7 @@ class UrlBase:
"""
Return serialized url check data as unicode string.
"""
return unicode_safe(sep).join(
return sep.join(
[
"%s link" % self.scheme,
"base_url=%r" % self.base_url,

View file

@@ -21,7 +21,6 @@ from io import StringIO
import os
from . import _Logger
from .. import strformat
Columns = (
"urlname",
@@ -134,7 +133,7 @@ class CSVLogger(_Logger):
row.append(url_data.level)
if self.has_part("modified"):
row.append(self.format_modified(url_data.modified))
self.writerow(map(strformat.unicode_safe, row))
self.writerow(row)
self.flush()
def writerow(self, row):

View file

@@ -22,7 +22,6 @@ import sys
import socket
from ..lock import get_lock
from ..decorators import synchronized
from ..strformat import unicode_safe
from .. import log, LOG_PLUGIN
@@ -112,9 +111,9 @@ def get_location(host):
return None
value = ""
if record and record.get("city"):
value += unicode_safe(record["city"])
value += record["city"]
if record and record.get("country_name"):
if value:
value += ", "
value += unicode_safe(record["country_name"])
value += record["country_name"]
return value

View file

@@ -30,7 +30,7 @@ except ImportError:
has_pdflib = False
else:
has_pdflib = True
from .. import log, LOG_PLUGIN, strformat
from .. import log, LOG_PLUGIN
def search_url(obj, url_data, pageno, seen_objs):
@@ -44,14 +44,7 @@ def search_url(obj, url_data, pageno, seen_objs):
if isinstance(obj, dict):
for key, value in obj.items():
if key == 'URI':
if isinstance(value, str):
url = value
else:
# URIs should be 7bit ASCII encoded, but be safe and encode
# to unicode
# XXX this does not use an optional specified base URL
url = strformat.unicode_safe(value)
url_data.add_url(url, page=pageno)
url_data.add_url(value, page=pageno)
else:
search_url(value, url_data, pageno, seen_objs)
elif isinstance(obj, list):

View file

@@ -26,10 +26,8 @@ import re
import textwrap
import os
import time
import urllib.parse
import locale
import pydoc
from . import i18n
# some handy time constants
SECONDS_PER_MINUTE = 60
@@ -37,25 +35,6 @@ SECONDS_PER_HOUR = 60 * SECONDS_PER_MINUTE
SECONDS_PER_DAY = 24 * SECONDS_PER_HOUR
def unicode_safe(s, encoding=i18n.default_encoding, errors='replace'):
"""Get unicode string without raising encoding errors. Unknown
characters of the given encoding will be ignored.
@param s: the string to be decoded
@type s: any object except None
@return: if s is already unicode, return s unchanged; else return
decoded unicode string of str(s)
@rtype: unicode
"""
assert s is not None, "argument to unicode_safe was None"
if isinstance(s, str):
# s is already unicode, nothing to do
return s
elif isinstance(s, bytes):
return s.decode(encoding, errors)
return str(s)
def ascii_safe(s):
"""Get ASCII string without raising encoding errors. Unknown
characters of the given encoding will be ignored.
@@ -70,11 +49,6 @@ def ascii_safe(s):
return s
def url_unicode_split(url):
"""Like urllib.parse.urlsplit(), but always returning unicode parts."""
return [unicode_safe(s) for s in urllib.parse.urlsplit(url)]
def unquote(s, matching=False):
"""Remove leading and ending single and double quotes.
The quotes need to match if matching is True. Only one quote from each

View file

@@ -126,10 +126,6 @@ class TestStrFormat(unittest.TestCase):
duration(60 * 60 * 24 * 365 + 60 * 60 * 24 + 2), "1 year, 1 day"
)
def test_unicode_safe(self):
unicode_safe = linkcheck.strformat.unicode_safe
self.assertEqual(unicode_safe("a"), "a")
def test_ascii_safe(self):
ascii_safe = linkcheck.strformat.ascii_safe
self.assertEqual(ascii_safe("a"), "a")