Remove strformat.unicode_safe() and strformat.url_unicode_split()

All strings support Unicode in Python 3.
This commit is contained in:
Chris Mayo 2020-07-07 17:25:28 +01:00
parent 4cb5b6f2fa
commit d91a328224
9 changed files with 24 additions and 70 deletions

View file

@@ -21,7 +21,7 @@ import os
import html
import urllib.parse
from .. import strformat, url as urlutil, log, LOG_CHECK
from .. import url as urlutil, log, LOG_CHECK
MAX_FILESIZE = 1024 * 1024 * 10 # 10MB
@@ -103,16 +103,10 @@ def get_url_from(
@type extern: tuple(int, int) or None
"""
if base_url is not None:
base_url = strformat.unicode_safe(base_url)
# left strip for detection of URL scheme
base_url_stripped = base_url.lstrip()
else:
base_url_stripped = base_url
if parent_url is not None:
parent_url = strformat.unicode_safe(parent_url)
if base_ref is not None:
base_ref = strformat.unicode_safe(base_ref)
name = strformat.unicode_safe(name)
url = absolute_url(base_url_stripped, base_ref, parent_url).lower()
if ":" in url:
scheme = url.split(":", 1)[0].lower()

View file

@@ -16,6 +16,9 @@
"""
Handle http links.
"""
import urllib.parse
import requests
# The validity of SSL certs is ignored to be able
@@ -34,7 +37,6 @@ import re
from .. import (
log,
LOG_CHECK,
strformat,
mimeutil,
url as urlutil,
LinkCheckerError,
@@ -50,9 +52,6 @@ from requests.sessions import REDIRECT_STATI
HEADER_ENCODING = "iso-8859-1"
HTTP_SCHEMAS = ('http://', 'https://')
# helper alias
unicode_safe = strformat.unicode_safe
# match for robots meta element content attribute
nofollow_re = re.compile(r"\bnofollow\b", re.IGNORECASE)
@@ -273,7 +272,7 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
# Reset extern and recalculate
self.extern = None
self.set_extern(newurl)
self.urlparts = strformat.url_unicode_split(newurl)
self.urlparts = urllib.parse.urlsplit(newurl)
self.build_url_parts()
self.url_connection = response
self.headers = response.headers
@@ -286,15 +285,15 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
self.aggregate.plugin_manager.run_connection_plugins(self)
def getheader(self, name, default=None):
"""Get decoded header value.
"""Get header value.
@return: decoded header value or default if not found
@rtype: unicode or type of default
@return: header value or default if not found
@rtype: str
"""
value = self.headers.get(name)
if value is None:
return default
return unicode_safe(value, encoding=HEADER_ENCODING)
return value
def check_response(self):
"""Check final result and log it."""
@@ -353,7 +352,7 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
value = self.headers['Refresh'].strip()
mo = refresh_re.match(value)
if mo:
url = unicode_safe(mo.group("url"))
url = mo.group("url")
name = "Refresh: header"
self.add_url(url, name=name)
if 'Content-Location' in self.headers:

View file

@@ -16,15 +16,18 @@
"""
Intern URL pattern support.
"""
import re
import urllib.parse
from . import urlbase, absolute_url
from .. import strformat, url as urlutil
from .. import url as urlutil
def get_intern_pattern(url):
"""Return intern pattern for given URL. Redirections to the same
domain with or without "www." prepended are allowed."""
parts = strformat.url_unicode_split(url)
parts = urllib.parse.urlsplit(url)
scheme = parts[0].lower()
domain = parts[1].lower()
domain, is_idn = urlutil.idna_encode(domain)

View file

@@ -53,9 +53,6 @@ from .const import (
)
from ..url import url_fix_wayback_query
# helper alias
unicode_safe = strformat.unicode_safe
# schemes that are invalid with an empty hostname
scheme_requires_host = ("ftp", "http", "telnet")
@@ -389,7 +386,7 @@ class UrlBase:
self.build_url()
self.check_url_warnings()
except tuple(ExcSyntaxList) as msg:
self.set_result(unicode_safe(msg), valid=False)
self.set_result(msg, valid=False)
else:
self.set_cache_url()
@@ -440,7 +437,7 @@ class UrlBase:
urlparts[2] = url_fix_wayback_query(urlparts[2])
self.url = urlutil.urlunsplit(urlparts)
# split into (modifiable) list
self.urlparts = strformat.url_unicode_split(self.url)
self.urlparts = urllib.parse.urlsplit(self.url)
self.build_url_parts()
# and unsplit again
self.url = urlutil.urlunsplit(self.urlparts)
@@ -529,7 +526,7 @@ class UrlBase:
'host': self.host,
'msg': value,
}
self.set_result(unicode_safe(value), valid=False)
self.set_result(value, valid=False)
def check_content(self):
"""Check content of URL.
@@ -776,7 +773,7 @@ class UrlBase:
"""
Return serialized url check data as unicode string.
"""
return unicode_safe(sep).join(
return sep.join(
[
"%s link" % self.scheme,
"base_url=%r" % self.base_url,

View file

@@ -21,7 +21,6 @@ from io import StringIO
import os
from . import _Logger
from .. import strformat
Columns = (
"urlname",
@@ -134,7 +133,7 @@ class CSVLogger(_Logger):
row.append(url_data.level)
if self.has_part("modified"):
row.append(self.format_modified(url_data.modified))
self.writerow(map(strformat.unicode_safe, row))
self.writerow(row)
self.flush()
def writerow(self, row):

View file

@@ -22,7 +22,6 @@ import sys
import socket
from ..lock import get_lock
from ..decorators import synchronized
from ..strformat import unicode_safe
from .. import log, LOG_PLUGIN
@@ -112,9 +111,9 @@ def get_location(host):
return None
value = ""
if record and record.get("city"):
value += unicode_safe(record["city"])
value += record["city"]
if record and record.get("country_name"):
if value:
value += ", "
value += unicode_safe(record["country_name"])
value += record["country_name"]
return value

View file

@@ -30,7 +30,7 @@ except ImportError:
has_pdflib = False
else:
has_pdflib = True
from .. import log, LOG_PLUGIN, strformat
from .. import log, LOG_PLUGIN
def search_url(obj, url_data, pageno, seen_objs):
@@ -44,14 +44,7 @@ def search_url(obj, url_data, pageno, seen_objs):
if isinstance(obj, dict):
for key, value in obj.items():
if key == 'URI':
if isinstance(value, str):
url = value
else:
# URIs should be 7bit ASCII encoded, but be safe and encode
# to unicode
# XXX this does not use an optional specified base URL
url = strformat.unicode_safe(value)
url_data.add_url(url, page=pageno)
url_data.add_url(value, page=pageno)
else:
search_url(value, url_data, pageno, seen_objs)
elif isinstance(obj, list):

View file

@@ -26,10 +26,8 @@ import re
import textwrap
import os
import time
import urllib.parse
import locale
import pydoc
from . import i18n
# some handy time constants
SECONDS_PER_MINUTE = 60
@@ -37,25 +35,6 @@ SECONDS_PER_HOUR = 60 * SECONDS_PER_MINUTE
SECONDS_PER_DAY = 24 * SECONDS_PER_HOUR
def unicode_safe(s, encoding=i18n.default_encoding, errors='replace'):
"""Get unicode string without raising encoding errors. Unknown
characters of the given encoding will be ignored.
@param s: the string to be decoded
@type s: any object except None
@return: if s is already unicode, return s unchanged; else return
decoded unicode string of str(s)
@rtype: unicode
"""
assert s is not None, "argument to unicode_safe was None"
if isinstance(s, str):
# s is already unicode, nothing to do
return s
elif isinstance(s, bytes):
return s.decode(encoding, errors)
return str(s)
def ascii_safe(s):
"""Get ASCII string without raising encoding errors. Unknown
characters of the given encoding will be ignored.
@@ -70,11 +49,6 @@ def ascii_safe(s):
return s
def url_unicode_split(url):
"""Like urllib.parse.urlsplit(), but always returning unicode parts."""
return [unicode_safe(s) for s in urllib.parse.urlsplit(url)]
def unquote(s, matching=False):
"""Remove leading and ending single and double quotes.
The quotes need to match if matching is True. Only one quote from each

View file

@@ -126,10 +126,6 @@ class TestStrFormat(unittest.TestCase):
duration(60 * 60 * 24 * 365 + 60 * 60 * 24 + 2), "1 year, 1 day"
)
def test_unicode_safe(self):
unicode_safe = linkcheck.strformat.unicode_safe
self.assertEqual(unicode_safe("a"), "a")
def test_ascii_safe(self):
ascii_safe = linkcheck.strformat.ascii_safe
self.assertEqual(ascii_safe("a"), "a")