Replace deprecated urllib.parse.split functions

This commit is contained in:
Chris Mayo 2020-08-22 16:28:53 +01:00
parent f99f15c349
commit 8779c39735
3 changed files with 37 additions and 21 deletions

View file

@ -17,8 +17,6 @@
Handle http links.
"""
import urllib.parse
import requests
# The validity of SSL certs is ignored to be able
@ -273,8 +271,7 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
# Reset extern and recalculate
self.extern = None
self.set_extern(newurl)
self.urlparts = list(urllib.parse.urlsplit(newurl))
self.build_url_parts()
self.urlparts = self.build_url_parts(newurl)
self.url_connection = response
self.headers = response.headers
self.url = urlutil.urlunsplit(self.urlparts)

View file

@ -436,27 +436,33 @@ class UrlBase:
# restore second / in http[s]:// in wayback path
urlparts[2] = url_fix_wayback_query(urlparts[2])
self.url = urlutil.urlunsplit(urlparts)
# split into (modifiable) list
self.urlparts = list(urllib.parse.urlsplit(self.url))
self.build_url_parts()
self.urlparts = self.build_url_parts(self.url)
# and unsplit again
self.url = urlutil.urlunsplit(self.urlparts)
def build_url_parts(self):
"""Set userinfo, host, port and anchor from self.urlparts.
def build_url_parts(self, url):
"""Set userinfo, host, port and anchor from url and return urlparts.
Also checks for obfuscated IP addresses.
"""
split = urllib.parse.urlsplit(url)
urlparts = list(split)
# check userinfo@host:port syntax
self.userinfo, host = urllib.parse.splituser(self.urlparts[1])
port = urlutil.default_ports.get(self.scheme, 0)
host, port = urlutil.splitport(host, port=port)
self.userinfo, host = urlutil.split_netloc(split.netloc)
try:
port = split.port
except ValueError:
raise LinkCheckerError(
_("URL host %(host)r has invalid port") % {"host": host}
)
if port is None:
port = urlutil.default_ports.get(self.scheme, 0)
if port is None:
raise LinkCheckerError(
_("URL host %(host)r has invalid port") % {"host": host}
)
self.port = port
# set host lowercase
self.host = host.lower()
# urllib.parse.SplitResult.hostname is lowercase
self.host = split.hostname
if self.scheme in scheme_requires_host:
if not self.host:
raise LinkCheckerError(_("URL has empty hostname"))
@ -466,13 +472,14 @@ class UrlBase:
else:
host = "%s:%d" % (self.host, self.port)
if self.userinfo:
self.urlparts[1] = "%s@%s" % (self.userinfo, host)
urlparts[1] = "%s@%s" % (self.userinfo, host)
else:
self.urlparts[1] = host
urlparts[1] = host
# save anchor for later checking
self.anchor = self.urlparts[4]
self.anchor = split.fragment
if self.anchor is not None:
assert isinstance(self.anchor, str), repr(self.anchor)
return urlparts
def check_obfuscated_ip(self):
"""Warn if host of this URL is obfuscated IP address."""
@ -745,7 +752,8 @@ class UrlBase:
"""
if self.userinfo:
# URL itself has authentication info
return urllib.parse.splitpasswd(self.userinfo)
split = urllib.parse.urlsplit(self.url)
return (split.username, split.password)
return self.aggregate.config.get_user_password(self.url)
def add_url(self, url, line=0, column=0, page=0, name="", base=None):

View file

@ -193,16 +193,24 @@ def idna_encode(host):
return host, False
def split_netloc(netloc):
    """Split a netloc string into its userinfo and host[:port] parts.

    The split is made at the last '@' found in netloc, following the
    behaviour of urllib.parse._splituser(), from which this originated.

    Returns a (userinfo, hostport) tuple. userinfo is None when netloc
    holds no '@' separator at all, and hostport is then netloc unchanged.
    """
    at_index = netloc.rfind('@')
    if at_index < 0:
        # no separator present: everything is host[:port]
        return None, netloc
    return netloc[:at_index], netloc[at_index + 1:]
def url_fix_host(urlparts, encoding):
"""Unquote and fix hostname. Returns is_idn."""
if not urlparts[1]:
urlparts[2] = urllib.parse.unquote(urlparts[2], encoding=encoding)
return False
userpass, netloc = urllib.parse.splituser(urlparts[1])
userpass, hostport = split_netloc(urlparts[1])
if userpass:
userpass = urllib.parse.unquote(userpass, encoding=encoding)
netloc, is_idn = idna_encode(
urllib.parse.unquote(netloc, encoding=encoding).lower()
urllib.parse.unquote(hostport, encoding=encoding).lower()
)
# a leading backslash in path causes urlsplit() to add the
# path components up to the first slash to host
@ -419,7 +427,10 @@ def url_quote(url, encoding):
def document_quote(document):
"""Quote given document."""
doc, query = urllib.parse.splitquery(document)
doc, delim, query = document.rpartition('?')
if not delim:
doc = document
query = None
doc = urllib.parse.quote(doc, safe='/=,')
if query:
return "%s?%s" % (doc, query)