Merge pull request #382 from cjmayo/tidyten5

Make urllib imports and html.escape Python 3 only
Chris Mayo 2020-05-15 19:15:47 +01:00 committed by GitHub
commit f3eb787014
16 changed files with 103 additions and 181 deletions
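
Every file below follows the same recipe: delete the Python 2/3 dual-import guard, import the Python 3 module directly, and spell out the fully qualified name at each call site. A minimal before/after sketch of the pattern (a generic illustration, not a verbatim excerpt from any one file in this PR):

    # Before: guarded import so the same module ran on Python 2 and 3
    try:  # Python 3
        from urllib import parse as urlparse
    except ImportError:  # Python 2
        import urllib as urlparse

    # After: Python 3 only, with fully qualified call sites
    import html
    import urllib.parse

    quoted = urllib.parse.quote("a b")   # 'a%20b'
    escaped = html.escape("<a>")         # '&lt;a&gt;'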


@@ -18,11 +18,9 @@ Main functions for link checking.
 """
 import os
-from html import escape as html_escape
-try: # Python 3
-    from urllib import parse as urlparse
-except ImportError:
-    import urllib as urlparse
+import html
+import urllib.parse

 from .. import strformat, url as urlutil, log, LOG_CHECK

 MAX_FILESIZE = 1024*1024*10 # 10MB
@@ -165,9 +163,9 @@ def get_index_html (urls):
     """
     lines = ["<html>", "<body>"]
     for entry in urls:
-        name = html_escape(entry)
+        name = html.escape(entry)
         try:
-            url = html_escape(urlparse.quote(entry))
+            url = html.escape(urllib.parse.quote(entry))
         except KeyError:
             # Some unicode entries raise KeyError.
             url = name


@@ -19,20 +19,8 @@ Handle local file: links.
 import re
 import os
-try:
-    import urlparse
-except ImportError:
-    # Python 3
-    from urllib import parse as urlparse
-try: # Python 3
-    from urllib import request as urlrequest
-except ImportError:
-    import urllib as urlrequest
-try:
-    from urllib2 import urlopen
-except ImportError:
-    # Python 3
-    from urllib.request import urlopen
+import urllib.parse
+import urllib.request
 from builtins import str as str_text
 from datetime import datetime
@@ -82,7 +70,7 @@ def get_os_filename (path):
     """Return filesystem path for given URL path."""
     if os.name == 'nt':
         path = prepare_urlpath_for_nt(path)
-    res = urlrequest.url2pathname(fileutil.path_safe(path))
+    res = urllib.request.url2pathname(fileutil.path_safe(path))
     if os.name == 'nt' and res.endswith(':') and len(res) == 2:
         # Work around http://bugs.python.org/issue11474
         res += os.sep
@@ -153,7 +141,7 @@ class FileUrl (urlbase.UrlBase):
         from .urlbase import url_norm
         # norm base url - can raise UnicodeError from url.idna_encode()
         base_url, is_idn = url_norm(self.base_url, self.encoding)
-        urlparts = list(urlparse.urlsplit(base_url))
+        urlparts = list(urllib.parse.urlsplit(base_url))
         # ignore query part for filesystem urls
         urlparts[3] = ''
         self.base_url = urlutil.urlunsplit(urlparts)
@@ -189,7 +177,7 @@ class FileUrl (urlbase.UrlBase):
             self.set_result(_("directory"))
         else:
             url = fileutil.path_safe(self.url)
-            self.url_connection = urlopen(url)
+            self.url_connection = urllib.request.urlopen(url)
         self.check_case_sensitivity()

     def check_case_sensitivity (self):


@@ -18,11 +18,7 @@ Handle for mailto: links.
 """
 import re
-try:
-    import urlparse
-except ImportError:
-    # Python 3
-    from urllib import parse as urlparse
+import urllib.parse
 from email._parseaddr import AddressList

 from . import urlbase
@@ -94,7 +90,7 @@ class MailtoUrl (urlbase.UrlBase):
         Stores parsed addresses in the self.addresses set.
         """
         # cut off leading mailto: and unquote
-        url = urlparse.unquote(self.base_url[7:], self.encoding)
+        url = urllib.parse.unquote(self.base_url[7:], self.encoding)
         # search for cc, bcc, to and store in headers
         mode = 0 # 0=default, 1=quote, 2=esc
         quote = None
@@ -118,11 +114,11 @@ class MailtoUrl (urlbase.UrlBase):
         if i < (len(url) - 1):
             self.addresses.update(getaddresses(url[:i]))
             try:
-                headers = urlparse.parse_qs(url[(i+1):], strict_parsing=True)
+                headers = urllib.parse.parse_qs(url[(i+1):], strict_parsing=True)
                 for key, vals in headers.items():
                     if key.lower() in EMAIL_CGI_ADDRESS:
                         # Only the first header value is added
-                        self.addresses.update(getaddresses(urlparse.unquote(vals[0], self.encoding)))
+                        self.addresses.update(getaddresses(urllib.parse.unquote(vals[0], self.encoding)))
                     if key.lower() == EMAIL_CGI_SUBJECT:
                         self.subject = vals[0]
             except ValueError as err:


@@ -16,14 +16,8 @@
 """
 Mixin class for URLs that can be fetched over a proxy.
 """
-try: # Python 3
-    from urllib import parse
-    from urllib import request
-    from urllib.parse import splitport
-except ImportError:
-    from urllib import splitport
-    import urllib as request
-    import urlparse as parse
+import urllib.parse
+import urllib.request
 import os

 from .. import LinkCheckerError, log, LOG_CHECK, url as urlutil, httputil
@@ -41,7 +35,7 @@ class ProxySupport:
         self.proxyauth = None
         if not self.proxy:
             return
-        proxyurl = parse.urlparse(self.proxy)
+        proxyurl = urllib.parse.urlparse(self.proxy)
         self.proxytype = proxyurl.scheme
         if self.proxytype not in ('http', 'https'):
             # Note that invalid proxies might raise TypeError in urllib2,
@@ -68,11 +62,11 @@ class ProxySupport:

     def ignore_proxy_host (self):
         """Check if self.host is in the $no_proxy ignore list."""
-        if request.proxy_bypass(self.host):
+        if urllib.request.proxy_bypass(self.host):
             return True
         no_proxy = os.environ.get("no_proxy")
         if no_proxy:
-            entries = [parse_host_port(x) for x in no_proxy.split(",")]
+            entries = [urlutil.splitport(x.strip()) for x in no_proxy.split(",")]
             for host, port in entries:
                 if host.lower() == self.host and port == self.port:
                     return True
@@ -93,12 +87,3 @@ class ProxySupport:
         host = self.host
         port = self.port
         return (scheme, host, port)
-
-
-def parse_host_port (host_port):
-    """Parse a host:port string into separate components."""
-    host, port = splitport(host_port.strip())
-    if port is not None:
-        if urlutil.is_numeric_port(port):
-            port = int(port)
-    return host, port
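
Besides the import cleanup, this hunk deletes the module-local parse_host_port helper: no_proxy entries are now stripped inline and split with the project's own urlutil.splitport, presumably folding the numeric-port conversion into that function. A rough stdlib-only sketch of the same parsing (the function name and structure here are illustrative, not from the PR):

    import os

    def no_proxy_entries():
        """Split $no_proxy into (host, port) pairs, port as int when numeric."""
        entries = []
        for item in os.environ.get("no_proxy", "").split(","):
            host, _, port = item.strip().partition(":")
            entries.append((host.lower(), int(port) if port.isdigit() else None))
        return entries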


@@ -18,21 +18,8 @@ Base URL handler.
 """
 import sys
 import os
-try:
-    import urlparse
-except ImportError:
-    # Python 3
-    from urllib import parse as urlparse
-try: # Python 3
-    from urllib import parse as urllib_parse
-except ImportError:
-    import urllib as urllib_parse
-try:
-    from urllib2 import urlopen
-except ImportError:
-    # Python 3
-    from urllib.request import urlopen
+import urllib
+import urllib.parse
+from urllib.request import urlopen
 import time
 import errno
 import socket
@@ -66,7 +53,7 @@ def urljoin (parent, url):
     """
     if urlutil.url_is_absolute(url):
         return url
-    return urlparse.urljoin(parent, url)
+    return urllib.parse.urljoin(parent, url)


 def url_norm (url, encoding):
@@ -372,14 +359,14 @@ class UrlBase:
             self.url = urljoin(self.base_ref, base_url)
         elif self.parent_url:
             # strip the parent url query and anchor
-            urlparts = list(urlparse.urlsplit(self.parent_url))
+            urlparts = list(urllib.parse.urlsplit(self.parent_url))
             urlparts[4] = ""
             parent_url = urlutil.urlunsplit(urlparts)
             self.url = urljoin(parent_url, base_url)
         else:
             self.url = base_url
         # urljoin can unnorm the url path, so norm it again
-        urlparts = list(urlparse.urlsplit(self.url))
+        urlparts = list(urllib.parse.urlsplit(self.url))
         if urlparts[2]:
             urlparts[2] = urlutil.collapse_segments(urlparts[2])
         if not urlparts[0].startswith("feed"):
@@ -396,7 +383,7 @@ class UrlBase:
         Also checks for obfuscated IP addresses.
         """
         # check userinfo@host:port syntax
-        self.userinfo, host = urllib_parse.splituser(self.urlparts[1])
+        self.userinfo, host = urllib.parse.splituser(self.urlparts[1])
         port = urlutil.default_ports.get(self.scheme, 0)
         host, port = urlutil.splitport(host, port=port)
         if port is None:
@@ -676,7 +663,7 @@ class UrlBase:
         """
         if self.userinfo:
            # URL itself has authentication info
-            return urllib_parse.splitpasswd(self.userinfo)
+            return urllib.parse.splitpasswd(self.userinfo)
         return self.aggregate.config.get_user_password(self.url)

     def add_url (self, url, line=0, column=0, page=0, name="", base=None):
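
Note that splituser and splitpasswd survive this port only because urllib.parse still ships them as undocumented legacy helpers; Python 3.8 deprecated that whole split* family. Should they disappear, the documented SplitResult attributes cover the same ground; a hedged sketch (the helper name is illustrative, not from the PR):

    from urllib.parse import urlsplit

    def credentials(url):
        """Return (username, password) from a URL, None for missing parts."""
        parts = urlsplit(url)
        return parts.username, parts.password

    # credentials("http://alice:secret@example.com/") -> ('alice', 'secret')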


@@ -20,12 +20,8 @@ Store metadata and options.
 from functools import lru_cache
 import os
 import re
-try: # Python 3
-    from urllib import parse
-    from urllib import request
-except ImportError: # Python 2
-    import urlparse as parse
-    import urllib as request
+import urllib.parse
+import urllib.request
 import shutil
 import socket
 import _LinkChecker_configdata as configdata
@@ -172,7 +168,7 @@ class Configuration (dict):
         self["maxrequestspersecond"] = 10
         self["maxhttpredirects"] = 10
         self["nntpserver"] = os.environ.get("NNTP_SERVER", None)
-        self["proxy"] = request.getproxies()
+        self["proxy"] = urllib.request.getproxies()
         self["sslverify"] = True
         self["threads"] = 10
         self["timeout"] = 60
@@ -317,7 +313,7 @@ class Configuration (dict):
         if not url.lower().startswith(("http:", "https:")):
             log.warn(LOG_CHECK, _("login URL is not a HTTP URL."))
             disable = True
-        urlparts = parse.urlsplit(url)
+        urlparts = urllib.parse.urlsplit(url)
         if not urlparts[0] or not urlparts[1] or not urlparts[2]:
             log.warn(LOG_CHECK, _("login URL is incomplete."))
             disable = True


@@ -24,10 +24,7 @@ except ImportError:
 import requests
 import time
-try: # Python 3
-    from urllib import parse
-except ImportError:
-    import urlparse as parse
+import urllib.parse
 import random

 from .. import log, LOG_CHECK, strformat, LinkCheckerError
 from ..decorators import synchronized
@@ -103,7 +100,7 @@ class Aggregate:
             form.data[cgipassword] = password
         for key, value in self.config["loginextrafields"].items():
             form.data[key] = value
-        formurl = parse.urljoin(url, form.url)
+        formurl = urllib.parse.urljoin(url, form.url)
         log.debug(LOG_CHECK, "Posting login data to %s", formurl)
         response = session.post(formurl, data=form.data)
         response.raise_for_status()


@@ -17,17 +17,14 @@
 Functions used by the WSGI script.
 """
-from html import escape as html_escape
+import html
 import os
 import threading
 import locale
 import re
 import time
-try:
-    import urlparse
-except ImportError:
-    # Python 3
-    from urllib import parse as urlparse
+import urllib.parse
 from . import configuration, strformat, checker, director, get_link_pat, \
     init_i18n, url as urlutil
 from .decorators import synchronized
@@ -54,7 +51,7 @@ def application(environ, start_response):
         request_body = environ['wsgi.input'].read(request_body_size)
     else:
         request_body = environ['wsgi.input'].read()
-    form = urlparse.parse_qs(request_body.decode(HTML_ENCODING))
+    form = urllib.parse.parse_qs(request_body.decode(HTML_ENCODING))

     status = '200 OK'
     start_response(status, get_response_headers())
@@ -188,7 +185,7 @@ def get_configuration(form, out):

 def get_host_name (form):
     """Return host name of given URL."""
-    return urlparse.urlparse(formvalue(form, "url"))[1]
+    return urllib.parse.urlparse(formvalue(form, "url"))[1]


 def checkform (form, env):
@@ -264,4 +261,4 @@ contains only these characters: <code>A-Za-z0-9./_~-</code><br/><br/>
 Errors are logged.
 </blockquote>
 </body>
-</html>""") % html_escape(why)
+</html>""") % html.escape(why)
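
For reference, the Python 3 urllib.parse.parse_qs used here maps each decoded form field to a list of values:

    from urllib.parse import parse_qs

    parse_qs("url=http%3A%2F%2Fexample.com%2F&level=0")
    # {'url': ['http://example.com/'], 'level': ['0']}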


@@ -17,7 +17,7 @@
 A HTML logger.
 """
-from html import escape as html_escape
+import html
 import os
 import time
@@ -174,12 +174,12 @@ class HtmlLogger (_Logger):
         self.writeln("<tr>")
         self.writeln('<td class="url">%s</td>' % self.part("url"))
         self.write('<td class="url">')
-        self.write("`%s'" % html_escape(url_data.base_url))
+        self.write("`%s'" % html.escape(url_data.base_url))
         self.writeln("</td></tr>")

     def write_name (self, url_data):
         """Write url_data.name."""
-        args = (self.part("name"), html_escape(url_data.name))
+        args = (self.part("name"), html.escape(url_data.name))
         self.writeln("<tr><td>%s</td><td>`%s'</td></tr>" % args)

     def write_parent (self, url_data):
@@ -187,7 +187,7 @@ class HtmlLogger (_Logger):
         self.write("<tr><td>"+self.part("parenturl")+
                    '</td><td><a target="top" href="'+
                    url_data.parent_url+'">'+
-                   html_escape(url_data.parent_url)+"</a>")
+                   html.escape(url_data.parent_url)+"</a>")
         if url_data.line is not None:
             self.write(_(", line %d") % url_data.line)
         if url_data.column is not None:
@@ -206,13 +206,13 @@ class HtmlLogger (_Logger):
     def write_base (self, url_data):
         """Write url_data.base_ref."""
         self.writeln("<tr><td>"+self.part("base")+"</td><td>"+
-                     html_escape(url_data.base_ref)+"</td></tr>")
+                     html.escape(url_data.base_ref)+"</td></tr>")

     def write_real (self, url_data):
         """Write url_data.url."""
         self.writeln("<tr><td>"+self.part("realurl")+"</td><td>"+
                      '<a target="top" href="'+url_data.url+
-                     '">'+html_escape(url_data.url)+"</a></td></tr>")
+                     '">'+html.escape(url_data.url)+"</a></td></tr>")

     def write_dltime (self, url_data):
         """Write url_data.dltime."""
@@ -234,20 +234,20 @@ class HtmlLogger (_Logger):
     def write_info (self, url_data):
         """Write url_data.info."""
         sep = "<br/>"+os.linesep
-        text = sep.join(html_escape(x) for x in url_data.info)
+        text = sep.join(html.escape(x) for x in url_data.info)
         self.writeln('<tr><td valign="top">' + self.part("info")+
                      "</td><td>"+text+"</td></tr>")

     def write_modified(self, url_data):
         """Write url_data.modified."""
-        text = html_escape(self.format_modified(url_data.modified))
+        text = html.escape(self.format_modified(url_data.modified))
         self.writeln('<tr><td valign="top">' + self.part("modified") +
                      "</td><td>"+text+"</td></tr>")

     def write_warning (self, url_data):
         """Write url_data.warnings."""
         sep = "<br/>"+os.linesep
-        text = sep.join(html_escape(x[1]) for x in url_data.warnings)
+        text = sep.join(html.escape(x[1]) for x in url_data.warnings)
         self.writeln('<tr><td class="warning" '+
                      'valign="top">' + self.part("warning") +
                      '</td><td class="warning">' + text + "</td></tr>")
@@ -258,14 +258,14 @@ class HtmlLogger (_Logger):
             self.write('<tr><td class="valid">')
             self.write(self.part("result"))
             self.write('</td><td class="valid">')
-            self.write(html_escape(_("Valid")))
+            self.write(html.escape(_("Valid")))
         else:
             self.write('<tr><td class="error">')
             self.write(self.part("result"))
             self.write('</td><td class="error">')
-            self.write(html_escape(_("Error")))
+            self.write(html.escape(_("Error")))
         if url_data.result:
-            self.write(": "+html_escape(url_data.result))
+            self.write(": "+html.escape(url_data.result))
         self.writeln("</td></tr>")

     def write_stats (self):


@@ -16,7 +16,7 @@
 """
 Check HTML anchors
 """
-from urllib import parse
+import urllib.parse
 from . import _ContentPlugin
 from .. import log, LOG_PLUGIN
@@ -48,7 +48,7 @@ class AnchorCheck(_ContentPlugin):
         A warning is logged and True is returned if the anchor is not found.
         """
         log.debug(LOG_PLUGIN, "checking anchor %r in %s", url_data.anchor, self.anchors)
-        if any(x for x in self.anchors if parse.quote(x[0]) == url_data.anchor):
+        if any(x for x in self.anchors if urllib.parse.quote(x[0]) == url_data.anchor):
             return
         if self.anchors:
             anchornames = sorted(set("`%s'" % x[0] for x in self.anchors))


@@ -19,15 +19,8 @@ Robots.txt parser.
 The robots.txt Exclusion Protocol is implemented as specified in
 http://www.robotstxt.org/wc/norobots-rfc.html
 """
-try: # Python 3
-    from urllib import parse
-except ImportError: # Python 2
-    import urllib as parse
-try: # Python 3
-    from urllib.parse import urlparse
-except ImportError: # Python 2
-    from urlparse import urlparse
 import time
+import urllib.parse

 import requests
@@ -84,7 +77,7 @@ class RobotFileParser:
     def set_url (self, url):
         """Set the URL referring to a robots.txt file."""
         self.url = url
-        self.host, self.path = urlparse(url)[1:3]
+        self.host, self.path = urllib.parse.urlparse(url)[1:3]

     def read (self):
         """Read the robots.txt URL and feeds it to the parser."""
@@ -168,7 +161,7 @@ class RobotFileParser:
             line = line.split(':', 1)
             if len(line) == 2:
                 line[0] = line[0].strip().lower()
-                line[1] = parse.unquote(line[1].strip(), self.encoding)
+                line[1] = urllib.parse.unquote(line[1].strip(), self.encoding)
                 if line[0] == "user-agent":
                     if state == 2:
                         log.debug(LOG_CHECK, "%r line %d: missing blank line before user-agent directive", self.url, linenumber)
@@ -236,7 +229,7 @@ class RobotFileParser:
             return True
         # search for given user agent matches
         # the first match counts
-        url = parse.quote(urlparse(parse.unquote(url))[2]) or "/"
+        url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/"
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.allowance(url)
@@ -282,7 +275,7 @@ class RuleLine:
             # an empty value means allow all
             allowance = True
             path = '/'
-        self.path = parse.quote(path)
+        self.path = urllib.parse.quote(path)
         self.allowance = allowance

     def applies_to (self, path):


@@ -32,11 +32,7 @@ import codecs
 import os
 import math
 import time
-try:
-    import urlparse
-except ImportError:
-    # Python 3
-    from urllib import parse as urlparse
+import urllib.parse
 import locale
 import pydoc
 from . import i18n
@@ -95,8 +91,8 @@ def is_encoding (text):

 def url_unicode_split (url):
-    """Like urlparse.urlsplit(), but always returning unicode parts."""
-    return [unicode_safe(s) for s in urlparse.urlsplit(url)]
+    """Like urllib.parse.urlsplit(), but always returning unicode parts."""
+    return [unicode_safe(s) for s in urllib.parse.urlsplit(url)]

 def unquote (s, matching=False):


@@ -19,12 +19,7 @@ Functions for parsing and matching URL strings.
 import os
 import re
-try: # Python 3
-    from urllib import parse
-    from urllib import parse as urlparse
-except ImportError: # Python 2
-    import urllib as parse
-    import urlparse
+import urllib.parse

 import requests
 from builtins import str as str_text
@@ -32,8 +27,8 @@ from builtins import str as str_text
 from . import log, LOG_CHECK

 for scheme in ('ldap', 'irc'):
-    if scheme not in urlparse.uses_netloc:
-        urlparse.uses_netloc.append(scheme)
+    if scheme not in urllib.parse.uses_netloc:
+        urllib.parse.uses_netloc.append(scheme)

 # The character set to encode non-ASCII characters in a URL. See also
 # http://tools.ietf.org/html/rfc2396#section-2.1
@@ -164,9 +159,9 @@ def parse_qsl (qs, encoding, keep_blank_values=0, strict_parsing=0):
         else:
             continue
         if nv[1] or keep_blank_values:
-            name = parse.unquote(nv[0].replace('+', ' '), encoding=encoding)
+            name = urllib.parse.unquote(nv[0].replace('+', ' '), encoding=encoding)
             if nv[1]:
-                value = parse.unquote(nv[1].replace('+', ' '), encoding=encoding)
+                value = urllib.parse.unquote(nv[1].replace('+', ' '), encoding=encoding)
             else:
                 value = nv[1]
             r.append((name, value, sep))
@@ -191,12 +186,12 @@ def idna_encode (host):
 def url_fix_host (urlparts, encoding):
     """Unquote and fix hostname. Returns is_idn."""
     if not urlparts[1]:
-        urlparts[2] = parse.unquote(urlparts[2], encoding=encoding)
+        urlparts[2] = urllib.parse.unquote(urlparts[2], encoding=encoding)
         return False
-    userpass, netloc = parse.splituser(urlparts[1])
+    userpass, netloc = urllib.parse.splituser(urlparts[1])
     if userpass:
-        userpass = parse.unquote(userpass, encoding=encoding)
-    netloc, is_idn = idna_encode(parse.unquote(netloc, encoding=encoding).lower())
+        userpass = urllib.parse.unquote(userpass, encoding=encoding)
+    netloc, is_idn = idna_encode(urllib.parse.unquote(netloc, encoding=encoding).lower())
     # a leading backslash in path causes urlsplit() to add the
     # path components up to the first slash to host
     # try to find this case...
@@ -207,7 +202,7 @@ def url_fix_host (urlparts, encoding):
         if not urlparts[2] or urlparts[2] == '/':
             urlparts[2] = comps
         else:
-            urlparts[2] = "%s%s" % (comps, parse.unquote(urlparts[2], encoding=encoding))
+            urlparts[2] = "%s%s" % (comps, urllib.parse.unquote(urlparts[2], encoding=encoding))
         netloc = netloc[:i]
     else:
         # a leading ? in path causes urlsplit() to add the query to the
@@ -216,7 +211,7 @@ def url_fix_host (urlparts, encoding):
         if i != -1:
             netloc, urlparts[3] = netloc.split('?', 1)
     # path
-    urlparts[2] = parse.unquote(urlparts[2], encoding=encoding)
+    urlparts[2] = urllib.parse.unquote(urlparts[2], encoding=encoding)
     if userpass:
         # append AT for easy concatenation
         userpass += "@"
@@ -266,9 +261,9 @@ def url_parse_query (query, encoding):
             append = '?'+url_parse_query(rest, encoding=encoding)+append
     l = []
     for k, v, sep in parse_qsl(query, keep_blank_values=True, encoding=encoding):
-        k = parse.quote(k, safe='/-:,;')
+        k = urllib.parse.quote(k, safe='/-:,;')
         if v:
-            v = parse.quote(v, safe='/-:,;')
+            v = urllib.parse.quote(v, safe='/-:,;')
             l.append("%s=%s%s" % (k, v, sep))
         elif v is None:
             l.append("%s%s" % (k, sep))
@@ -279,12 +274,12 @@ def url_parse_query (query, encoding):

 def urlunsplit (urlparts):
-    """Same as urlparse.urlunsplit but with extra UNC path handling
+    """Same as urllib.parse.urlunsplit but with extra UNC path handling
     for Windows OS."""
-    res = urlparse.urlunsplit(urlparts)
+    res = urllib.parse.urlunsplit(urlparts)
     if os.name == 'nt' and urlparts[0] == 'file' and '|' not in urlparts[2]:
         # UNC paths must have 4 slashes: 'file:////server/path'
-        # Depending on the path in urlparts[2], urlparse.urlunsplit()
+        # Depending on the path in urlparts[2], urllib.parse.urlunsplit()
         # left only two or three slashes. This is fixed below
         repl = 'file://' if urlparts[2].startswith('//') else 'file:/'
         res = res.replace('file:', repl)
@@ -298,9 +293,9 @@ def url_norm (url, encoding):
     @return: (normed url, idna flag)
     @rtype: tuple of length two
     """
-    urlparts = list(urlparse.urlsplit(url))
+    urlparts = list(urllib.parse.urlsplit(url))
     # scheme
-    urlparts[0] = parse.unquote(urlparts[0], encoding=encoding).lower()
+    urlparts[0] = urllib.parse.unquote(urlparts[0], encoding=encoding).lower()
     # mailto: urlsplit is broken
     if urlparts[0] == 'mailto':
         url_fix_mailto_urlsplit(urlparts)
@@ -308,7 +303,7 @@ def url_norm (url, encoding):
     is_idn = url_fix_host(urlparts, encoding)
     # query
     urlparts[3] = url_parse_query(urlparts[3], encoding=encoding)
-    if urlparts[0] in urlparse.uses_relative:
+    if urlparts[0] in urllib.parse.uses_relative:
         # URL has a hierarchical path we should norm
         if not urlparts[2]:
             # Empty path is allowed if both query and fragment are also empty.
@@ -320,14 +315,14 @@ def url_norm (url, encoding):
         # fix redundant path parts
         urlparts[2] = collapse_segments(urlparts[2])
     # anchor
-    urlparts[4] = parse.unquote(urlparts[4], encoding=encoding)
+    urlparts[4] = urllib.parse.unquote(urlparts[4], encoding=encoding)
     # quote parts again
-    urlparts[0] = parse.quote(urlparts[0]) # scheme
-    urlparts[1] = parse.quote(urlparts[1], safe='@:') # host
-    urlparts[2] = parse.quote(urlparts[2], safe=_nopathquote_chars) # path
+    urlparts[0] = urllib.parse.quote(urlparts[0]) # scheme
+    urlparts[1] = urllib.parse.quote(urlparts[1], safe='@:') # host
+    urlparts[2] = urllib.parse.quote(urlparts[2], safe=_nopathquote_chars) # path
     if not urlparts[0].startswith("feed"):
         urlparts[2] = url_fix_wayback_query(urlparts[2]) # unencode colon in http[s]:// in wayback path
-    urlparts[4] = parse.quote(urlparts[4], safe="!$&'()*+,-./;=?@_~") # anchor
+    urlparts[4] = urllib.parse.quote(urlparts[4], safe="!$&'()*+,-./;=?@_~") # anchor
     res = urlunsplit(urlparts)
     if url.endswith('#') and not urlparts[4]:
         # re-append trailing empty fragment
@@ -380,28 +375,28 @@ def url_quote (url, encoding):
     """Quote given URL."""
     if not url_is_absolute(url):
         return document_quote(url)
-    urlparts = list(urlparse.urlsplit(url))
-    urlparts[0] = parse.quote(urlparts[0]) # scheme
-    urlparts[1] = parse.quote(urlparts[1], safe=':') # host
-    urlparts[2] = parse.quote(urlparts[2], safe='/=,') # path
-    urlparts[3] = parse.quote(urlparts[3], safe='&=,') # query
+    urlparts = list(urllib.parse.urlsplit(url))
+    urlparts[0] = urllib.parse.quote(urlparts[0]) # scheme
+    urlparts[1] = urllib.parse.quote(urlparts[1], safe=':') # host
+    urlparts[2] = urllib.parse.quote(urlparts[2], safe='/=,') # path
+    urlparts[3] = urllib.parse.quote(urlparts[3], safe='&=,') # query
     l = []
     for k, v, sep in parse_qsl(urlparts[3], encoding=encoding, keep_blank_values=True): # query
-        k = parse.quote(k, safe='/-:,;')
+        k = urllib.parse.quote(k, safe='/-:,;')
         if v:
-            v = parse.quote(v, safe='/-:,;')
+            v = urllib.parse.quote(v, safe='/-:,;')
             l.append("%s=%s%s" % (k, v, sep))
         else:
             l.append("%s%s" % (k, sep))
     urlparts[3] = ''.join(l)
-    urlparts[4] = parse.quote(urlparts[4]) # anchor
+    urlparts[4] = urllib.parse.quote(urlparts[4]) # anchor
     return urlunsplit(urlparts)


 def document_quote (document):
     """Quote given document."""
-    doc, query = parse.splitquery(document)
-    doc = parse.quote(doc, safe='/=,')
+    doc, query = urllib.parse.splitquery(document)
+    doc = urllib.parse.quote(doc, safe='/=,')
     if query:
         return "%s?%s" % (doc, query)
     return doc
@@ -451,8 +446,8 @@ def url_split (url):
     hostname is always lowercased.
     Precondition: url is syntactically correct URI (eg has no whitespace)
     """
-    scheme, netloc = parse.splittype(url)
-    host, document = parse.splithost(netloc)
+    scheme, netloc = urllib.parse.splittype(url)
+    host, document = urllib.parse.splithost(netloc)
     port = default_ports.get(scheme, 0)
     if host:
         host = host.lower()
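
url.py leans hardest on the legacy helpers: splittype, splithost, splituser, splitquery, and splitport all still exist in Python 3's urllib.parse, but only as undocumented functions that Python 3.8 started deprecating. A hedged sketch of the documented urlsplit equivalent for the url_split case above (names are illustrative, not from the PR):

    from urllib.parse import urlsplit

    def split_url(url):
        """Documented-API stand-in for splittype/splithost/splitport."""
        parts = urlsplit(url)
        # hostname is already lowercased; port is an int, or None when absent
        return parts.scheme, parts.hostname, parts.port, parts.path

    # split_url("http://Example.COM:8080/docs") -> ('http', 'example.com', 8080, '/docs')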


@@ -20,7 +20,7 @@ Analyze a memory dump by the meliae module.
 import sys
 import os
 import codecs
-from html import escape as html_escape
+import html
 from linkcheck import strformat

 def main (filename):
@@ -107,7 +107,7 @@ def write_html_obj(fp, obj, objs):
     if obj.value is None:
         value = "None"
     else:
-        value = html_escape(str(obj.value))
+        value = html.escape(str(obj.value))
     attrs = dict(
         address=obj.address,
         size=strformat.strsize(obj.size),


@@ -17,17 +17,14 @@
 Define http test support classes for LinkChecker tests.
 """
-from html import escape as html_escape
+import html
 from http.server import CGIHTTPRequestHandler, SimpleHTTPRequestHandler, HTTPServer
 from http.client import HTTPConnection, HTTPSConnection
 import os.path
 import ssl
 import time
 import threading
-try:
-    from urllib import parse as urllib_parse
-except ImportError:
-    import urllib as urllib_parse
+import urllib.parse
 from io import BytesIO

 from . import LinkCheckTest
 from .. import get_file
@@ -136,7 +133,7 @@ class NoQueryHttpRequestHandler (StoppableHttpRequestHandler):
             displayname = linkname = name
             list_item = (
                 '<li><a href="%s">%s</a>\n'
-                % (urllib_parse.quote(linkname), html_escape(displayname))
+                % (urllib.parse.quote(linkname), html.escape(displayname))
             )
             f.write(list_item.encode())
         f.write(b"</ul>\n<hr>\n</body>\n</html>\n")


@@ -17,10 +17,7 @@
 Test cgi form routines.
 """
 import unittest
-try: # Python 3
-    from urllib import parse as urllib_parse
-except ImportError: # Python 2
-    import urllib as urllib_parse
+import urllib.parse
 from io import BytesIO
 from wsgiref.util import setup_testing_defaults
 from linkcheck.lc_cgi import checkform, checklink, LCFormError, application
@@ -59,7 +56,7 @@ class TestWsgi (unittest.TestCase):

     def test_application (self):
         form = dict(url="http://www.example.com/", level="0")
-        formdata = urllib_parse.urlencode(form)
+        formdata = urllib.parse.urlencode(form)
         formdata = formdata.encode('ascii')
         environ = {'wsgi.input': BytesIO(formdata)}
         setup_testing_defaults(environ)