mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-30 11:04:50 +00:00
Merge pull request #382 from cjmayo/tidyten5
Make urllib imports and html.escape Python 3 only
This commit is contained in:
commit
f3eb787014
16 changed files with 103 additions and 181 deletions
|
|
@ -18,11 +18,9 @@ Main functions for link checking.
|
|||
"""
|
||||
|
||||
import os
|
||||
from html import escape as html_escape
|
||||
try: # Python 3
|
||||
from urllib import parse as urlparse
|
||||
except ImportError:
|
||||
import urllib as urlparse
|
||||
import html
|
||||
import urllib.parse
|
||||
|
||||
from .. import strformat, url as urlutil, log, LOG_CHECK
|
||||
|
||||
MAX_FILESIZE = 1024*1024*10 # 10MB
|
||||
|
|
@ -165,9 +163,9 @@ def get_index_html (urls):
|
|||
"""
|
||||
lines = ["<html>", "<body>"]
|
||||
for entry in urls:
|
||||
name = html_escape(entry)
|
||||
name = html.escape(entry)
|
||||
try:
|
||||
url = html_escape(urlparse.quote(entry))
|
||||
url = html.escape(urllib.parse.quote(entry))
|
||||
except KeyError:
|
||||
# Some unicode entries raise KeyError.
|
||||
url = name
|
||||
|
|
|
|||
|
|
@ -19,20 +19,8 @@ Handle local file: links.
|
|||
|
||||
import re
|
||||
import os
|
||||
try:
|
||||
import urlparse
|
||||
except ImportError:
|
||||
# Python 3
|
||||
from urllib import parse as urlparse
|
||||
try: # Python 3
|
||||
from urllib import request as urlrequest
|
||||
except ImportError:
|
||||
import urllib as urlrequest
|
||||
try:
|
||||
from urllib2 import urlopen
|
||||
except ImportError:
|
||||
# Python 3
|
||||
from urllib.request import urlopen
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from builtins import str as str_text
|
||||
from datetime import datetime
|
||||
|
||||
|
|
@ -82,7 +70,7 @@ def get_os_filename (path):
|
|||
"""Return filesystem path for given URL path."""
|
||||
if os.name == 'nt':
|
||||
path = prepare_urlpath_for_nt(path)
|
||||
res = urlrequest.url2pathname(fileutil.path_safe(path))
|
||||
res = urllib.request.url2pathname(fileutil.path_safe(path))
|
||||
if os.name == 'nt' and res.endswith(':') and len(res) == 2:
|
||||
# Work around http://bugs.python.org/issue11474
|
||||
res += os.sep
|
||||
|
|
@ -153,7 +141,7 @@ class FileUrl (urlbase.UrlBase):
|
|||
from .urlbase import url_norm
|
||||
# norm base url - can raise UnicodeError from url.idna_encode()
|
||||
base_url, is_idn = url_norm(self.base_url, self.encoding)
|
||||
urlparts = list(urlparse.urlsplit(base_url))
|
||||
urlparts = list(urllib.parse.urlsplit(base_url))
|
||||
# ignore query part for filesystem urls
|
||||
urlparts[3] = ''
|
||||
self.base_url = urlutil.urlunsplit(urlparts)
|
||||
|
|
@ -189,7 +177,7 @@ class FileUrl (urlbase.UrlBase):
|
|||
self.set_result(_("directory"))
|
||||
else:
|
||||
url = fileutil.path_safe(self.url)
|
||||
self.url_connection = urlopen(url)
|
||||
self.url_connection = urllib.request.urlopen(url)
|
||||
self.check_case_sensitivity()
|
||||
|
||||
def check_case_sensitivity (self):
|
||||
|
|
|
|||
|
|
@ -18,11 +18,7 @@ Handle for mailto: links.
|
|||
"""
|
||||
|
||||
import re
|
||||
try:
|
||||
import urlparse
|
||||
except ImportError:
|
||||
# Python 3
|
||||
from urllib import parse as urlparse
|
||||
import urllib.parse
|
||||
from email._parseaddr import AddressList
|
||||
|
||||
from . import urlbase
|
||||
|
|
@ -94,7 +90,7 @@ class MailtoUrl (urlbase.UrlBase):
|
|||
Stores parsed addresses in the self.addresses set.
|
||||
"""
|
||||
# cut off leading mailto: and unquote
|
||||
url = urlparse.unquote(self.base_url[7:], self.encoding)
|
||||
url = urllib.parse.unquote(self.base_url[7:], self.encoding)
|
||||
# search for cc, bcc, to and store in headers
|
||||
mode = 0 # 0=default, 1=quote, 2=esc
|
||||
quote = None
|
||||
|
|
@ -118,11 +114,11 @@ class MailtoUrl (urlbase.UrlBase):
|
|||
if i < (len(url) - 1):
|
||||
self.addresses.update(getaddresses(url[:i]))
|
||||
try:
|
||||
headers = urlparse.parse_qs(url[(i+1):], strict_parsing=True)
|
||||
headers = urllib.parse.parse_qs(url[(i+1):], strict_parsing=True)
|
||||
for key, vals in headers.items():
|
||||
if key.lower() in EMAIL_CGI_ADDRESS:
|
||||
# Only the first header value is added
|
||||
self.addresses.update(getaddresses(urlparse.unquote(vals[0], self.encoding)))
|
||||
self.addresses.update(getaddresses(urllib.parse.unquote(vals[0], self.encoding)))
|
||||
if key.lower() == EMAIL_CGI_SUBJECT:
|
||||
self.subject = vals[0]
|
||||
except ValueError as err:
|
||||
|
|
|
|||
|
|
@ -16,14 +16,8 @@
|
|||
"""
|
||||
Mixin class for URLs that can be fetched over a proxy.
|
||||
"""
|
||||
try: # Python 3
|
||||
from urllib import parse
|
||||
from urllib import request
|
||||
from urllib.parse import splitport
|
||||
except ImportError:
|
||||
from urllib import splitport
|
||||
import urllib as request
|
||||
import urlparse as parse
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
import os
|
||||
from .. import LinkCheckerError, log, LOG_CHECK, url as urlutil, httputil
|
||||
|
||||
|
|
@ -41,7 +35,7 @@ class ProxySupport:
|
|||
self.proxyauth = None
|
||||
if not self.proxy:
|
||||
return
|
||||
proxyurl = parse.urlparse(self.proxy)
|
||||
proxyurl = urllib.parse.urlparse(self.proxy)
|
||||
self.proxytype = proxyurl.scheme
|
||||
if self.proxytype not in ('http', 'https'):
|
||||
# Note that invalid proxies might raise TypeError in urllib2,
|
||||
|
|
@ -68,11 +62,11 @@ class ProxySupport:
|
|||
|
||||
def ignore_proxy_host (self):
|
||||
"""Check if self.host is in the $no_proxy ignore list."""
|
||||
if request.proxy_bypass(self.host):
|
||||
if urllib.request.proxy_bypass(self.host):
|
||||
return True
|
||||
no_proxy = os.environ.get("no_proxy")
|
||||
if no_proxy:
|
||||
entries = [parse_host_port(x) for x in no_proxy.split(",")]
|
||||
entries = [urlutil.splitport(x.strip()) for x in no_proxy.split(",")]
|
||||
for host, port in entries:
|
||||
if host.lower() == self.host and port == self.port:
|
||||
return True
|
||||
|
|
@ -93,12 +87,3 @@ class ProxySupport:
|
|||
host = self.host
|
||||
port = self.port
|
||||
return (scheme, host, port)
|
||||
|
||||
|
||||
def parse_host_port (host_port):
|
||||
"""Parse a host:port string into separate components."""
|
||||
host, port = splitport(host_port.strip())
|
||||
if port is not None:
|
||||
if urlutil.is_numeric_port(port):
|
||||
port = int(port)
|
||||
return host, port
|
||||
|
|
|
|||
|
|
@ -18,21 +18,8 @@ Base URL handler.
|
|||
"""
|
||||
import sys
|
||||
import os
|
||||
try:
|
||||
import urlparse
|
||||
except ImportError:
|
||||
# Python 3
|
||||
from urllib import parse as urlparse
|
||||
try: # Python 3
|
||||
from urllib import parse as urllib_parse
|
||||
except ImportError:
|
||||
import urllib as urllib_parse
|
||||
try:
|
||||
from urllib2 import urlopen
|
||||
except ImportError:
|
||||
# Python 3
|
||||
from urllib.request import urlopen
|
||||
import urllib
|
||||
import urllib.parse
|
||||
from urllib.request import urlopen
|
||||
import time
|
||||
import errno
|
||||
import socket
|
||||
|
|
@ -66,7 +53,7 @@ def urljoin (parent, url):
|
|||
"""
|
||||
if urlutil.url_is_absolute(url):
|
||||
return url
|
||||
return urlparse.urljoin(parent, url)
|
||||
return urllib.parse.urljoin(parent, url)
|
||||
|
||||
|
||||
def url_norm (url, encoding):
|
||||
|
|
@ -372,14 +359,14 @@ class UrlBase:
|
|||
self.url = urljoin(self.base_ref, base_url)
|
||||
elif self.parent_url:
|
||||
# strip the parent url query and anchor
|
||||
urlparts = list(urlparse.urlsplit(self.parent_url))
|
||||
urlparts = list(urllib.parse.urlsplit(self.parent_url))
|
||||
urlparts[4] = ""
|
||||
parent_url = urlutil.urlunsplit(urlparts)
|
||||
self.url = urljoin(parent_url, base_url)
|
||||
else:
|
||||
self.url = base_url
|
||||
# urljoin can unnorm the url path, so norm it again
|
||||
urlparts = list(urlparse.urlsplit(self.url))
|
||||
urlparts = list(urllib.parse.urlsplit(self.url))
|
||||
if urlparts[2]:
|
||||
urlparts[2] = urlutil.collapse_segments(urlparts[2])
|
||||
if not urlparts[0].startswith("feed"):
|
||||
|
|
@ -396,7 +383,7 @@ class UrlBase:
|
|||
Also checks for obfuscated IP addresses.
|
||||
"""
|
||||
# check userinfo@host:port syntax
|
||||
self.userinfo, host = urllib_parse.splituser(self.urlparts[1])
|
||||
self.userinfo, host = urllib.parse.splituser(self.urlparts[1])
|
||||
port = urlutil.default_ports.get(self.scheme, 0)
|
||||
host, port = urlutil.splitport(host, port=port)
|
||||
if port is None:
|
||||
|
|
@ -676,7 +663,7 @@ class UrlBase:
|
|||
"""
|
||||
if self.userinfo:
|
||||
# URL itself has authentication info
|
||||
return urllib_parse.splitpasswd(self.userinfo)
|
||||
return urllib.parse.splitpasswd(self.userinfo)
|
||||
return self.aggregate.config.get_user_password(self.url)
|
||||
|
||||
def add_url (self, url, line=0, column=0, page=0, name="", base=None):
|
||||
|
|
|
|||
|
|
@ -20,12 +20,8 @@ Store metadata and options.
|
|||
from functools import lru_cache
|
||||
import os
|
||||
import re
|
||||
try: # Python 3
|
||||
from urllib import parse
|
||||
from urllib import request
|
||||
except ImportError: # Python 2
|
||||
import urlparse as parse
|
||||
import urllib as request
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
import shutil
|
||||
import socket
|
||||
import _LinkChecker_configdata as configdata
|
||||
|
|
@ -172,7 +168,7 @@ class Configuration (dict):
|
|||
self["maxrequestspersecond"] = 10
|
||||
self["maxhttpredirects"] = 10
|
||||
self["nntpserver"] = os.environ.get("NNTP_SERVER", None)
|
||||
self["proxy"] = request.getproxies()
|
||||
self["proxy"] = urllib.request.getproxies()
|
||||
self["sslverify"] = True
|
||||
self["threads"] = 10
|
||||
self["timeout"] = 60
|
||||
|
|
@ -317,7 +313,7 @@ class Configuration (dict):
|
|||
if not url.lower().startswith(("http:", "https:")):
|
||||
log.warn(LOG_CHECK, _("login URL is not a HTTP URL."))
|
||||
disable = True
|
||||
urlparts = parse.urlsplit(url)
|
||||
urlparts = urllib.parse.urlsplit(url)
|
||||
if not urlparts[0] or not urlparts[1] or not urlparts[2]:
|
||||
log.warn(LOG_CHECK, _("login URL is incomplete."))
|
||||
disable = True
|
||||
|
|
|
|||
|
|
@ -24,10 +24,7 @@ except ImportError:
|
|||
|
||||
import requests
|
||||
import time
|
||||
try: # Python 3
|
||||
from urllib import parse
|
||||
except ImportError:
|
||||
import urlparse as parse
|
||||
import urllib.parse
|
||||
import random
|
||||
from .. import log, LOG_CHECK, strformat, LinkCheckerError
|
||||
from ..decorators import synchronized
|
||||
|
|
@ -103,7 +100,7 @@ class Aggregate:
|
|||
form.data[cgipassword] = password
|
||||
for key, value in self.config["loginextrafields"].items():
|
||||
form.data[key] = value
|
||||
formurl = parse.urljoin(url, form.url)
|
||||
formurl = urllib.parse.urljoin(url, form.url)
|
||||
log.debug(LOG_CHECK, "Posting login data to %s", formurl)
|
||||
response = session.post(formurl, data=form.data)
|
||||
response.raise_for_status()
|
||||
|
|
|
|||
|
|
@ -17,17 +17,14 @@
|
|||
Functions used by the WSGI script.
|
||||
"""
|
||||
|
||||
from html import escape as html_escape
|
||||
import html
|
||||
import os
|
||||
import threading
|
||||
import locale
|
||||
import re
|
||||
import time
|
||||
try:
|
||||
import urlparse
|
||||
except ImportError:
|
||||
# Python 3
|
||||
from urllib import parse as urlparse
|
||||
import urllib.parse
|
||||
|
||||
from . import configuration, strformat, checker, director, get_link_pat, \
|
||||
init_i18n, url as urlutil
|
||||
from .decorators import synchronized
|
||||
|
|
@ -54,7 +51,7 @@ def application(environ, start_response):
|
|||
request_body = environ['wsgi.input'].read(request_body_size)
|
||||
else:
|
||||
request_body = environ['wsgi.input'].read()
|
||||
form = urlparse.parse_qs(request_body.decode(HTML_ENCODING))
|
||||
form = urllib.parse.parse_qs(request_body.decode(HTML_ENCODING))
|
||||
|
||||
status = '200 OK'
|
||||
start_response(status, get_response_headers())
|
||||
|
|
@ -188,7 +185,7 @@ def get_configuration(form, out):
|
|||
|
||||
def get_host_name (form):
|
||||
"""Return host name of given URL."""
|
||||
return urlparse.urlparse(formvalue(form, "url"))[1]
|
||||
return urllib.parse.urlparse(formvalue(form, "url"))[1]
|
||||
|
||||
|
||||
def checkform (form, env):
|
||||
|
|
@ -264,4 +261,4 @@ contains only these characters: <code>A-Za-z0-9./_~-</code><br/><br/>
|
|||
Errors are logged.
|
||||
</blockquote>
|
||||
</body>
|
||||
</html>""") % html_escape(why)
|
||||
</html>""") % html.escape(why)
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@
|
|||
A HTML logger.
|
||||
"""
|
||||
|
||||
from html import escape as html_escape
|
||||
import html
|
||||
import os
|
||||
import time
|
||||
|
||||
|
|
@ -174,12 +174,12 @@ class HtmlLogger (_Logger):
|
|||
self.writeln("<tr>")
|
||||
self.writeln('<td class="url">%s</td>' % self.part("url"))
|
||||
self.write('<td class="url">')
|
||||
self.write("`%s'" % html_escape(url_data.base_url))
|
||||
self.write("`%s'" % html.escape(url_data.base_url))
|
||||
self.writeln("</td></tr>")
|
||||
|
||||
def write_name (self, url_data):
|
||||
"""Write url_data.name."""
|
||||
args = (self.part("name"), html_escape(url_data.name))
|
||||
args = (self.part("name"), html.escape(url_data.name))
|
||||
self.writeln("<tr><td>%s</td><td>`%s'</td></tr>" % args)
|
||||
|
||||
def write_parent (self, url_data):
|
||||
|
|
@ -187,7 +187,7 @@ class HtmlLogger (_Logger):
|
|||
self.write("<tr><td>"+self.part("parenturl")+
|
||||
'</td><td><a target="top" href="'+
|
||||
url_data.parent_url+'">'+
|
||||
html_escape(url_data.parent_url)+"</a>")
|
||||
html.escape(url_data.parent_url)+"</a>")
|
||||
if url_data.line is not None:
|
||||
self.write(_(", line %d") % url_data.line)
|
||||
if url_data.column is not None:
|
||||
|
|
@ -206,13 +206,13 @@ class HtmlLogger (_Logger):
|
|||
def write_base (self, url_data):
|
||||
"""Write url_data.base_ref."""
|
||||
self.writeln("<tr><td>"+self.part("base")+"</td><td>"+
|
||||
html_escape(url_data.base_ref)+"</td></tr>")
|
||||
html.escape(url_data.base_ref)+"</td></tr>")
|
||||
|
||||
def write_real (self, url_data):
|
||||
"""Write url_data.url."""
|
||||
self.writeln("<tr><td>"+self.part("realurl")+"</td><td>"+
|
||||
'<a target="top" href="'+url_data.url+
|
||||
'">'+html_escape(url_data.url)+"</a></td></tr>")
|
||||
'">'+html.escape(url_data.url)+"</a></td></tr>")
|
||||
|
||||
def write_dltime (self, url_data):
|
||||
"""Write url_data.dltime."""
|
||||
|
|
@ -234,20 +234,20 @@ class HtmlLogger (_Logger):
|
|||
def write_info (self, url_data):
|
||||
"""Write url_data.info."""
|
||||
sep = "<br/>"+os.linesep
|
||||
text = sep.join(html_escape(x) for x in url_data.info)
|
||||
text = sep.join(html.escape(x) for x in url_data.info)
|
||||
self.writeln('<tr><td valign="top">' + self.part("info")+
|
||||
"</td><td>"+text+"</td></tr>")
|
||||
|
||||
def write_modified(self, url_data):
|
||||
"""Write url_data.modified."""
|
||||
text = html_escape(self.format_modified(url_data.modified))
|
||||
text = html.escape(self.format_modified(url_data.modified))
|
||||
self.writeln('<tr><td valign="top">' + self.part("modified") +
|
||||
"</td><td>"+text+"</td></tr>")
|
||||
|
||||
def write_warning (self, url_data):
|
||||
"""Write url_data.warnings."""
|
||||
sep = "<br/>"+os.linesep
|
||||
text = sep.join(html_escape(x[1]) for x in url_data.warnings)
|
||||
text = sep.join(html.escape(x[1]) for x in url_data.warnings)
|
||||
self.writeln('<tr><td class="warning" '+
|
||||
'valign="top">' + self.part("warning") +
|
||||
'</td><td class="warning">' + text + "</td></tr>")
|
||||
|
|
@ -258,14 +258,14 @@ class HtmlLogger (_Logger):
|
|||
self.write('<tr><td class="valid">')
|
||||
self.write(self.part("result"))
|
||||
self.write('</td><td class="valid">')
|
||||
self.write(html_escape(_("Valid")))
|
||||
self.write(html.escape(_("Valid")))
|
||||
else:
|
||||
self.write('<tr><td class="error">')
|
||||
self.write(self.part("result"))
|
||||
self.write('</td><td class="error">')
|
||||
self.write(html_escape(_("Error")))
|
||||
self.write(html.escape(_("Error")))
|
||||
if url_data.result:
|
||||
self.write(": "+html_escape(url_data.result))
|
||||
self.write(": "+html.escape(url_data.result))
|
||||
self.writeln("</td></tr>")
|
||||
|
||||
def write_stats (self):
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@
|
|||
"""
|
||||
Check HTML anchors
|
||||
"""
|
||||
from urllib import parse
|
||||
import urllib.parse
|
||||
|
||||
from . import _ContentPlugin
|
||||
from .. import log, LOG_PLUGIN
|
||||
|
|
@ -48,7 +48,7 @@ class AnchorCheck(_ContentPlugin):
|
|||
A warning is logged and True is returned if the anchor is not found.
|
||||
"""
|
||||
log.debug(LOG_PLUGIN, "checking anchor %r in %s", url_data.anchor, self.anchors)
|
||||
if any(x for x in self.anchors if parse.quote(x[0]) == url_data.anchor):
|
||||
if any(x for x in self.anchors if urllib.parse.quote(x[0]) == url_data.anchor):
|
||||
return
|
||||
if self.anchors:
|
||||
anchornames = sorted(set("`%s'" % x[0] for x in self.anchors))
|
||||
|
|
|
|||
|
|
@ -19,15 +19,8 @@ Robots.txt parser.
|
|||
The robots.txt Exclusion Protocol is implemented as specified in
|
||||
http://www.robotstxt.org/wc/norobots-rfc.html
|
||||
"""
|
||||
try: # Python 3
|
||||
from urllib import parse
|
||||
except ImportError: # Python 2
|
||||
import urllib as parse
|
||||
try: # Python 3
|
||||
from urllib.parse import urlparse
|
||||
except ImportError: # Python 2
|
||||
from urlparse import urlparse
|
||||
import time
|
||||
import urllib.parse
|
||||
|
||||
import requests
|
||||
|
||||
|
|
@ -84,7 +77,7 @@ class RobotFileParser:
|
|||
def set_url (self, url):
|
||||
"""Set the URL referring to a robots.txt file."""
|
||||
self.url = url
|
||||
self.host, self.path = urlparse(url)[1:3]
|
||||
self.host, self.path = urllib.parse.urlparse(url)[1:3]
|
||||
|
||||
def read (self):
|
||||
"""Read the robots.txt URL and feeds it to the parser."""
|
||||
|
|
@ -168,7 +161,7 @@ class RobotFileParser:
|
|||
line = line.split(':', 1)
|
||||
if len(line) == 2:
|
||||
line[0] = line[0].strip().lower()
|
||||
line[1] = parse.unquote(line[1].strip(), self.encoding)
|
||||
line[1] = urllib.parse.unquote(line[1].strip(), self.encoding)
|
||||
if line[0] == "user-agent":
|
||||
if state == 2:
|
||||
log.debug(LOG_CHECK, "%r line %d: missing blank line before user-agent directive", self.url, linenumber)
|
||||
|
|
@ -236,7 +229,7 @@ class RobotFileParser:
|
|||
return True
|
||||
# search for given user agent matches
|
||||
# the first match counts
|
||||
url = parse.quote(urlparse(parse.unquote(url))[2]) or "/"
|
||||
url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/"
|
||||
for entry in self.entries:
|
||||
if entry.applies_to(useragent):
|
||||
return entry.allowance(url)
|
||||
|
|
@ -282,7 +275,7 @@ class RuleLine:
|
|||
# an empty value means allow all
|
||||
allowance = True
|
||||
path = '/'
|
||||
self.path = parse.quote(path)
|
||||
self.path = urllib.parse.quote(path)
|
||||
self.allowance = allowance
|
||||
|
||||
def applies_to (self, path):
|
||||
|
|
|
|||
|
|
@ -32,11 +32,7 @@ import codecs
|
|||
import os
|
||||
import math
|
||||
import time
|
||||
try:
|
||||
import urlparse
|
||||
except ImportError:
|
||||
# Python 3
|
||||
from urllib import parse as urlparse
|
||||
import urllib.parse
|
||||
import locale
|
||||
import pydoc
|
||||
from . import i18n
|
||||
|
|
@ -95,8 +91,8 @@ def is_encoding (text):
|
|||
|
||||
|
||||
def url_unicode_split (url):
|
||||
"""Like urlparse.urlsplit(), but always returning unicode parts."""
|
||||
return [unicode_safe(s) for s in urlparse.urlsplit(url)]
|
||||
"""Like urllib.parse.urlsplit(), but always returning unicode parts."""
|
||||
return [unicode_safe(s) for s in urllib.parse.urlsplit(url)]
|
||||
|
||||
|
||||
def unquote (s, matching=False):
|
||||
|
|
|
|||
|
|
@ -19,12 +19,7 @@ Functions for parsing and matching URL strings.
|
|||
|
||||
import os
|
||||
import re
|
||||
try: # Python 3
|
||||
from urllib import parse
|
||||
from urllib import parse as urlparse
|
||||
except ImportError: # Python 2
|
||||
import urllib as parse
|
||||
import urlparse
|
||||
import urllib.parse
|
||||
|
||||
import requests
|
||||
from builtins import str as str_text
|
||||
|
|
@ -32,8 +27,8 @@ from builtins import str as str_text
|
|||
from . import log, LOG_CHECK
|
||||
|
||||
for scheme in ('ldap', 'irc'):
|
||||
if scheme not in urlparse.uses_netloc:
|
||||
urlparse.uses_netloc.append(scheme)
|
||||
if scheme not in urllib.parse.uses_netloc:
|
||||
urllib.parse.uses_netloc.append(scheme)
|
||||
|
||||
# The character set to encode non-ASCII characters in a URL. See also
|
||||
# http://tools.ietf.org/html/rfc2396#section-2.1
|
||||
|
|
@ -164,9 +159,9 @@ def parse_qsl (qs, encoding, keep_blank_values=0, strict_parsing=0):
|
|||
else:
|
||||
continue
|
||||
if nv[1] or keep_blank_values:
|
||||
name = parse.unquote(nv[0].replace('+', ' '), encoding=encoding)
|
||||
name = urllib.parse.unquote(nv[0].replace('+', ' '), encoding=encoding)
|
||||
if nv[1]:
|
||||
value = parse.unquote(nv[1].replace('+', ' '), encoding=encoding)
|
||||
value = urllib.parse.unquote(nv[1].replace('+', ' '), encoding=encoding)
|
||||
else:
|
||||
value = nv[1]
|
||||
r.append((name, value, sep))
|
||||
|
|
@ -191,12 +186,12 @@ def idna_encode (host):
|
|||
def url_fix_host (urlparts, encoding):
|
||||
"""Unquote and fix hostname. Returns is_idn."""
|
||||
if not urlparts[1]:
|
||||
urlparts[2] = parse.unquote(urlparts[2], encoding=encoding)
|
||||
urlparts[2] = urllib.parse.unquote(urlparts[2], encoding=encoding)
|
||||
return False
|
||||
userpass, netloc = parse.splituser(urlparts[1])
|
||||
userpass, netloc = urllib.parse.splituser(urlparts[1])
|
||||
if userpass:
|
||||
userpass = parse.unquote(userpass, encoding=encoding)
|
||||
netloc, is_idn = idna_encode(parse.unquote(netloc, encoding=encoding).lower())
|
||||
userpass = urllib.parse.unquote(userpass, encoding=encoding)
|
||||
netloc, is_idn = idna_encode(urllib.parse.unquote(netloc, encoding=encoding).lower())
|
||||
# a leading backslash in path causes urlsplit() to add the
|
||||
# path components up to the first slash to host
|
||||
# try to find this case...
|
||||
|
|
@ -207,7 +202,7 @@ def url_fix_host (urlparts, encoding):
|
|||
if not urlparts[2] or urlparts[2] == '/':
|
||||
urlparts[2] = comps
|
||||
else:
|
||||
urlparts[2] = "%s%s" % (comps, parse.unquote(urlparts[2], encoding=encoding))
|
||||
urlparts[2] = "%s%s" % (comps, urllib.parse.unquote(urlparts[2], encoding=encoding))
|
||||
netloc = netloc[:i]
|
||||
else:
|
||||
# a leading ? in path causes urlsplit() to add the query to the
|
||||
|
|
@ -216,7 +211,7 @@ def url_fix_host (urlparts, encoding):
|
|||
if i != -1:
|
||||
netloc, urlparts[3] = netloc.split('?', 1)
|
||||
# path
|
||||
urlparts[2] = parse.unquote(urlparts[2], encoding=encoding)
|
||||
urlparts[2] = urllib.parse.unquote(urlparts[2], encoding=encoding)
|
||||
if userpass:
|
||||
# append AT for easy concatenation
|
||||
userpass += "@"
|
||||
|
|
@ -266,9 +261,9 @@ def url_parse_query (query, encoding):
|
|||
append = '?'+url_parse_query(rest, encoding=encoding)+append
|
||||
l = []
|
||||
for k, v, sep in parse_qsl(query, keep_blank_values=True, encoding=encoding):
|
||||
k = parse.quote(k, safe='/-:,;')
|
||||
k = urllib.parse.quote(k, safe='/-:,;')
|
||||
if v:
|
||||
v = parse.quote(v, safe='/-:,;')
|
||||
v = urllib.parse.quote(v, safe='/-:,;')
|
||||
l.append("%s=%s%s" % (k, v, sep))
|
||||
elif v is None:
|
||||
l.append("%s%s" % (k, sep))
|
||||
|
|
@ -279,12 +274,12 @@ def url_parse_query (query, encoding):
|
|||
|
||||
|
||||
def urlunsplit (urlparts):
|
||||
"""Same as urlparse.urlunsplit but with extra UNC path handling
|
||||
"""Same as urllib.parse.urlunsplit but with extra UNC path handling
|
||||
for Windows OS."""
|
||||
res = urlparse.urlunsplit(urlparts)
|
||||
res = urllib.parse.urlunsplit(urlparts)
|
||||
if os.name == 'nt' and urlparts[0] == 'file' and '|' not in urlparts[2]:
|
||||
# UNC paths must have 4 slashes: 'file:////server/path'
|
||||
# Depending on the path in urlparts[2], urlparse.urlunsplit()
|
||||
# Depending on the path in urlparts[2], urllib.parse.urlunsplit()
|
||||
# left only two or three slashes. This is fixed below
|
||||
repl = 'file://' if urlparts[2].startswith('//') else 'file:/'
|
||||
res = res.replace('file:', repl)
|
||||
|
|
@ -298,9 +293,9 @@ def url_norm (url, encoding):
|
|||
@return: (normed url, idna flag)
|
||||
@rtype: tuple of length two
|
||||
"""
|
||||
urlparts = list(urlparse.urlsplit(url))
|
||||
urlparts = list(urllib.parse.urlsplit(url))
|
||||
# scheme
|
||||
urlparts[0] = parse.unquote(urlparts[0], encoding=encoding).lower()
|
||||
urlparts[0] = urllib.parse.unquote(urlparts[0], encoding=encoding).lower()
|
||||
# mailto: urlsplit is broken
|
||||
if urlparts[0] == 'mailto':
|
||||
url_fix_mailto_urlsplit(urlparts)
|
||||
|
|
@ -308,7 +303,7 @@ def url_norm (url, encoding):
|
|||
is_idn = url_fix_host(urlparts, encoding)
|
||||
# query
|
||||
urlparts[3] = url_parse_query(urlparts[3], encoding=encoding)
|
||||
if urlparts[0] in urlparse.uses_relative:
|
||||
if urlparts[0] in urllib.parse.uses_relative:
|
||||
# URL has a hierarchical path we should norm
|
||||
if not urlparts[2]:
|
||||
# Empty path is allowed if both query and fragment are also empty.
|
||||
|
|
@ -320,14 +315,14 @@ def url_norm (url, encoding):
|
|||
# fix redundant path parts
|
||||
urlparts[2] = collapse_segments(urlparts[2])
|
||||
# anchor
|
||||
urlparts[4] = parse.unquote(urlparts[4], encoding=encoding)
|
||||
urlparts[4] = urllib.parse.unquote(urlparts[4], encoding=encoding)
|
||||
# quote parts again
|
||||
urlparts[0] = parse.quote(urlparts[0]) # scheme
|
||||
urlparts[1] = parse.quote(urlparts[1], safe='@:') # host
|
||||
urlparts[2] = parse.quote(urlparts[2], safe=_nopathquote_chars) # path
|
||||
urlparts[0] = urllib.parse.quote(urlparts[0]) # scheme
|
||||
urlparts[1] = urllib.parse.quote(urlparts[1], safe='@:') # host
|
||||
urlparts[2] = urllib.parse.quote(urlparts[2], safe=_nopathquote_chars) # path
|
||||
if not urlparts[0].startswith("feed"):
|
||||
urlparts[2] = url_fix_wayback_query(urlparts[2]) # unencode colon in http[s]:// in wayback path
|
||||
urlparts[4] = parse.quote(urlparts[4], safe="!$&'()*+,-./;=?@_~") # anchor
|
||||
urlparts[4] = urllib.parse.quote(urlparts[4], safe="!$&'()*+,-./;=?@_~") # anchor
|
||||
res = urlunsplit(urlparts)
|
||||
if url.endswith('#') and not urlparts[4]:
|
||||
# re-append trailing empty fragment
|
||||
|
|
@ -380,28 +375,28 @@ def url_quote (url, encoding):
|
|||
"""Quote given URL."""
|
||||
if not url_is_absolute(url):
|
||||
return document_quote(url)
|
||||
urlparts = list(urlparse.urlsplit(url))
|
||||
urlparts[0] = parse.quote(urlparts[0]) # scheme
|
||||
urlparts[1] = parse.quote(urlparts[1], safe=':') # host
|
||||
urlparts[2] = parse.quote(urlparts[2], safe='/=,') # path
|
||||
urlparts[3] = parse.quote(urlparts[3], safe='&=,') # query
|
||||
urlparts = list(urllib.parse.urlsplit(url))
|
||||
urlparts[0] = urllib.parse.quote(urlparts[0]) # scheme
|
||||
urlparts[1] = urllib.parse.quote(urlparts[1], safe=':') # host
|
||||
urlparts[2] = urllib.parse.quote(urlparts[2], safe='/=,') # path
|
||||
urlparts[3] = urllib.parse.quote(urlparts[3], safe='&=,') # query
|
||||
l = []
|
||||
for k, v, sep in parse_qsl(urlparts[3], encoding=encoding, keep_blank_values=True): # query
|
||||
k = parse.quote(k, safe='/-:,;')
|
||||
k = urllib.parse.quote(k, safe='/-:,;')
|
||||
if v:
|
||||
v = parse.quote(v, safe='/-:,;')
|
||||
v = urllib.parse.quote(v, safe='/-:,;')
|
||||
l.append("%s=%s%s" % (k, v, sep))
|
||||
else:
|
||||
l.append("%s%s" % (k, sep))
|
||||
urlparts[3] = ''.join(l)
|
||||
urlparts[4] = parse.quote(urlparts[4]) # anchor
|
||||
urlparts[4] = urllib.parse.quote(urlparts[4]) # anchor
|
||||
return urlunsplit(urlparts)
|
||||
|
||||
|
||||
def document_quote (document):
|
||||
"""Quote given document."""
|
||||
doc, query = parse.splitquery(document)
|
||||
doc = parse.quote(doc, safe='/=,')
|
||||
doc, query = urllib.parse.splitquery(document)
|
||||
doc = urllib.parse.quote(doc, safe='/=,')
|
||||
if query:
|
||||
return "%s?%s" % (doc, query)
|
||||
return doc
|
||||
|
|
@ -451,8 +446,8 @@ def url_split (url):
|
|||
hostname is always lowercased.
|
||||
Precondition: url is syntactically correct URI (eg has no whitespace)
|
||||
"""
|
||||
scheme, netloc = parse.splittype(url)
|
||||
host, document = parse.splithost(netloc)
|
||||
scheme, netloc = urllib.parse.splittype(url)
|
||||
host, document = urllib.parse.splithost(netloc)
|
||||
port = default_ports.get(scheme, 0)
|
||||
if host:
|
||||
host = host.lower()
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ Analyze a memory dump by the meliae module.
|
|||
import sys
|
||||
import os
|
||||
import codecs
|
||||
from html import escape as html_escape
|
||||
import html
|
||||
from linkcheck import strformat
|
||||
|
||||
def main (filename):
|
||||
|
|
@ -107,7 +107,7 @@ def write_html_obj(fp, obj, objs):
|
|||
if obj.value is None:
|
||||
value = "None"
|
||||
else:
|
||||
value = html_escape(str(obj.value))
|
||||
value = html.escape(str(obj.value))
|
||||
attrs = dict(
|
||||
address=obj.address,
|
||||
size=strformat.strsize(obj.size),
|
||||
|
|
|
|||
|
|
@ -17,17 +17,14 @@
|
|||
Define http test support classes for LinkChecker tests.
|
||||
"""
|
||||
|
||||
from html import escape as html_escape
|
||||
import html
|
||||
from http.server import CGIHTTPRequestHandler, SimpleHTTPRequestHandler, HTTPServer
|
||||
from http.client import HTTPConnection, HTTPSConnection
|
||||
import os.path
|
||||
import ssl
|
||||
import time
|
||||
import threading
|
||||
try:
|
||||
from urllib import parse as urllib_parse
|
||||
except ImportError:
|
||||
import urllib as urllib_parse
|
||||
import urllib.parse
|
||||
from io import BytesIO
|
||||
from . import LinkCheckTest
|
||||
from .. import get_file
|
||||
|
|
@ -136,7 +133,7 @@ class NoQueryHttpRequestHandler (StoppableHttpRequestHandler):
|
|||
displayname = linkname = name
|
||||
list_item = (
|
||||
'<li><a href="%s">%s</a>\n'
|
||||
% (urllib_parse.quote(linkname), html_escape(displayname))
|
||||
% (urllib.parse.quote(linkname), html.escape(displayname))
|
||||
)
|
||||
f.write(list_item.encode())
|
||||
f.write(b"</ul>\n<hr>\n</body>\n</html>\n")
|
||||
|
|
|
|||
|
|
@ -17,10 +17,7 @@
|
|||
Test cgi form routines.
|
||||
"""
|
||||
import unittest
|
||||
try: # Python 3
|
||||
from urllib import parse as urllib_parse
|
||||
except ImportError: # Python 2
|
||||
import urllib as urllib_parse
|
||||
import urllib.parse
|
||||
from io import BytesIO
|
||||
from wsgiref.util import setup_testing_defaults
|
||||
from linkcheck.lc_cgi import checkform, checklink, LCFormError, application
|
||||
|
|
@ -59,7 +56,7 @@ class TestWsgi (unittest.TestCase):
|
|||
|
||||
def test_application (self):
|
||||
form = dict(url="http://www.example.com/", level="0")
|
||||
formdata = urllib_parse.urlencode(form)
|
||||
formdata = urllib.parse.urlencode(form)
|
||||
formdata = formdata.encode('ascii')
|
||||
environ = {'wsgi.input': BytesIO(formdata)}
|
||||
setup_testing_defaults(environ)
|
||||
|
|
|
|||
Loading…
Reference in a new issue