Merge pull request #486 from cjmayo/url

Remove unused code from url.py
2026-04-19 05:41:00 +00:00 · 2020-08-26 19:28:50 +01:00 · 2020-08-26 19:28:50 +01:00 · 7dfba766a9
commit 7dfba766a9
parent 737c61cd67 d7efa20d33
2 changed files with 2 additions and 210 deletions
--- a/linkcheck/url.py
+++ b/linkcheck/url.py
@ -21,10 +21,6 @@ import os
 import re
 import urllib.parse

-import requests
-
-from . import log, LOG_CHECK
-
 for scheme in ('ldap', 'irc'):
    if scheme not in urllib.parse.uses_netloc:
        urllib.parse.uses_netloc.append(scheme)
@ -37,13 +33,6 @@ for scheme in ('ldap', 'irc'):
 # http://code.google.com/p/browsersec/wiki/Part1#Unicode_in_URLs
 url_encoding = "utf-8"

-
-# constants defining url part indexes
-SCHEME = 0
-HOSTNAME = DOMAIN = 1
-PORT = 2
-DOCUMENT = 3
-
 default_ports = {
    'http': 80,
    'https': 443,
@ -82,14 +71,8 @@ safe_url_pattern = r"%s://%s%s(#%s)?" % (
    _safe_fragment_pattern,
 )

-is_safe_char = re.compile("(?i)^%s$" % _safe_char).match
 is_safe_url = re.compile("(?i)^%s$" % safe_url_pattern).match
 is_safe_domain = re.compile("(?i)^%s$" % _safe_domain_pattern).match
-is_safe_host = re.compile("(?i)^%s$" % _safe_host_pattern).match
-is_safe_path = re.compile("(?i)^%s$" % _safe_path_pattern).match
-is_safe_parameter = re.compile("(?i)^%s$" % _safe_param_pattern).match
-is_safe_query = re.compile("(?i)^%s$" % _safe_query_pattern).match
-is_safe_fragment = re.compile("(?i)^%s$" % _safe_fragment_pattern).match


 # snatched form urlparse.py
@ -118,16 +101,6 @@ def is_numeric_port(portstr):
    return False


-def safe_host_pattern(host):
-    """Return regular expression pattern with given host for URL testing."""
-    return "(?i)%s://%s%s(#%s)?" % (
-        _safe_scheme_pattern,
-        host,
-        _safe_path_pattern,
-        _safe_fragment_pattern,
-    )
-
-
 def parse_qsl(qs, encoding, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

@ -244,15 +217,6 @@ def url_fix_host(urlparts, encoding):
    return is_idn


-def url_fix_common_typos(url):
-    """Fix common typos in given URL like forgotten colon."""
-    if url.startswith("http//"):
-        url = "http://" + url[6:]
-    elif url.startswith("https//"):
-        url = "https://" + url[7:]
-    return url
-
-
 def url_fix_mailto_urlsplit(urlparts):
    """Split query part of mailto url if found."""
    sep = "?"
@ -426,27 +390,6 @@ def document_quote(document):
    return doc


-def match_url(url, domainlist):
-    """Return True if host part of url matches an entry in given domain list.
-    """
-    if not url:
-        return False
-    return match_host(url_split(url)[1], domainlist)
-
-
-def match_host(host, domainlist):
-    """Return True if host matches an entry in given domain list."""
-    if not host:
-        return False
-    for domain in domainlist:
-        if domain.startswith('.'):
-            if host.endswith(domain):
-                return True
-        elif host == domain:
-            return True
-    return False
-
-
 _nopathquote_chars = "-;/=,~*+()@!"
 if os.name == 'nt':
    _nopathquote_chars += "|"
@ -467,27 +410,6 @@ def url_needs_quoting(url):
    return not _safe_url_chars_ro.match(url)


-def url_split(url):
-    """Split url in a tuple (scheme, hostname, port, document) where
-    hostname is always lowercased.
-    Precondition: url is syntactically correct URI (eg has no whitespace)
-    """
-    scheme, netloc = urllib.parse.splittype(url)
-    host, document = urllib.parse.splithost(netloc)
-    port = default_ports.get(scheme, 0)
-    if host:
-        host = host.lower()
-        host, port = splitport(host, port=port)
-    return scheme, host, port, document
-
-
-def url_unsplit(parts):
-    """Rejoin URL parts to a string."""
-    if parts[2] == default_ports.get(parts[0]):
-        return "%s://%s%s" % (parts[0], parts[1], parts[3])
-    return "%s://%s:%d%s" % parts
-
-
 def splitport(host, port=0):
    """Split optional port number from host. If host has no port number,
    the given default port is returned.
@ -512,75 +434,3 @@ def splitport(host, port=0):
            # For an invalid non-empty port leave the host name as is
            pass
    return host, port
-
-
-def get_content(url, user=None, password=None, proxy=None, data=None, addheaders=None):
-    """Get URL content and info.
-
-    @return: (decoded text content of URL, headers) or
-             (None, errmsg) on error.
-    @rtype: tuple (String, dict) or (None, String)
-    """
-    from . import configuration
-
-    headers = {
-        'User-Agent': configuration.UserAgent,
-    }
-    if addheaders:
-        headers.update(addheaders)
-    method = 'GET'
-    kwargs = dict(headers=headers)
-    if user and password:
-        kwargs['auth'] = (user, password)
-    if data:
-        kwargs['data'] = data
-        method = 'POST'
-    if proxy:
-        kwargs['proxy'] = dict(http=proxy)
-    from .configuration import get_share_file
-
-    try:
-        kwargs["verify"] = get_share_file('cacert.pem')
-    except ValueError:
-        pass
-    try:
-        response = requests.request(method, url, **kwargs)
-        return response.text, response.headers
-    except (
-        requests.exceptions.RequestException,
-        requests.exceptions.BaseHTTPError,
-    ) as msg:
-        log.warn(
-            LOG_CHECK,
-            ("Could not get content of URL %(url)s: %(msg)s.")
-            % {"url": url, "msg": str(msg)},
-        )
-        return None, str(msg)
-
-
-def shorten_duplicate_content_url(url):
-    """Remove anchor part and trailing index.html from URL."""
-    if '#' in url:
-        url = url.split('#', 1)[0]
-    if url.endswith('index.html'):
-        return url[:-10]
-    if url.endswith('index.htm'):
-        return url[:-9]
-    return url
-
-
-def is_duplicate_content_url(url1, url2):
-    """Check if both URLs are allowed to point to the same content."""
-    if url1 == url2:
-        return True
-    if url2 in url1:
-        url1 = shorten_duplicate_content_url(url1)
-        if not url2.endswith('/') and url1.endswith('/'):
-            url2 += '/'
-        return url1 == url2
-    if url1 in url2:
-        url2 = shorten_duplicate_content_url(url2)
-        if not url1.endswith('/') and url2.endswith('/'):
-            url1 += '/'
-        return url1 == url2
-    return False
--- a/tests/test_url.py
+++ b/tests/test_url.py
@ -17,10 +17,10 @@
 """
 Test url routines.
 """
-from . import need_network, need_posix, need_windows
+from . import need_posix, need_windows
 import unittest
 import os
-import re
+
 import linkcheck.url

 # 'ftp://user:pass@ftp.foo.net/foo/bar':
@ -67,16 +67,6 @@ class TestUrl(unittest.TestCase):
            linkcheck.url.url_quote(url_norm(url), encoding="iso-8859-1"), nurl
        )

-    def test_safe_patterns(self):
-        is_safe_host = linkcheck.url.is_safe_host
-        safe_host_pattern = linkcheck.url.safe_host_pattern
-        self.assertTrue(is_safe_host("example.org"))
-        self.assertTrue(is_safe_host("example.org:80"))
-        self.assertTrue(not is_safe_host("example.org:21"))
-        pat = safe_host_pattern("example.org")
-        ro = re.compile(pat)
-        self.assertTrue(ro.match("http://example.org:80/"))
-
    def test_url_quote(self):
        def url_quote(url):
            return linkcheck.url.url_quote(url, encoding="utf-8")
@ -429,18 +419,6 @@ class TestUrl(unittest.TestCase):
        nurl = "%C3%A4%C3%B6%C3%BC?:"
        self.urlnormtest(url, nurl)

-    def test_fixing(self):
-        # Test url fix method.
-        url = "http//www.example.org"
-        nurl = "http://www.example.org"
-        self.assertEqual(linkcheck.url.url_fix_common_typos(url), nurl)
-        url = "http//www.example.org"
-        nurl = "http://www.example.org"
-        self.assertEqual(linkcheck.url.url_fix_common_typos(url), nurl)
-        url = "https//www.example.org"
-        nurl = "https://www.example.org"
-        self.assertEqual(linkcheck.url.url_fix_common_typos(url), nurl)
-
    def test_valid(self):
        # Test url validity functions.
        u = "http://www.example.com"
@ -505,20 +483,6 @@ class TestUrl(unittest.TestCase):
        url = "ä.."
        self.assertRaises(UnicodeError, idna_encode, url)

-    def test_match_host(self):
-        # Test host matching.
-        match_host = linkcheck.url.match_host
-        match_url = linkcheck.url.match_url
-        self.assertTrue(not match_host("", []))
-        self.assertTrue(not match_host("", [".localhost"]))
-        self.assertTrue(not match_host("localhost", []))
-        self.assertTrue(not match_host("localhost", [".localhost"]))
-        self.assertTrue(match_host("a.localhost", [".localhost"]))
-        self.assertTrue(match_host("localhost", ["localhost"]))
-        self.assertTrue(not match_url("", []))
-        self.assertTrue(not match_url("a", []))
-        self.assertTrue(match_url("http://example.org/hulla", ["example.org"]))
-
    def test_splitparam(self):
        # Path parameter split test.
        p = [
@ -554,14 +518,6 @@ class TestUrl(unittest.TestCase):
        self.assertFalse(is_numeric_port("-1"))
        self.assertFalse(is_numeric_port("a"))

-    def test_split(self):
-        url_split = linkcheck.url.url_split
-        url_unsplit = linkcheck.url.url_unsplit
-        url = "http://example.org/whoops"
-        self.assertEqual(url_unsplit(url_split(url)), url)
-        url = "http://example.org:123/whoops"
-        self.assertEqual(url_unsplit(url_split(url)), url)
-
    def test_safe_domain(self):
        is_safe_domain = linkcheck.url.is_safe_domain
        self.assertFalse(is_safe_domain("a..example.com"))
@ -569,20 +525,6 @@ class TestUrl(unittest.TestCase):
        self.assertTrue(is_safe_domain("a-b.example.com"))
        self.assertTrue(is_safe_domain("x1.example.com"))

-    @need_network
-    def test_get_content(self):
-        linkcheck.url.get_content("http://www.debian.org/")
-
-    def test_duplicate_urls(self):
-        is_dup = linkcheck.url.is_duplicate_content_url
-        self.assertTrue(is_dup("http://example.org", "http://example.org"))
-        self.assertTrue(is_dup("http://example.org/", "http://example.org"))
-        self.assertTrue(is_dup("http://example.org", "http://example.org/"))
-        self.assertTrue(is_dup("http://example.org/index.html", "http://example.org"))
-        self.assertTrue(is_dup("http://example.org", "http://example.org/index.html"))
-        self.assertTrue(is_dup("http://example.org/index.htm", "http://example.org"))
-        self.assertTrue(is_dup("http://example.org", "http://example.org/index.htm"))
-
    def test_splitport(self):
        splitport = linkcheck.url.splitport
        netloc = "hostname"