Remove linkcheck.checker.proxysupport

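Set up the requests.Session() with the complete proxy configuration, instead of passing a single proxy with each request. This fixes a problem with using an HTTP server as an HTTPS proxy, as well as potential issues when following redirects. Requests handles no_proxy itself, so the custom check in ProxySupport is no longer needed.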
2026-05-28 15:48:16 +00:00 · 2021-12-13 19:25:23 +00:00 · 2021-12-13 19:25:23 +00:00 · fe5a34c68f
commit fe5a34c68f
parent 35ecb7e639
10 changed files with 23 additions and 164 deletions
--- a/doc/src/code/index.rst
+++ b/doc/src/code/index.rst
@@ -44,22 +44,20 @@ according to the URL scheme.
   "7" [label="ItmsServicesUrl", shape="record", href="../code/linkcheck/linkcheck.checker.itmsservicesurl.html", target="_blank"];
   "8" [label="MailtoUrl", shape="record", href="../code/linkcheck/linkcheck.checker.mailtourl.html", target="_blank"];
   "9" [label="NntpUrl", shape="record", href="../code/linkcheck/linkcheck.checker.nntpurl.html", target="_blank"];
-   "10" [label="ProxySupport", shape="record", href="../code/linkcheck/linkcheck.checker.proxysupport.html", target="_blank"];
-   "11" [label="TelnetUrl", shape="record", href="../code/linkcheck/linkcheck.checker.telneturl.html", target="_blank"];
-   "12" [label="UnknownUrl", shape="record", href="../code/linkcheck/linkcheck.checker.unknownurl.html", target="_blank"];
-   "13" [label="UrlBase", shape="record", href="../code/linkcheck/linkcheck.checker.urlbase.html", target="_blank"];
-   "1" -> "13" [arrowhead="empty", arrowtail="none"];
-   "2" -> "13" [arrowhead="empty", arrowtail="none"];
+   "10" [label="TelnetUrl", shape="record", href="../code/linkcheck/linkcheck.checker.telneturl.html", target="_blank"];
+   "11" [label="UnknownUrl", shape="record", href="../code/linkcheck/linkcheck.checker.unknownurl.html", target="_blank"];
+   "12" [label="UrlBase", shape="record", href="../code/linkcheck/linkcheck.checker.urlbase.html", target="_blank"];
+   "1" -> "12" [arrowhead="empty", arrowtail="none"];
+   "2" -> "12" [arrowhead="empty", arrowtail="none"];
   "3" -> "6" [arrowhead="empty", arrowtail="none"];
   "4" -> "6" [arrowhead="empty", arrowtail="none"];
-   "4" -> "10" [arrowhead="empty", arrowtail="none"];
-   "5" -> "12" [arrowhead="empty", arrowtail="none"];
-   "6" -> "13" [arrowhead="empty", arrowtail="none"];
-   "7" -> "13" [arrowhead="empty", arrowtail="none"];
-   "8" -> "13" [arrowhead="empty", arrowtail="none"];
-   "9" -> "13" [arrowhead="empty", arrowtail="none"];
-   "11" -> "13" [arrowhead="empty", arrowtail="none"];
-   "12" -> "13" [arrowhead="empty", arrowtail="none"];
+   "5" -> "11" [arrowhead="empty", arrowtail="none"];
+   "6" -> "12" [arrowhead="empty", arrowtail="none"];
+   "7" -> "12" [arrowhead="empty", arrowtail="none"];
+   "8" -> "12" [arrowhead="empty", arrowtail="none"];
+   "9" -> "12" [arrowhead="empty", arrowtail="none"];
+   "10" -> "12" [arrowhead="empty", arrowtail="none"];
+   "11" -> "12" [arrowhead="empty", arrowtail="none"];
   }


--- a/linkcheck/cache/robots_txt.py
+++ b/linkcheck/cache/robots_txt.py
@@ -57,8 +57,6 @@ class RobotsTxt:
                return rp.can_fetch(self.useragent, url_data.url)
            self.misses += 1
        kwargs = dict(auth=url_data.auth, session=url_data.session, timeout=timeout)
-        if hasattr(url_data, "proxy") and hasattr(url_data, "proxy_type"):
-            kwargs["proxies"] = {url_data.proxytype: url_data.proxy}
        rp = robotparser2.RobotFileParser(**kwargs)
        rp.set_url(roboturl)
        rp.read()
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -40,7 +40,7 @@ from .. import (
    LinkCheckerError,
    httputil,
 )
-from . import internpaturl, proxysupport
+from . import internpaturl

 # import warnings
 from .const import WARN_HTTP_EMPTY_CONTENT, WARN_URL_RATE_LIMITED
@@ -52,7 +52,7 @@ HTTP_SCHEMAS = ('http://', 'https://')
 nofollow_re = re.compile(r"\bnofollow\b", re.IGNORECASE)


-class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
+class HttpUrl(internpaturl.InternPatternUrl):
    """
    Url link with http scheme.
    """
@@ -131,8 +131,6 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
            valid request
        """
        self.session = self.aggregate.get_request_session()
-        # set the proxy, so a 407 status after this is an error
-        self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
        self.construct_auth()
        # check robots.txt
        if not self.allows_robots(self.url):
@@ -254,8 +252,6 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
        """Construct keyword parameters for Session.request() and
        Session.resolve_redirects()."""
        kwargs = dict(stream=True, timeout=self.aggregate.config["timeout"])
-        if self.proxy:
-            kwargs["proxies"] = {self.proxytype: self.proxy}
        if self.scheme == "https" and self.aggregate.config["sslverify"]:
            kwargs['verify'] = self.aggregate.config["sslverify"]
        else:
--- a/linkcheck/checker/proxysupport.py
+++ b/linkcheck/checker/proxysupport.py
@@ -1,83 +0,0 @@
-# Copyright (C) 2000-2014 Bastian Kleineidam
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License along
-# with this program; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-"""
-Mixin class for URLs that can be fetched over a proxy.
-"""
-import urllib.parse
-import urllib.request
-import os
-from .. import LinkCheckerError, log, LOG_CHECK, url as urlutil
-
-
-class ProxySupport:
-    """Get support for proxying and for URLs with user:pass@host setting."""
-
-    def set_proxy(self, proxy):
-        """Parse given proxy information and store parsed values.
-        Note that only *http://* proxies are supported, both for *ftp://*
-        and *http://* URLs.
-        """
-        self.proxy = proxy
-        self.proxytype = "http"
-        if not self.proxy:
-            return
-        proxyurl = urllib.parse.urlparse(self.proxy)
-        self.proxytype = proxyurl.scheme
-        if self.proxytype not in ('http', 'https'):
-            # Note that invalid proxies might raise TypeError in urllib2,
-            # so make sure to stop checking at this point, not later.
-            msg = _(
-                "Proxy value `%(proxy)s' must start with 'http:' or 'https:'."
-            ) % dict(proxy=proxy)
-            raise LinkCheckerError(msg)
-        if self.ignore_proxy_host():
-            # log proxy without auth info
-            log.debug(LOG_CHECK, "ignoring proxy %r", self.proxy)
-            self.add_info(_("Ignoring proxy setting `%(proxy)s'.") % dict(proxy=proxy))
-            self.proxy = None
-            return
-        log.debug(LOG_CHECK, "using proxy %r", self.proxy)
-        self.add_info(_("Using proxy `%(proxy)s'.") % dict(proxy=self.proxy))
-        self.proxyhost = proxyurl.hostname
-        self.proxyport = proxyurl.port
-
-    def ignore_proxy_host(self):
-        """Check if self.host is in the $no_proxy ignore list."""
-        if urllib.request.proxy_bypass(self.host):
-            return True
-        no_proxy = os.environ.get("no_proxy")
-        if no_proxy:
-            entries = [urlutil.splitport(x.strip()) for x in no_proxy.split(",")]
-            for host, port in entries:
-                if host.lower() == self.host and port == self.port:
-                    return True
-        return False
-
-    def get_netloc(self):
-        """Determine scheme, host and port for this connection taking
-        proxy data into account.
-        @return: tuple (scheme, host, port)
-        @rtype: tuple(string, string, int)
-        """
-        if self.proxy:
-            scheme = self.proxytype
-            host = self.proxyhost
-            port = self.proxyport
-        else:
-            scheme = self.scheme
-            host = self.host
-            port = self.port
-        return (scheme, host, port)
--- a/linkcheck/director/aggregator.py
+++ b/linkcheck/director/aggregator.py
@@ -38,6 +38,8 @@ _downloadedbytes_lock = threading.RLock()
 def new_request_session(config, cookies):
    """Create a new request session."""
    session = requests.Session()
+    if config["proxy"]:
+        session.proxies.update(config["proxy"])
    if cookies:
        session.cookies = cookies
    session.max_redirects = config["maxhttpredirects"]
@@ -83,7 +85,7 @@ class Aggregate:
        session = new_request_session(self.config, self.cookies)
        log.debug(LOG_CHECK, "Getting login form %s", url)
        kwargs = dict(timeout=self.config["timeout"])
-        # XXX: proxy?  sslverify?  can we reuse HttpUrl.get_request_kwargs()
+        # XXX: sslverify?  can we reuse HttpUrl.get_request_kwargs()
        # somehow?
        response = session.get(url, **kwargs)
        response.raise_for_status()
--- a/linkcheck/robotparser2.py
+++ b/linkcheck/robotparser2.py
@@ -35,15 +35,11 @@ class RobotFileParser:
    """This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file."""

-    def __init__(self, url='', session=None, proxies=None, auth=None, timeout=None):
+    def __init__(self, session, url='', auth=None, timeout=None):
        """Initialize internal entry lists and store given url and
        credentials."""
        self.set_url(url)
-        if session is None:
-            self.session = requests.Session()
-        else:
-            self.session = session
-        self.proxies = proxies
+        self.session = session
        self.auth = auth
        self.timeout = timeout
        self._reset()
@@ -91,8 +87,6 @@ class RobotFileParser:
        )
        if self.auth:
            kwargs["auth"] = self.auth
-        if self.proxies:
-            kwargs["proxies"] = self.proxies
        if self.timeout:
            kwargs["timeout"] = self.timeout
        try:
--- a/tests/checker/init.py
+++ b/tests/checker/init.py
@@ -115,7 +115,6 @@ class TestLogger(linkcheck.logger._Logger):
                if (
                    "Last modified" not in info
                    and "is located in" not in info
-                    and "Using proxy" not in info
                ):
                    self.result.append("info %s" % info)
        if self.has_part("warning"):
--- a/tests/checker/test_noproxy.py
+++ b/tests/checker/test_noproxy.py
@@ -1,47 +0,0 @@
-# Copyright (C) 2004-2012 Bastian Kleineidam
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License along
-# with this program; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-"""
-Test proxy handling.
-"""
-
-from unittest.mock import patch
-
-from . import httpserver
-
-
-class TestProxy(httpserver.HttpServerTest):
-    """Test no_proxy env var handling."""
-
-    def test_noproxy(self):
-        with patch.dict("os.environ",
-                        {
-                            "http_proxy": "http://example.org:8877",
-                            "no_proxy": "localhost:%d" % self.port,
-                        }):
-            self.noproxy_test()
-
-    def noproxy_test(self):
-        # Test setting proxy and no_proxy env variable.
-        url = self.get_url("favicon.ico")
-        nurl = url
-        resultlines = [
-            "url %s" % url,
-            "cache key %s" % nurl,
-            "real url %s" % nurl,
-            "info Ignoring proxy setting `http://example.org:8877'.",
-            "valid",
-        ]
-        self.direct(url, resultlines, recursionlevel=0)
--- a/tests/test_robotparser.py
+++ b/tests/test_robotparser.py
@@ -21,6 +21,8 @@ import unittest
 from tests import need_network
 from linkcheck import configuration, robotparser2

+import requests
+

 class TestRobotParser(unittest.TestCase):
    """
@@ -29,7 +31,7 @@ class TestRobotParser(unittest.TestCase):

    def setUp(self):
        """Initialize self.rp as a robots.txt parser."""
-        self.rp = robotparser2.RobotFileParser()
+        self.rp = robotparser2.RobotFileParser(session=requests.Session())

    def check(self, a, b):
        """Helper function comparing two results a and b."""
--- a/tests/test_robotstxt.py
+++ b/tests/test_robotstxt.py
@@ -31,7 +31,7 @@ class TestRobotsTxt(unittest.TestCase):
        """
        Initialize self.rp as a robots.txt parser.
        """
-        self.rp = linkcheck.robotparser2.RobotFileParser()
+        self.rp = linkcheck.robotparser2.RobotFileParser(session=None)

    def test_robotstxt(self):
        lines = [