diff --git a/doc/src/code/index.rst b/doc/src/code/index.rst index 41f9b963..8c93fbda 100644 --- a/doc/src/code/index.rst +++ b/doc/src/code/index.rst @@ -44,22 +44,20 @@ according to the URL scheme. "7" [label="ItmsServicesUrl", shape="record", href="../code/linkcheck/linkcheck.checker.itmsservicesurl.html", target="_blank"]; "8" [label="MailtoUrl", shape="record", href="../code/linkcheck/linkcheck.checker.mailtourl.html", target="_blank"]; "9" [label="NntpUrl", shape="record", href="../code/linkcheck/linkcheck.checker.nntpurl.html", target="_blank"]; - "10" [label="ProxySupport", shape="record", href="../code/linkcheck/linkcheck.checker.proxysupport.html", target="_blank"]; - "11" [label="TelnetUrl", shape="record", href="../code/linkcheck/linkcheck.checker.telneturl.html", target="_blank"]; - "12" [label="UnknownUrl", shape="record", href="../code/linkcheck/linkcheck.checker.unknownurl.html", target="_blank"]; - "13" [label="UrlBase", shape="record", href="../code/linkcheck/linkcheck.checker.urlbase.html", target="_blank"]; - "1" -> "13" [arrowhead="empty", arrowtail="none"]; - "2" -> "13" [arrowhead="empty", arrowtail="none"]; + "10" [label="TelnetUrl", shape="record", href="../code/linkcheck/linkcheck.checker.telneturl.html", target="_blank"]; + "11" [label="UnknownUrl", shape="record", href="../code/linkcheck/linkcheck.checker.unknownurl.html", target="_blank"]; + "12" [label="UrlBase", shape="record", href="../code/linkcheck/linkcheck.checker.urlbase.html", target="_blank"]; + "1" -> "12" [arrowhead="empty", arrowtail="none"]; + "2" -> "12" [arrowhead="empty", arrowtail="none"]; "3" -> "6" [arrowhead="empty", arrowtail="none"]; "4" -> "6" [arrowhead="empty", arrowtail="none"]; - "4" -> "10" [arrowhead="empty", arrowtail="none"]; - "5" -> "12" [arrowhead="empty", arrowtail="none"]; - "6" -> "13" [arrowhead="empty", arrowtail="none"]; - "7" -> "13" [arrowhead="empty", arrowtail="none"]; - "8" -> "13" [arrowhead="empty", arrowtail="none"]; - "9" -> "13" [arrowhead="empty", arrowtail="none"]; - "11" -> "13" [arrowhead="empty", arrowtail="none"]; - "12" -> "13" [arrowhead="empty", arrowtail="none"]; + "5" -> "11" [arrowhead="empty", arrowtail="none"]; + "6" -> "12" [arrowhead="empty", arrowtail="none"]; + "7" -> "12" [arrowhead="empty", arrowtail="none"]; + "8" -> "12" [arrowhead="empty", arrowtail="none"]; + "9" -> "12" [arrowhead="empty", arrowtail="none"]; + "10" -> "12" [arrowhead="empty", arrowtail="none"]; + "11" -> "12" [arrowhead="empty", arrowtail="none"]; } diff --git a/linkcheck/cache/robots_txt.py b/linkcheck/cache/robots_txt.py index 5d2083f1..a6c90399 100644 --- a/linkcheck/cache/robots_txt.py +++ b/linkcheck/cache/robots_txt.py @@ -57,8 +57,6 @@ class RobotsTxt: return rp.can_fetch(self.useragent, url_data.url) self.misses += 1 kwargs = dict(auth=url_data.auth, session=url_data.session, timeout=timeout) - if hasattr(url_data, "proxy") and hasattr(url_data, "proxy_type"): - kwargs["proxies"] = {url_data.proxytype: url_data.proxy} rp = robotparser2.RobotFileParser(**kwargs) rp.set_url(roboturl) rp.read() diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index 19ecf8f1..585f5a3b 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -40,7 +40,7 @@ from .. import ( LinkCheckerError, httputil, ) -from . import internpaturl, proxysupport +from . import internpaturl # import warnings from .const import WARN_HTTP_EMPTY_CONTENT, WARN_URL_RATE_LIMITED @@ -52,7 +52,7 @@ HTTP_SCHEMAS = ('http://', 'https://') nofollow_re = re.compile(r"\bnofollow\b", re.IGNORECASE) -class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): +class HttpUrl(internpaturl.InternPatternUrl): """ Url link with http scheme. """ @@ -131,8 +131,6 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): valid request """ self.session = self.aggregate.get_request_session() - # set the proxy, so a 407 status after this is an error - self.set_proxy(self.aggregate.config["proxy"].get(self.scheme)) self.construct_auth() # check robots.txt if not self.allows_robots(self.url): @@ -254,8 +252,6 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): """Construct keyword parameters for Session.request() and Session.resolve_redirects().""" kwargs = dict(stream=True, timeout=self.aggregate.config["timeout"]) - if self.proxy: - kwargs["proxies"] = {self.proxytype: self.proxy} if self.scheme == "https" and self.aggregate.config["sslverify"]: kwargs['verify'] = self.aggregate.config["sslverify"] else: diff --git a/linkcheck/checker/proxysupport.py b/linkcheck/checker/proxysupport.py deleted file mode 100644 index b454ed6c..00000000 --- a/linkcheck/checker/proxysupport.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (C) 2000-2014 Bastian Kleineidam -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -""" -Mixin class for URLs that can be fetched over a proxy. -""" -import urllib.parse -import urllib.request -import os -from .. import LinkCheckerError, log, LOG_CHECK, url as urlutil - - -class ProxySupport: - """Get support for proxying and for URLs with user:pass@host setting.""" - - def set_proxy(self, proxy): - """Parse given proxy information and store parsed values. - Note that only *http://* proxies are supported, both for *ftp://* - and *http://* URLs. - """ - self.proxy = proxy - self.proxytype = "http" - if not self.proxy: - return - proxyurl = urllib.parse.urlparse(self.proxy) - self.proxytype = proxyurl.scheme - if self.proxytype not in ('http', 'https'): - # Note that invalid proxies might raise TypeError in urllib2, - # so make sure to stop checking at this point, not later. - msg = _( - "Proxy value `%(proxy)s' must start with 'http:' or 'https:'." - ) % dict(proxy=proxy) - raise LinkCheckerError(msg) - if self.ignore_proxy_host(): - # log proxy without auth info - log.debug(LOG_CHECK, "ignoring proxy %r", self.proxy) - self.add_info(_("Ignoring proxy setting `%(proxy)s'.") % dict(proxy=proxy)) - self.proxy = None - return - log.debug(LOG_CHECK, "using proxy %r", self.proxy) - self.add_info(_("Using proxy `%(proxy)s'.") % dict(proxy=self.proxy)) - self.proxyhost = proxyurl.hostname - self.proxyport = proxyurl.port - - def ignore_proxy_host(self): - """Check if self.host is in the $no_proxy ignore list.""" - if urllib.request.proxy_bypass(self.host): - return True - no_proxy = os.environ.get("no_proxy") - if no_proxy: - entries = [urlutil.splitport(x.strip()) for x in no_proxy.split(",")] - for host, port in entries: - if host.lower() == self.host and port == self.port: - return True - return False - - def get_netloc(self): - """Determine scheme, host and port for this connection taking - proxy data into account. - @return: tuple (scheme, host, port) - @rtype: tuple(string, string, int) - """ - if self.proxy: - scheme = self.proxytype - host = self.proxyhost - port = self.proxyport - else: - scheme = self.scheme - host = self.host - port = self.port - return (scheme, host, port) diff --git a/linkcheck/director/aggregator.py b/linkcheck/director/aggregator.py index a9125696..31e88754 100644 --- a/linkcheck/director/aggregator.py +++ b/linkcheck/director/aggregator.py @@ -38,6 +38,8 @@ _downloadedbytes_lock = threading.RLock() def new_request_session(config, cookies): """Create a new request session.""" session = requests.Session() + if config["proxy"]: + session.proxies.update(config["proxy"]) if cookies: session.cookies = cookies session.max_redirects = config["maxhttpredirects"] @@ -83,7 +85,7 @@ class Aggregate: session = new_request_session(self.config, self.cookies) log.debug(LOG_CHECK, "Getting login form %s", url) kwargs = dict(timeout=self.config["timeout"]) - # XXX: proxy? sslverify? can we reuse HttpUrl.get_request_kwargs() + # XXX: sslverify? can we reuse HttpUrl.get_request_kwargs() # somehow? response = session.get(url, **kwargs) response.raise_for_status() diff --git a/linkcheck/robotparser2.py b/linkcheck/robotparser2.py index 59974ada..53d19484 100644 --- a/linkcheck/robotparser2.py +++ b/linkcheck/robotparser2.py @@ -35,15 +35,11 @@ class RobotFileParser: """This class provides a set of methods to read, parse and answer questions about a single robots.txt file.""" - def __init__(self, url='', session=None, proxies=None, auth=None, timeout=None): + def __init__(self, session, url='', auth=None, timeout=None): """Initialize internal entry lists and store given url and credentials.""" self.set_url(url) - if session is None: - self.session = requests.Session() - else: - self.session = session - self.proxies = proxies + self.session = session self.auth = auth self.timeout = timeout self._reset() @@ -91,8 +87,6 @@ class RobotFileParser: ) if self.auth: kwargs["auth"] = self.auth - if self.proxies: - kwargs["proxies"] = self.proxies if self.timeout: kwargs["timeout"] = self.timeout try: diff --git a/tests/checker/__init__.py b/tests/checker/__init__.py index adf68ef1..d22ed4ac 100644 --- a/tests/checker/__init__.py +++ b/tests/checker/__init__.py @@ -115,7 +115,6 @@ class TestLogger(linkcheck.logger._Logger): if ( "Last modified" not in info and "is located in" not in info - and "Using proxy" not in info ): self.result.append("info %s" % info) if self.has_part("warning"): diff --git a/tests/checker/test_noproxy.py b/tests/checker/test_noproxy.py deleted file mode 100644 index 4b766339..00000000 --- a/tests/checker/test_noproxy.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (C) 2004-2012 Bastian Kleineidam -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -""" -Test proxy handling. -""" - -from unittest.mock import patch - -from . import httpserver - - -class TestProxy(httpserver.HttpServerTest): - """Test no_proxy env var handling.""" - - def test_noproxy(self): - with patch.dict("os.environ", - { - "http_proxy": "http://example.org:8877", - "no_proxy": "localhost:%d" % self.port, - }): - self.noproxy_test() - - def noproxy_test(self): - # Test setting proxy and no_proxy env variable. - url = self.get_url("favicon.ico") - nurl = url - resultlines = [ - "url %s" % url, - "cache key %s" % nurl, - "real url %s" % nurl, - "info Ignoring proxy setting `http://example.org:8877'.", - "valid", - ] - self.direct(url, resultlines, recursionlevel=0) diff --git a/tests/test_robotparser.py b/tests/test_robotparser.py index a53f594b..bebf3afd 100644 --- a/tests/test_robotparser.py +++ b/tests/test_robotparser.py @@ -21,6 +21,8 @@ import unittest from tests import need_network from linkcheck import configuration, robotparser2 +import requests + class TestRobotParser(unittest.TestCase): """ @@ -29,7 +31,7 @@ class TestRobotParser(unittest.TestCase): def setUp(self): """Initialize self.rp as a robots.txt parser.""" - self.rp = robotparser2.RobotFileParser() + self.rp = robotparser2.RobotFileParser(session=requests.Session()) def check(self, a, b): """Helper function comparing two results a and b.""" diff --git a/tests/test_robotstxt.py b/tests/test_robotstxt.py index 4b1e81e1..8aca9849 100644 --- a/tests/test_robotstxt.py +++ b/tests/test_robotstxt.py @@ -31,7 +31,7 @@ class TestRobotsTxt(unittest.TestCase): """ Initialize self.rp as a robots.txt parser. """ - self.rp = linkcheck.robotparser2.RobotFileParser() + self.rp = linkcheck.robotparser2.RobotFileParser(session=None) def test_robotstxt(self): lines = [