Remove linkcheck.checker.proxysupport

Set up the requests.Session() with the complete proxy configuration.
This fixes a problem with using an HTTP server as an HTTPS proxy and
avoids potential issues when following redirects.

Requests handles no_proxy itself, so the custom handling is no longer needed.
This commit is contained in:
Chris Mayo 2021-12-13 19:25:23 +00:00
parent 35ecb7e639
commit fe5a34c68f
10 changed files with 23 additions and 164 deletions

View file

@ -44,22 +44,20 @@ according to the URL scheme.
"7" [label="ItmsServicesUrl", shape="record", href="../code/linkcheck/linkcheck.checker.itmsservicesurl.html", target="_blank"];
"8" [label="MailtoUrl", shape="record", href="../code/linkcheck/linkcheck.checker.mailtourl.html", target="_blank"];
"9" [label="NntpUrl", shape="record", href="../code/linkcheck/linkcheck.checker.nntpurl.html", target="_blank"];
"10" [label="ProxySupport", shape="record", href="../code/linkcheck/linkcheck.checker.proxysupport.html", target="_blank"];
"11" [label="TelnetUrl", shape="record", href="../code/linkcheck/linkcheck.checker.telneturl.html", target="_blank"];
"12" [label="UnknownUrl", shape="record", href="../code/linkcheck/linkcheck.checker.unknownurl.html", target="_blank"];
"13" [label="UrlBase", shape="record", href="../code/linkcheck/linkcheck.checker.urlbase.html", target="_blank"];
"1" -> "13" [arrowhead="empty", arrowtail="none"];
"2" -> "13" [arrowhead="empty", arrowtail="none"];
"10" [label="TelnetUrl", shape="record", href="../code/linkcheck/linkcheck.checker.telneturl.html", target="_blank"];
"11" [label="UnknownUrl", shape="record", href="../code/linkcheck/linkcheck.checker.unknownurl.html", target="_blank"];
"12" [label="UrlBase", shape="record", href="../code/linkcheck/linkcheck.checker.urlbase.html", target="_blank"];
"1" -> "12" [arrowhead="empty", arrowtail="none"];
"2" -> "12" [arrowhead="empty", arrowtail="none"];
"3" -> "6" [arrowhead="empty", arrowtail="none"];
"4" -> "6" [arrowhead="empty", arrowtail="none"];
"4" -> "10" [arrowhead="empty", arrowtail="none"];
"5" -> "12" [arrowhead="empty", arrowtail="none"];
"6" -> "13" [arrowhead="empty", arrowtail="none"];
"7" -> "13" [arrowhead="empty", arrowtail="none"];
"8" -> "13" [arrowhead="empty", arrowtail="none"];
"9" -> "13" [arrowhead="empty", arrowtail="none"];
"11" -> "13" [arrowhead="empty", arrowtail="none"];
"12" -> "13" [arrowhead="empty", arrowtail="none"];
"5" -> "11" [arrowhead="empty", arrowtail="none"];
"6" -> "12" [arrowhead="empty", arrowtail="none"];
"7" -> "12" [arrowhead="empty", arrowtail="none"];
"8" -> "12" [arrowhead="empty", arrowtail="none"];
"9" -> "12" [arrowhead="empty", arrowtail="none"];
"10" -> "12" [arrowhead="empty", arrowtail="none"];
"11" -> "12" [arrowhead="empty", arrowtail="none"];
}

View file

@ -57,8 +57,6 @@ class RobotsTxt:
return rp.can_fetch(self.useragent, url_data.url)
self.misses += 1
kwargs = dict(auth=url_data.auth, session=url_data.session, timeout=timeout)
if hasattr(url_data, "proxy") and hasattr(url_data, "proxy_type"):
kwargs["proxies"] = {url_data.proxytype: url_data.proxy}
rp = robotparser2.RobotFileParser(**kwargs)
rp.set_url(roboturl)
rp.read()

View file

@ -40,7 +40,7 @@ from .. import (
LinkCheckerError,
httputil,
)
from . import internpaturl, proxysupport
from . import internpaturl
# import warnings
from .const import WARN_HTTP_EMPTY_CONTENT, WARN_URL_RATE_LIMITED
@ -52,7 +52,7 @@ HTTP_SCHEMAS = ('http://', 'https://')
nofollow_re = re.compile(r"\bnofollow\b", re.IGNORECASE)
class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
class HttpUrl(internpaturl.InternPatternUrl):
"""
Url link with http scheme.
"""
@ -131,8 +131,6 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
valid request
"""
self.session = self.aggregate.get_request_session()
# set the proxy, so a 407 status after this is an error
self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
self.construct_auth()
# check robots.txt
if not self.allows_robots(self.url):
@ -254,8 +252,6 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""Construct keyword parameters for Session.request() and
Session.resolve_redirects()."""
kwargs = dict(stream=True, timeout=self.aggregate.config["timeout"])
if self.proxy:
kwargs["proxies"] = {self.proxytype: self.proxy}
if self.scheme == "https" and self.aggregate.config["sslverify"]:
kwargs['verify'] = self.aggregate.config["sslverify"]
else:

View file

@ -1,83 +0,0 @@
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Mixin class for URLs that can be fetched over a proxy.
"""
import urllib.parse
import urllib.request
import os
from .. import LinkCheckerError, log, LOG_CHECK, url as urlutil
class ProxySupport:
    """Get support for proxying and for URLs with user:pass@host setting.

    This is a mixin: the methods below read ``self.host``, ``self.port``
    and ``self.scheme`` and call ``self.add_info()``, all of which must be
    provided by the class it is mixed into.
    """

    @staticmethod
    def _strip_proxy_auth(proxy):
        """Return *proxy* with any ``user:password@`` part removed.

        Used for log and info messages so credentials are not exposed.
        A proxy URL without credentials is returned unchanged.
        """
        parts = urllib.parse.urlsplit(proxy)
        if "@" not in parts.netloc:
            return proxy
        # Everything after the last "@" in the netloc is host[:port].
        hostport = parts.netloc.rpartition("@")[2]
        return urllib.parse.urlunsplit(parts._replace(netloc=hostport))

    def set_proxy(self, proxy):
        """Parse given proxy information and store parsed values.

        Sets ``self.proxy``, ``self.proxytype``, ``self.proxyhost`` and
        ``self.proxyport``. Only proxy URLs with an *http* or *https*
        scheme are accepted.

        @param proxy: proxy URL, or a false value for "no proxy"
        @raise LinkCheckerError: if the proxy URL has another scheme
        """
        self.proxy = proxy
        self.proxytype = "http"
        # Always define these, so reading them is safe even when no proxy
        # is configured or the proxy is bypassed for this host.
        self.proxyhost = None
        self.proxyport = None
        if not self.proxy:
            return
        proxyurl = urllib.parse.urlparse(self.proxy)
        self.proxytype = proxyurl.scheme
        if self.proxytype not in ('http', 'https'):
            # Note that invalid proxies might raise TypeError in urllib2,
            # so make sure to stop checking at this point, not later.
            msg = _(
                "Proxy value `%(proxy)s' must start with 'http:' or 'https:'."
            ) % dict(proxy=proxy)
            raise LinkCheckerError(msg)
        # Log and report the proxy without auth info.
        shown_proxy = self._strip_proxy_auth(self.proxy)
        if self.ignore_proxy_host():
            log.debug(LOG_CHECK, "ignoring proxy %r", shown_proxy)
            self.add_info(
                _("Ignoring proxy setting `%(proxy)s'.") % dict(proxy=shown_proxy)
            )
            self.proxy = None
            return
        log.debug(LOG_CHECK, "using proxy %r", shown_proxy)
        self.add_info(_("Using proxy `%(proxy)s'.") % dict(proxy=shown_proxy))
        self.proxyhost = proxyurl.hostname
        self.proxyport = proxyurl.port

    def ignore_proxy_host(self):
        """Check if self.host is in the $no_proxy ignore list.

        @return: True if the proxy must not be used for self.host
        @rtype: bool
        """
        if urllib.request.proxy_bypass(self.host):
            return True
        no_proxy = os.environ.get("no_proxy")
        if not no_proxy:
            return False
        # Additionally match "host:port" entries against this URL's
        # host and port.
        entries = (urlutil.splitport(x.strip()) for x in no_proxy.split(","))
        return any(
            host.lower() == self.host and port == self.port
            for host, port in entries
        )

    def get_netloc(self):
        """Determine scheme, host and port for this connection taking
        proxy data into account.

        @return: tuple (scheme, host, port)
        @rtype: tuple(string, string, int)
        """
        if self.proxy:
            return (self.proxytype, self.proxyhost, self.proxyport)
        return (self.scheme, self.host, self.port)

View file

@ -38,6 +38,8 @@ _downloadedbytes_lock = threading.RLock()
def new_request_session(config, cookies):
"""Create a new request session."""
session = requests.Session()
if config["proxy"]:
session.proxies.update(config["proxy"])
if cookies:
session.cookies = cookies
session.max_redirects = config["maxhttpredirects"]
@ -83,7 +85,7 @@ class Aggregate:
session = new_request_session(self.config, self.cookies)
log.debug(LOG_CHECK, "Getting login form %s", url)
kwargs = dict(timeout=self.config["timeout"])
# XXX: proxy? sslverify? can we reuse HttpUrl.get_request_kwargs()
# XXX: sslverify? can we reuse HttpUrl.get_request_kwargs()
# somehow?
response = session.get(url, **kwargs)
response.raise_for_status()

View file

@ -35,15 +35,11 @@ class RobotFileParser:
"""This class provides a set of methods to read, parse and answer
questions about a single robots.txt file."""
def __init__(self, url='', session=None, proxies=None, auth=None, timeout=None):
def __init__(self, session, url='', auth=None, timeout=None):
"""Initialize internal entry lists and store given url and
credentials."""
self.set_url(url)
if session is None:
self.session = requests.Session()
else:
self.session = session
self.proxies = proxies
self.session = session
self.auth = auth
self.timeout = timeout
self._reset()
@ -91,8 +87,6 @@ class RobotFileParser:
)
if self.auth:
kwargs["auth"] = self.auth
if self.proxies:
kwargs["proxies"] = self.proxies
if self.timeout:
kwargs["timeout"] = self.timeout
try:

View file

@ -115,7 +115,6 @@ class TestLogger(linkcheck.logger._Logger):
if (
"Last modified" not in info
and "is located in" not in info
and "Using proxy" not in info
):
self.result.append("info %s" % info)
if self.has_part("warning"):

View file

@ -1,47 +0,0 @@
# Copyright (C) 2004-2012 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Test proxy handling.
"""
from unittest.mock import patch
from . import httpserver
class TestProxy(httpserver.HttpServerTest):
    """Test no_proxy env var handling."""

    def test_noproxy(self):
        """Run the no_proxy check with proxy variables patched into
        the environment for the duration of the test."""
        proxy_env = {
            "http_proxy": "http://example.org:8877",
            "no_proxy": "localhost:%d" % self.port,
        }
        with patch.dict("os.environ", proxy_env):
            self.noproxy_test()

    def noproxy_test(self):
        """Check a local URL and expect the "Ignoring proxy" info line.

        The URL, cache key and real URL are all expected to be the
        same string.
        """
        url = self.get_url("favicon.ico")
        expected = [
            "url %s" % url,
            "cache key %s" % url,
            "real url %s" % url,
            "info Ignoring proxy setting `http://example.org:8877'.",
            "valid",
        ]
        self.direct(url, expected, recursionlevel=0)

View file

@ -21,6 +21,8 @@ import unittest
from tests import need_network
from linkcheck import configuration, robotparser2
import requests
class TestRobotParser(unittest.TestCase):
"""
@ -29,7 +31,7 @@ class TestRobotParser(unittest.TestCase):
def setUp(self):
"""Initialize self.rp as a robots.txt parser."""
self.rp = robotparser2.RobotFileParser()
self.rp = robotparser2.RobotFileParser(session=requests.Session())
def check(self, a, b):
"""Helper function comparing two results a and b."""

View file

@ -31,7 +31,7 @@ class TestRobotsTxt(unittest.TestCase):
"""
Initialize self.rp as a robots.txt parser.
"""
self.rp = linkcheck.robotparser2.RobotFileParser()
self.rp = linkcheck.robotparser2.RobotFileParser(session=None)
def test_robotstxt(self):
lines = [