mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-28 15:48:16 +00:00
Remove linkcheck.checker.proxysupport
Set up the requests.Session() with the complete proxy configuration. This fixes a problem with using an HTTP server as an HTTPS proxy, as well as potential redirection issues. Requests handles no_proxy itself, so the ProxySupport mixin is no longer needed.
This commit is contained in:
parent
35ecb7e639
commit
fe5a34c68f
10 changed files with 23 additions and 164 deletions
|
|
@ -44,22 +44,20 @@ according to the URL scheme.
|
|||
"7" [label="ItmsServicesUrl", shape="record", href="../code/linkcheck/linkcheck.checker.itmsservicesurl.html", target="_blank"];
|
||||
"8" [label="MailtoUrl", shape="record", href="../code/linkcheck/linkcheck.checker.mailtourl.html", target="_blank"];
|
||||
"9" [label="NntpUrl", shape="record", href="../code/linkcheck/linkcheck.checker.nntpurl.html", target="_blank"];
|
||||
"10" [label="ProxySupport", shape="record", href="../code/linkcheck/linkcheck.checker.proxysupport.html", target="_blank"];
|
||||
"11" [label="TelnetUrl", shape="record", href="../code/linkcheck/linkcheck.checker.telneturl.html", target="_blank"];
|
||||
"12" [label="UnknownUrl", shape="record", href="../code/linkcheck/linkcheck.checker.unknownurl.html", target="_blank"];
|
||||
"13" [label="UrlBase", shape="record", href="../code/linkcheck/linkcheck.checker.urlbase.html", target="_blank"];
|
||||
"1" -> "13" [arrowhead="empty", arrowtail="none"];
|
||||
"2" -> "13" [arrowhead="empty", arrowtail="none"];
|
||||
"10" [label="TelnetUrl", shape="record", href="../code/linkcheck/linkcheck.checker.telneturl.html", target="_blank"];
|
||||
"11" [label="UnknownUrl", shape="record", href="../code/linkcheck/linkcheck.checker.unknownurl.html", target="_blank"];
|
||||
"12" [label="UrlBase", shape="record", href="../code/linkcheck/linkcheck.checker.urlbase.html", target="_blank"];
|
||||
"1" -> "12" [arrowhead="empty", arrowtail="none"];
|
||||
"2" -> "12" [arrowhead="empty", arrowtail="none"];
|
||||
"3" -> "6" [arrowhead="empty", arrowtail="none"];
|
||||
"4" -> "6" [arrowhead="empty", arrowtail="none"];
|
||||
"4" -> "10" [arrowhead="empty", arrowtail="none"];
|
||||
"5" -> "12" [arrowhead="empty", arrowtail="none"];
|
||||
"6" -> "13" [arrowhead="empty", arrowtail="none"];
|
||||
"7" -> "13" [arrowhead="empty", arrowtail="none"];
|
||||
"8" -> "13" [arrowhead="empty", arrowtail="none"];
|
||||
"9" -> "13" [arrowhead="empty", arrowtail="none"];
|
||||
"11" -> "13" [arrowhead="empty", arrowtail="none"];
|
||||
"12" -> "13" [arrowhead="empty", arrowtail="none"];
|
||||
"5" -> "11" [arrowhead="empty", arrowtail="none"];
|
||||
"6" -> "12" [arrowhead="empty", arrowtail="none"];
|
||||
"7" -> "12" [arrowhead="empty", arrowtail="none"];
|
||||
"8" -> "12" [arrowhead="empty", arrowtail="none"];
|
||||
"9" -> "12" [arrowhead="empty", arrowtail="none"];
|
||||
"10" -> "12" [arrowhead="empty", arrowtail="none"];
|
||||
"11" -> "12" [arrowhead="empty", arrowtail="none"];
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
2
linkcheck/cache/robots_txt.py
vendored
2
linkcheck/cache/robots_txt.py
vendored
|
|
@ -57,8 +57,6 @@ class RobotsTxt:
|
|||
return rp.can_fetch(self.useragent, url_data.url)
|
||||
self.misses += 1
|
||||
kwargs = dict(auth=url_data.auth, session=url_data.session, timeout=timeout)
|
||||
if hasattr(url_data, "proxy") and hasattr(url_data, "proxy_type"):
|
||||
kwargs["proxies"] = {url_data.proxytype: url_data.proxy}
|
||||
rp = robotparser2.RobotFileParser(**kwargs)
|
||||
rp.set_url(roboturl)
|
||||
rp.read()
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@ from .. import (
|
|||
LinkCheckerError,
|
||||
httputil,
|
||||
)
|
||||
from . import internpaturl, proxysupport
|
||||
from . import internpaturl
|
||||
|
||||
# import warnings
|
||||
from .const import WARN_HTTP_EMPTY_CONTENT, WARN_URL_RATE_LIMITED
|
||||
|
|
@ -52,7 +52,7 @@ HTTP_SCHEMAS = ('http://', 'https://')
|
|||
nofollow_re = re.compile(r"\bnofollow\b", re.IGNORECASE)
|
||||
|
||||
|
||||
class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
||||
class HttpUrl(internpaturl.InternPatternUrl):
|
||||
"""
|
||||
Url link with http scheme.
|
||||
"""
|
||||
|
|
@ -131,8 +131,6 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
valid request
|
||||
"""
|
||||
self.session = self.aggregate.get_request_session()
|
||||
# set the proxy, so a 407 status after this is an error
|
||||
self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
|
||||
self.construct_auth()
|
||||
# check robots.txt
|
||||
if not self.allows_robots(self.url):
|
||||
|
|
@ -254,8 +252,6 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
"""Construct keyword parameters for Session.request() and
|
||||
Session.resolve_redirects()."""
|
||||
kwargs = dict(stream=True, timeout=self.aggregate.config["timeout"])
|
||||
if self.proxy:
|
||||
kwargs["proxies"] = {self.proxytype: self.proxy}
|
||||
if self.scheme == "https" and self.aggregate.config["sslverify"]:
|
||||
kwargs['verify'] = self.aggregate.config["sslverify"]
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -1,83 +0,0 @@
|
|||
# Copyright (C) 2000-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
Mixin class for URLs that can be fetched over a proxy.
|
||||
"""
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
import os
|
||||
from .. import LinkCheckerError, log, LOG_CHECK, url as urlutil
|
||||
|
||||
|
||||
class ProxySupport:
|
||||
"""Get support for proxying and for URLs with user:pass@host setting."""
|
||||
|
||||
def set_proxy(self, proxy):
|
||||
"""Parse given proxy information and store parsed values.
|
||||
Note that only *http://* proxies are supported, both for *ftp://*
|
||||
and *http://* URLs.
|
||||
"""
|
||||
self.proxy = proxy
|
||||
self.proxytype = "http"
|
||||
if not self.proxy:
|
||||
return
|
||||
proxyurl = urllib.parse.urlparse(self.proxy)
|
||||
self.proxytype = proxyurl.scheme
|
||||
if self.proxytype not in ('http', 'https'):
|
||||
# Note that invalid proxies might raise TypeError in urllib2,
|
||||
# so make sure to stop checking at this point, not later.
|
||||
msg = _(
|
||||
"Proxy value `%(proxy)s' must start with 'http:' or 'https:'."
|
||||
) % dict(proxy=proxy)
|
||||
raise LinkCheckerError(msg)
|
||||
if self.ignore_proxy_host():
|
||||
# log proxy without auth info
|
||||
log.debug(LOG_CHECK, "ignoring proxy %r", self.proxy)
|
||||
self.add_info(_("Ignoring proxy setting `%(proxy)s'.") % dict(proxy=proxy))
|
||||
self.proxy = None
|
||||
return
|
||||
log.debug(LOG_CHECK, "using proxy %r", self.proxy)
|
||||
self.add_info(_("Using proxy `%(proxy)s'.") % dict(proxy=self.proxy))
|
||||
self.proxyhost = proxyurl.hostname
|
||||
self.proxyport = proxyurl.port
|
||||
|
||||
def ignore_proxy_host(self):
|
||||
"""Check if self.host is in the $no_proxy ignore list."""
|
||||
if urllib.request.proxy_bypass(self.host):
|
||||
return True
|
||||
no_proxy = os.environ.get("no_proxy")
|
||||
if no_proxy:
|
||||
entries = [urlutil.splitport(x.strip()) for x in no_proxy.split(",")]
|
||||
for host, port in entries:
|
||||
if host.lower() == self.host and port == self.port:
|
||||
return True
|
||||
return False
|
||||
|
||||
def get_netloc(self):
|
||||
"""Determine scheme, host and port for this connection taking
|
||||
proxy data into account.
|
||||
@return: tuple (scheme, host, port)
|
||||
@rtype: tuple(string, string, int)
|
||||
"""
|
||||
if self.proxy:
|
||||
scheme = self.proxytype
|
||||
host = self.proxyhost
|
||||
port = self.proxyport
|
||||
else:
|
||||
scheme = self.scheme
|
||||
host = self.host
|
||||
port = self.port
|
||||
return (scheme, host, port)
|
||||
|
|
@ -38,6 +38,8 @@ _downloadedbytes_lock = threading.RLock()
|
|||
def new_request_session(config, cookies):
|
||||
"""Create a new request session."""
|
||||
session = requests.Session()
|
||||
if config["proxy"]:
|
||||
session.proxies.update(config["proxy"])
|
||||
if cookies:
|
||||
session.cookies = cookies
|
||||
session.max_redirects = config["maxhttpredirects"]
|
||||
|
|
@ -83,7 +85,7 @@ class Aggregate:
|
|||
session = new_request_session(self.config, self.cookies)
|
||||
log.debug(LOG_CHECK, "Getting login form %s", url)
|
||||
kwargs = dict(timeout=self.config["timeout"])
|
||||
# XXX: proxy? sslverify? can we reuse HttpUrl.get_request_kwargs()
|
||||
# XXX: sslverify? can we reuse HttpUrl.get_request_kwargs()
|
||||
# somehow?
|
||||
response = session.get(url, **kwargs)
|
||||
response.raise_for_status()
|
||||
|
|
|
|||
|
|
@ -35,15 +35,11 @@ class RobotFileParser:
|
|||
"""This class provides a set of methods to read, parse and answer
|
||||
questions about a single robots.txt file."""
|
||||
|
||||
def __init__(self, url='', session=None, proxies=None, auth=None, timeout=None):
|
||||
def __init__(self, session, url='', auth=None, timeout=None):
|
||||
"""Initialize internal entry lists and store given url and
|
||||
credentials."""
|
||||
self.set_url(url)
|
||||
if session is None:
|
||||
self.session = requests.Session()
|
||||
else:
|
||||
self.session = session
|
||||
self.proxies = proxies
|
||||
self.session = session
|
||||
self.auth = auth
|
||||
self.timeout = timeout
|
||||
self._reset()
|
||||
|
|
@ -91,8 +87,6 @@ class RobotFileParser:
|
|||
)
|
||||
if self.auth:
|
||||
kwargs["auth"] = self.auth
|
||||
if self.proxies:
|
||||
kwargs["proxies"] = self.proxies
|
||||
if self.timeout:
|
||||
kwargs["timeout"] = self.timeout
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -115,7 +115,6 @@ class TestLogger(linkcheck.logger._Logger):
|
|||
if (
|
||||
"Last modified" not in info
|
||||
and "is located in" not in info
|
||||
and "Using proxy" not in info
|
||||
):
|
||||
self.result.append("info %s" % info)
|
||||
if self.has_part("warning"):
|
||||
|
|
|
|||
|
|
@ -1,47 +0,0 @@
|
|||
# Copyright (C) 2004-2012 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
Test proxy handling.
|
||||
"""
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
from . import httpserver
|
||||
|
||||
|
||||
class TestProxy(httpserver.HttpServerTest):
|
||||
"""Test no_proxy env var handling."""
|
||||
|
||||
def test_noproxy(self):
|
||||
with patch.dict("os.environ",
|
||||
{
|
||||
"http_proxy": "http://example.org:8877",
|
||||
"no_proxy": "localhost:%d" % self.port,
|
||||
}):
|
||||
self.noproxy_test()
|
||||
|
||||
def noproxy_test(self):
|
||||
# Test setting proxy and no_proxy env variable.
|
||||
url = self.get_url("favicon.ico")
|
||||
nurl = url
|
||||
resultlines = [
|
||||
"url %s" % url,
|
||||
"cache key %s" % nurl,
|
||||
"real url %s" % nurl,
|
||||
"info Ignoring proxy setting `http://example.org:8877'.",
|
||||
"valid",
|
||||
]
|
||||
self.direct(url, resultlines, recursionlevel=0)
|
||||
|
|
@ -21,6 +21,8 @@ import unittest
|
|||
from tests import need_network
|
||||
from linkcheck import configuration, robotparser2
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
class TestRobotParser(unittest.TestCase):
|
||||
"""
|
||||
|
|
@ -29,7 +31,7 @@ class TestRobotParser(unittest.TestCase):
|
|||
|
||||
def setUp(self):
|
||||
"""Initialize self.rp as a robots.txt parser."""
|
||||
self.rp = robotparser2.RobotFileParser()
|
||||
self.rp = robotparser2.RobotFileParser(session=requests.Session())
|
||||
|
||||
def check(self, a, b):
|
||||
"""Helper function comparing two results a and b."""
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ class TestRobotsTxt(unittest.TestCase):
|
|||
"""
|
||||
Initialize self.rp as a robots.txt parser.
|
||||
"""
|
||||
self.rp = linkcheck.robotparser2.RobotFileParser()
|
||||
self.rp = linkcheck.robotparser2.RobotFileParser(session=None)
|
||||
|
||||
def test_robotstxt(self):
|
||||
lines = [
|
||||
|
|
|
|||
Loading…
Reference in a new issue