Merge pull request #408 from linkchecker/fix-timeouts

Make sure fetching robots.txt uses the configured timeout
anarcat 2020-05-22 14:29:12 -04:00 committed by GitHub
commit 2256a6e889
3 changed files with 13 additions and 6 deletions
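
Without an explicit timeout, the requests library waits indefinitely for a response, so a stalled server could hang an entire check run on its robots.txt fetch. A minimal sketch of the behaviour this fix builds on (host and timeout value are illustrative):

    import requests

    session = requests.Session()
    # With no timeout argument, session.get() can block forever on an
    # unresponsive host. Passing timeout (in seconds) bounds the wait:
    try:
        response = session.get("http://example.com/robots.txt", timeout=60)
        response.raise_for_status()
    except requests.exceptions.Timeout:
        print("robots.txt fetch timed out")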

linkcheck/cache/robots_txt.py

@@ -41,13 +41,13 @@ class RobotsTxt:
         self.roboturl_locks = {}
         self.useragent = useragent
 
-    def allows_url(self, url_data):
+    def allows_url(self, url_data, timeout=None):
         """Ask robots.txt allowance."""
         roboturl = url_data.get_robots_txt_url()
         with self.get_lock(roboturl):
-            return self._allows_url(url_data, roboturl)
+            return self._allows_url(url_data, roboturl, timeout)
 
-    def _allows_url(self, url_data, roboturl):
+    def _allows_url(self, url_data, roboturl, timeout=None):
         """Ask robots.txt allowance. Assumes only single thread per robots.txt
         URL calls this function."""
         with cache_lock:
@@ -56,7 +56,8 @@ class RobotsTxt:
                 rp = self.cache[roboturl]
                 return rp.can_fetch(self.useragent, url_data.url)
         self.misses += 1
-        kwargs = dict(auth=url_data.auth, session=url_data.session)
+        kwargs = dict(auth=url_data.auth, session=url_data.session,
+                      timeout=timeout)
         if hasattr(url_data, "proxy") and hasattr(url_data, "proxytype"):
             kwargs["proxies"] = {url_data.proxytype: url_data.proxy}
         rp = robotparser2.RobotFileParser(**kwargs)
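
The timeout is threaded through as an optional keyword argument, so existing callers of allows_url() keep working unchanged. A self-contained sketch of the same pass-through pattern, using simplified stand-in classes rather than LinkChecker's real ones:

    import requests

    class Parser:
        """Stand-in for robotparser2.RobotFileParser."""

        def __init__(self, url, session, timeout=None):
            self.url, self.session, self.timeout = url, session, timeout

        def read(self):
            kwargs = {}
            if self.timeout:
                # Only forward the timeout when one was actually given.
                kwargs["timeout"] = self.timeout
            return self.session.get(self.url, **kwargs)

    class Cache:
        """Stand-in for RobotsTxt: forwards timeout down to the parser."""

        def allows_url(self, url, session, timeout=None):
            return Parser(url, session, timeout=timeout).read()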

linkcheck/checker/httpurl.py

@@ -73,7 +73,9 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         @return: True if access is granted, otherwise False
         @rtype: bool
         """
-        return not self.aggregate.config['robotstxt'] or self.aggregate.robots_txt.allows_url(self)
+        return (not self.aggregate.config['robotstxt']
+                or self.aggregate.robots_txt.allows_url(
+                    self, timeout=self.aggregate.config["timeout"]))
 
     def content_allows_robots(self):
         """

linkcheck/robotparser2.py

@@ -35,7 +35,8 @@ class RobotFileParser:
     """This class provides a set of methods to read, parse and answer
     questions about a single robots.txt file."""
 
-    def __init__(self, url='', session=None, proxies=None, auth=None):
+    def __init__(self, url='', session=None, proxies=None, auth=None,
+                 timeout=None):
         """Initialize internal entry lists and store given url and
         credentials."""
         self.set_url(url)
@@ -45,6 +46,7 @@ class RobotFileParser:
         self.session = session
         self.proxies = proxies
         self.auth = auth
+        self.timeout = timeout
         self._reset()
 
     def _reset(self):
@@ -92,6 +94,8 @@ class RobotFileParser:
             kwargs["auth"] = self.auth
         if self.proxies:
             kwargs["proxies"] = self.proxies
+        if self.timeout:
+            kwargs["timeout"] = self.timeout
         try:
            response = self.session.get(self.url, **kwargs)
            response.raise_for_status()
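
Putting the pieces together, a hedged sketch of driving the patched parser directly; the URL and timeout value are illustrative, and read()/can_fetch() follow the stdlib urllib.robotparser interface this class mirrors:

    import requests
    from linkcheck import robotparser2

    rp = robotparser2.RobotFileParser(
        url="http://example.com/robots.txt",
        session=requests.Session(),
        timeout=30,  # now forwarded to session.get() inside read()
    )
    rp.read()  # the fetch is bounded by the timeout instead of hanging
    print(rp.can_fetch("LinkChecker", "http://example.com/some/page"))

The "if self.timeout:" guard in read() keeps the old unbounded behaviour whenever the parser is constructed without a timeout.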