Merge pull request #408 from linkchecker/fix-timeouts

Make sure fetching robots.txt uses the configured timeout.

Commit 2256a6e889

3 changed files with 13 additions and 6 deletions
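For context, the fix threads the configured timeout from the LinkChecker configuration down to the HTTP request that fetches robots.txt, so an unresponsive server can no longer stall the check. A minimal sketch of that idea, assuming a requests.Session and a timeout given in seconds (fetch_robots_txt and its arguments are illustrative, not the project's code):

# Illustrative sketch only: pass a configured timeout through to the
# request that fetches robots.txt so the fetch cannot block forever.
import requests

def fetch_robots_txt(url, session=None, timeout=None):
    """Fetch robots.txt text, honouring an optional timeout in seconds."""
    session = session or requests.Session()
    kwargs = {}
    if timeout:
        # requests raises requests.exceptions.Timeout if no response
        # arrives within this many seconds.
        kwargs["timeout"] = timeout
    response = session.get(url, **kwargs)
    response.raise_for_status()
    return response.text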
linkcheck/cache/robots_txt.py (vendored, 9 lines changed)
@@ -41,13 +41,13 @@ class RobotsTxt:
         self.roboturl_locks = {}
         self.useragent = useragent
 
-    def allows_url(self, url_data):
+    def allows_url(self, url_data, timeout=None):
         """Ask robots.txt allowance."""
         roboturl = url_data.get_robots_txt_url()
         with self.get_lock(roboturl):
-            return self._allows_url(url_data, roboturl)
+            return self._allows_url(url_data, roboturl, timeout)
 
-    def _allows_url(self, url_data, roboturl):
+    def _allows_url(self, url_data, roboturl, timeout=None):
         """Ask robots.txt allowance. Assumes only single thread per robots.txt
         URL calls this function."""
         with cache_lock:
@@ -56,7 +56,8 @@ class RobotsTxt:
                 rp = self.cache[roboturl]
                 return rp.can_fetch(self.useragent, url_data.url)
             self.misses += 1
-        kwargs = dict(auth=url_data.auth, session=url_data.session)
+        kwargs = dict(auth=url_data.auth, session=url_data.session,
+                      timeout=timeout)
         if hasattr(url_data, "proxy") and hasattr(url_data, "proxy_type"):
             kwargs["proxies"] = {url_data.proxytype: url_data.proxy}
         rp = robotparser2.RobotFileParser(**kwargs)
@@ -73,7 +73,9 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         @return: True if access is granted, otherwise False
         @rtype: bool
         """
-        return not self.aggregate.config['robotstxt'] or self.aggregate.robots_txt.allows_url(self)
+        return (not self.aggregate.config['robotstxt']
+                or self.aggregate.robots_txt.allows_url(
+                    self, timeout=self.aggregate.config["timeout"]))
 
     def content_allows_robots(self):
         """
@@ -35,7 +35,8 @@ class RobotFileParser:
     """This class provides a set of methods to read, parse and answer
     questions about a single robots.txt file."""
 
-    def __init__(self, url='', session=None, proxies=None, auth=None):
+    def __init__(self, url='', session=None, proxies=None, auth=None,
+                 timeout=None):
         """Initialize internal entry lists and store given url and
         credentials."""
         self.set_url(url)
@@ -45,6 +46,7 @@ class RobotFileParser:
         self.session = session
         self.proxies = proxies
         self.auth = auth
+        self.timeout = timeout
         self._reset()
 
     def _reset(self):
@@ -92,6 +94,8 @@ class RobotFileParser:
             kwargs["auth"] = self.auth
         if self.proxies:
             kwargs["proxies"] = self.proxies
+        if self.timeout:
+            kwargs["timeout"] = self.timeout
         try:
             response = self.session.get(self.url, **kwargs)
             response.raise_for_status()
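Taken together, the new parameter would be used roughly as in the hedged sketch below. The import path and the read() call are assumptions (read() follows the stdlib-style robotparser API this class mirrors); the URLs and user agent string are placeholders:

import requests
from linkcheck import robotparser2  # import path assumed for LinkChecker's RobotFileParser

session = requests.Session()
rp = robotparser2.RobotFileParser(
    url="https://example.com/robots.txt",
    session=session,
    timeout=60,  # seconds; passed on to session.get(..., timeout=60)
)
rp.read()        # fetch and parse robots.txt, now bounded by the timeout
allowed = rp.can_fetch("LinkChecker", "https://example.com/some/page")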