Merge pull request #408 from linkchecker/fix-timeouts

Make sure fetching robots.txt uses the configured timeout
anarcat 2020-05-22 14:29:12 -04:00 committed by GitHub
commit 2256a6e889
3 changed files with 13 additions and 6 deletions
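
Without an explicit timeout, the requests library waits indefinitely for a response, so a stalled server could hang an entire check run on its robots.txt fetch. A minimal sketch of the behaviour this fix builds on (host and timeout value are illustrative):

    import requests

    session = requests.Session()
    # With no timeout argument, session.get() can block forever on an
    # unresponsive host. Passing timeout (in seconds) bounds the wait:
    try:
        response = session.get("http://example.com/robots.txt", timeout=60)
        response.raise_for_status()
    except requests.exceptions.Timeout:
        print("robots.txt fetch timed out")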

linkcheck/cache/robots_txt.py

@@ -41,13 +41,13 @@ class RobotsTxt:
         self.roboturl_locks = {}
         self.useragent = useragent
 
-    def allows_url(self, url_data):
+    def allows_url(self, url_data, timeout=None):
         """Ask robots.txt allowance."""
         roboturl = url_data.get_robots_txt_url()
         with self.get_lock(roboturl):
-            return self._allows_url(url_data, roboturl)
+            return self._allows_url(url_data, roboturl, timeout)
 
-    def _allows_url(self, url_data, roboturl):
+    def _allows_url(self, url_data, roboturl, timeout=None):
         """Ask robots.txt allowance. Assumes only single thread per robots.txt
         URL calls this function."""
         with cache_lock:
@@ -56,7 +56,8 @@ class RobotsTxt:
                 rp = self.cache[roboturl]
                 return rp.can_fetch(self.useragent, url_data.url)
         self.misses += 1
-        kwargs = dict(auth=url_data.auth, session=url_data.session)
+        kwargs = dict(auth=url_data.auth, session=url_data.session,
+                      timeout=timeout)
         if hasattr(url_data, "proxy") and hasattr(url_data, "proxytype"):
             kwargs["proxies"] = {url_data.proxytype: url_data.proxy}
         rp = robotparser2.RobotFileParser(**kwargs)
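
The timeout is threaded through as an optional keyword argument, so existing callers of allows_url() keep working unchanged. A self-contained sketch of the same pass-through pattern, using simplified stand-in classes rather than LinkChecker's real ones:

    import requests

    class Parser:
        """Stand-in for robotparser2.RobotFileParser."""

        def __init__(self, url, session, timeout=None):
            self.url, self.session, self.timeout = url, session, timeout

        def read(self):
            kwargs = {}
            if self.timeout:
                # Only forward the timeout when one was actually given.
                kwargs["timeout"] = self.timeout
            return self.session.get(self.url, **kwargs)

    class Cache:
        """Stand-in for RobotsTxt: forwards timeout down to the parser."""

        def allows_url(self, url, session, timeout=None):
            return Parser(url, session, timeout=timeout).read()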

linkcheck/checker/httpurl.py

@@ -73,7 +73,9 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         @return: True if access is granted, otherwise False
         @rtype: bool
         """
-        return not self.aggregate.config['robotstxt'] or self.aggregate.robots_txt.allows_url(self)
+        return (not self.aggregate.config['robotstxt']
+                or self.aggregate.robots_txt.allows_url(
+                    self, timeout=self.aggregate.config["timeout"]))
 
     def content_allows_robots(self):
         """

linkcheck/robotparser2.py

@@ -35,7 +35,8 @@ class RobotFileParser:
     """This class provides a set of methods to read, parse and answer
     questions about a single robots.txt file."""
 
-    def __init__(self, url='', session=None, proxies=None, auth=None):
+    def __init__(self, url='', session=None, proxies=None, auth=None,
+                 timeout=None):
         """Initialize internal entry lists and store given url and
         credentials."""
         self.set_url(url)
@@ -45,6 +46,7 @@ class RobotFileParser:
         self.session = session
         self.proxies = proxies
         self.auth = auth
+        self.timeout = timeout
         self._reset()
 
     def _reset(self):
@@ -92,6 +94,8 @@ class RobotFileParser:
             kwargs["auth"] = self.auth
         if self.proxies:
             kwargs["proxies"] = self.proxies
+        if self.timeout:
+            kwargs["timeout"] = self.timeout
         try:
            response = self.session.get(self.url, **kwargs)
            response.raise_for_status()
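
Putting the pieces together, a hedged sketch of driving the patched parser directly; the URL and timeout value are illustrative, and read()/can_fetch() follow the stdlib urllib.robotparser interface this class mirrors:

    import requests
    from linkcheck import robotparser2

    rp = robotparser2.RobotFileParser(
        url="http://example.com/robots.txt",
        session=requests.Session(),
        timeout=30,  # now forwarded to session.get() inside read()
    )
    rp.read()  # the fetch is bounded by the timeout instead of hanging
    print(rp.can_fetch("LinkChecker", "http://example.com/some/page"))

The "if self.timeout:" guard in read() keeps the old unbounded behaviour whenever the parser is constructed without a timeout.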