Use given HTTP auth data for robots.txt fetching.

This commit is contained in:
Bastian Kleineidam 2014-07-14 19:50:11 +02:00
parent 7838521b6e
commit 6c38b4165a
3 changed files with 15 additions and 11 deletions

View file

@ -26,6 +26,8 @@ Fixes:
Closes: GH bug #521
- cgi: Sanitize configuration.
Closes: GH bug #519
- checking: Use user-supplied authentication when requesting robots.txt since
some sites are completely password protected.
9.2 "Rick and Morty" (released 23.4.2014)

View file

@ -51,15 +51,13 @@ class RobotsTxt (object):
def _allows_url (self, url_data, roboturl):
"""Ask robots.txt allowance. Assumes only single thread per robots.txt
URL calls this function."""
user, password = url_data.get_user_password()
with cache_lock:
if roboturl in self.cache:
self.hits += 1
rp = self.cache[roboturl]
return rp.can_fetch(self.useragent, url_data.url)
self.misses += 1
rp = robotparser2.RobotFileParser(proxy=url_data.proxy, user=user,
password=password)
rp = robotparser2.RobotFileParser(proxy=url_data.proxy, auth=url_data.auth)
rp.set_url(roboturl)
rp.read()
with cache_lock:

View file

@ -34,13 +34,13 @@ class RobotFileParser (object):
"""This class provides a set of methods to read, parse and answer
questions about a single robots.txt file."""
def __init__ (self, url='', proxy=None, user=None, password=None):
def __init__ (self, url='', proxy=None, auth=None):
"""Initialize internal entry lists and store given url and
credentials."""
self.set_url(url)
self.proxy = proxy
self.user = user
self.password = password
# XXX proxy
self.auth = auth
self._reset()
def _reset (self):
@ -77,12 +77,16 @@ class RobotFileParser (object):
def read (self):
"""Read the robots.txt URL and feeds it to the parser."""
self._reset()
headers = {
'User-Agent': configuration.UserAgent,
'Accept-Encoding': ACCEPT_ENCODING,
}
kwargs = dict(
headers = {
'User-Agent': configuration.UserAgent,
'Accept-Encoding': ACCEPT_ENCODING,
}
)
if self.auth:
kwargs["auth"] = self.auth
try:
response = requests.get(self.url, headers=headers)
response = requests.get(self.url, **kwargs)
response.raise_for_status()
content_type = response.headers.get('content-type')
if content_type and content_type.lower().startswith('text/plain'):