From 6c38b4165aabd28d33a2ce508344952b19742558 Mon Sep 17 00:00:00 2001
From: Bastian Kleineidam
Date: Mon, 14 Jul 2014 19:50:11 +0200
Subject: [PATCH] Use given HTTP auth data for robots.txt fetching.

---
 doc/changelog.txt | 2 ++
 linkcheck/cache/robots_txt.py | 4 +---
 linkcheck/robotparser2.py | 20 ++++++++++++--------
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/doc/changelog.txt b/doc/changelog.txt
index aeae0270..c84a080d 100644
--- a/doc/changelog.txt
+++ b/doc/changelog.txt
@@ -26,6 +26,8 @@ Fixes:
   Closes: GH bug #521
 - cgi: Sanitize configuration.
   Closes: GH bug #519
+- checking: Use user-supplied authentication when requesting robots.txt
+  since some sites are completely password protected.
 
 
 9.2 "Rick and Morty" (released 23.4.2014)
diff --git a/linkcheck/cache/robots_txt.py b/linkcheck/cache/robots_txt.py
index d8e58590..04ee8cd7 100644
--- a/linkcheck/cache/robots_txt.py
+++ b/linkcheck/cache/robots_txt.py
@@ -51,15 +51,13 @@ class RobotsTxt (object):
     def _allows_url (self, url_data, roboturl):
         """Ask robots.txt allowance.
         Assumes only single thread per robots.txt URL calls this function."""
-        user, password = url_data.get_user_password()
         with cache_lock:
             if roboturl in self.cache:
                 self.hits += 1
                 rp = self.cache[roboturl]
                 return rp.can_fetch(self.useragent, url_data.url)
             self.misses += 1
-        rp = robotparser2.RobotFileParser(proxy=url_data.proxy, user=user,
-                                          password=password)
+        rp = robotparser2.RobotFileParser(proxy=url_data.proxy, auth=url_data.auth)
         rp.set_url(roboturl)
         rp.read()
         with cache_lock:
diff --git a/linkcheck/robotparser2.py b/linkcheck/robotparser2.py
index 9455b835..f5e732db 100644
--- a/linkcheck/robotparser2.py
+++ b/linkcheck/robotparser2.py
@@ -34,13 +34,13 @@ class RobotFileParser (object):
     """This class provides a set of methods to read, parse and answer
     questions about a single robots.txt file."""
 
-    def __init__ (self, url='', proxy=None, user=None, password=None):
+    def __init__ (self, url='', proxy=None, auth=None):
         """Initialize internal entry lists and store given url and
         credentials."""
         self.set_url(url)
         self.proxy = proxy
-        self.user = user
-        self.password = password
+        # XXX proxy
+        self.auth = auth
         self._reset()
 
     def _reset (self):
@@ -77,12 +77,16 @@ class RobotFileParser (object):
     def read (self):
         """Read the robots.txt URL and feeds it to the parser."""
         self._reset()
-        headers = {
-            'User-Agent': configuration.UserAgent,
-            'Accept-Encoding': ACCEPT_ENCODING,
-        }
+        kwargs = dict(
+            headers = {
+                'User-Agent': configuration.UserAgent,
+                'Accept-Encoding': ACCEPT_ENCODING,
+            }
+        )
+        if self.auth:
+            kwargs["auth"] = self.auth
         try:
-            response = requests.get(self.url, headers=headers)
+            response = requests.get(self.url, **kwargs)
             response.raise_for_status()
             content_type = response.headers.get('content-type')
             if content_type and content_type.lower().startswith('text/plain'):