Use given HTTP auth data for robots.txt fetching.

This commit is contained in:
Bastian Kleineidam 2014-07-14 19:50:11 +02:00
parent 7838521b6e
commit 6c38b4165a
3 changed files with 15 additions and 11 deletions

View file

@ -26,6 +26,8 @@ Fixes:
Closes: GH bug #521
- cgi: Sanitize configuration.
Closes: GH bug #519
- checking: Use user-supplied authentication when requesting robots.txt since
some sites are completely password protected.
9.2 "Rick and Morty" (released 23.4.2014)

View file

@ -51,15 +51,13 @@ class RobotsTxt (object):
def _allows_url (self, url_data, roboturl):
"""Ask robots.txt allowance. Assumes only single thread per robots.txt
URL calls this function."""
user, password = url_data.get_user_password()
with cache_lock:
if roboturl in self.cache:
self.hits += 1
rp = self.cache[roboturl]
return rp.can_fetch(self.useragent, url_data.url)
self.misses += 1
rp = robotparser2.RobotFileParser(proxy=url_data.proxy, user=user,
password=password)
rp = robotparser2.RobotFileParser(proxy=url_data.proxy, auth=url_data.auth)
rp.set_url(roboturl)
rp.read()
with cache_lock:

View file

@ -34,13 +34,13 @@ class RobotFileParser (object):
"""This class provides a set of methods to read, parse and answer
questions about a single robots.txt file."""
def __init__ (self, url='', proxy=None, user=None, password=None):
def __init__ (self, url='', proxy=None, auth=None):
"""Initialize internal entry lists and store given url and
credentials."""
self.set_url(url)
self.proxy = proxy
self.user = user
self.password = password
# XXX proxy
self.auth = auth
self._reset()
def _reset (self):
@ -77,12 +77,16 @@ class RobotFileParser (object):
def read (self):
"""Read the robots.txt URL and feeds it to the parser."""
self._reset()
headers = {
'User-Agent': configuration.UserAgent,
'Accept-Encoding': ACCEPT_ENCODING,
}
kwargs = dict(
headers = {
'User-Agent': configuration.UserAgent,
'Accept-Encoding': ACCEPT_ENCODING,
}
)
if self.auth:
kwargs["auth"] = self.auth
try:
response = requests.get(self.url, headers=headers)
response = requests.get(self.url, **kwargs)
response.raise_for_status()
content_type = response.headers.get('content-type')
if content_type and content_type.lower().startswith('text/plain'):