Use given HTTP auth data for robots.txt fetching.
parent 7838521b6e
commit 6c38b4165a

3 changed files with 15 additions and 11 deletions
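Why the change matters: on a completely password-protected site an anonymous robots.txt request is rejected (typically with 401), so the checker could never consult the file there. A minimal sketch of the failure and the fix, assuming HTTP Basic auth; the URL and credentials are illustrative only:

    import requests

    url = "https://protected.example.com/robots.txt"

    # Anonymous fetch: a fully protected site typically answers 401.
    response = requests.get(url)
    print(response.status_code)

    # Retry with the credentials the user already supplied for the site;
    # requests accepts a (user, password) tuple for Basic auth.
    response = requests.get(url, auth=("user", "secret"))
    response.raise_for_status()
    print(response.text)  # the robots.txt body, now readable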
2 doc/changelog.txt
@@ -26,6 +26,8 @@ Fixes:
   Closes: GH bug #521
 - cgi: Sanitize configuration.
   Closes: GH bug #519
+- checking: Use user-supplied authentication when requesting robots.txt since
+  some sites are completely password protected.


 9.2 "Rick and Morty" (released 23.4.2014)
4 linkcheck/cache/robots_txt.py vendored
@@ -51,15 +51,13 @@ class RobotsTxt (object):
     def _allows_url (self, url_data, roboturl):
         """Ask robots.txt allowance. Assumes only single thread per robots.txt
         URL calls this function."""
-        user, password = url_data.get_user_password()
         with cache_lock:
             if roboturl in self.cache:
                 self.hits += 1
                 rp = self.cache[roboturl]
                 return rp.can_fetch(self.useragent, url_data.url)
             self.misses += 1
-        rp = robotparser2.RobotFileParser(proxy=url_data.proxy, user=user,
-                                          password=password)
+        rp = robotparser2.RobotFileParser(proxy=url_data.proxy, auth=url_data.auth)
         rp.set_url(roboturl)
         rp.read()
         with cache_lock:
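Seen from the call site, the parser now takes a single requests-compatible auth value instead of an unpacked user/password pair. A hypothetical usage sketch; the import path and credentials are assumptions, not part of the patch:

    from requests.auth import HTTPBasicAuth

    from linkcheck import robotparser2  # assumed import path

    # One auth object replaces the old user=/password= arguments;
    # a plain ("user", "secret") tuple would work as well.
    rp = robotparser2.RobotFileParser(auth=HTTPBasicAuth("user", "secret"))
    rp.set_url("https://protected.example.com/robots.txt")
    rp.read()
    print(rp.can_fetch("LinkChecker", "https://protected.example.com/page.html"))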
20 linkcheck/robotparser2.py
@@ -34,13 +34,13 @@ class RobotFileParser (object):
     """This class provides a set of methods to read, parse and answer
     questions about a single robots.txt file."""

-    def __init__ (self, url='', proxy=None, user=None, password=None):
+    def __init__ (self, url='', proxy=None, auth=None):
         """Initialize internal entry lists and store given url and
         credentials."""
         self.set_url(url)
         self.proxy = proxy
-        self.user = user
-        self.password = password
+        # XXX proxy
+        self.auth = auth
         self._reset()

     def _reset (self):
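Since self.auth is later passed to requests unchanged, anything requests understands as an auth value can be stored here. A few illustrative possibilities, none of them mandated by the patch:

    from requests.auth import HTTPBasicAuth, HTTPDigestAuth

    auth = ("user", "secret")                # tuple shorthand for Basic auth
    auth = HTTPBasicAuth("user", "secret")   # explicit Basic auth
    auth = HTTPDigestAuth("user", "secret")  # Digest auth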
@@ -77,12 +77,16 @@ class RobotFileParser (object):
     def read (self):
         """Reads the robots.txt URL and feeds it to the parser."""
         self._reset()
-        headers = {
-            'User-Agent': configuration.UserAgent,
-            'Accept-Encoding': ACCEPT_ENCODING,
-        }
+        kwargs = dict(
+            headers = {
+                'User-Agent': configuration.UserAgent,
+                'Accept-Encoding': ACCEPT_ENCODING,
+            }
+        )
+        if self.auth:
+            kwargs["auth"] = self.auth
         try:
-            response = requests.get(self.url, headers=headers)
+            response = requests.get(self.url, **kwargs)
             response.raise_for_status()
             content_type = response.headers.get('content-type')
             if content_type and content_type.lower().startswith('text/plain'):
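Pulled out of the class, the new read() logic amounts to the following self-contained sketch; the user agent string and ACCEPT_ENCODING value are stand-ins for LinkChecker's configuration constants:

    import requests

    USER_AGENT = "LinkChecker"         # stand-in for configuration.UserAgent
    ACCEPT_ENCODING = "gzip, deflate"  # stand-in for linkcheck's constant

    def read_robots_txt(url, auth=None):
        # Add auth to the request only when credentials were supplied,
        # so anonymous fetches behave exactly as before.
        kwargs = dict(headers={
            "User-Agent": USER_AGENT,
            "Accept-Encoding": ACCEPT_ENCODING,
        })
        if auth:
            kwargs["auth"] = auth
        response = requests.get(url, **kwargs)
        response.raise_for_status()
        content_type = response.headers.get("content-type")
        if content_type and content_type.lower().startswith("text/plain"):
            return response.text
        return None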