mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-30 02:54:42 +00:00
Fix useragent matching in robots.txt parser.
This commit is contained in:
parent
1c2a66ffaf
commit
f6b007f757
2 changed files with 10 additions and 6 deletions
|
|
@@ -131,6 +131,7 @@ class RobotFileParser (object):
|
|||
if ct and ct.lower().startswith("text/plain"):
|
||||
self.parse([line.strip() for line in res])
|
||||
else:
|
||||
log.debug(LOG_CHECK, "%r allow all (not text content)", self.url)
|
||||
self.allow_all = True
|
||||
finally:
|
||||
if res is not None:
|
||||
|
|
@@ -240,14 +241,16 @@ class RobotFileParser (object):
|
|||
@rtype: bool
|
||||
"""
|
||||
log.debug(LOG_CHECK, "%r check allowance for:\n" \
|
||||
" user agent: %r\n url: %r", self.url, useragent, url)
|
||||
" user agent: %r\n url: %r ...", self.url, useragent, url)
|
||||
if not isinstance(useragent, str):
|
||||
useragent = useragent.encode("ascii", "ignore")
|
||||
if not isinstance(url, str):
|
||||
url = url.encode("ascii", "ignore")
|
||||
if self.disallow_all:
|
||||
log.debug(LOG_CHECK, " ... disallow all.")
|
||||
return False
|
||||
if self.allow_all:
|
||||
log.debug(LOG_CHECK, " ... allow all.")
|
||||
return True
|
||||
# search for given user agent matches
|
||||
# the first match counts
|
||||
|
|
@@ -259,6 +262,7 @@ class RobotFileParser (object):
|
|||
if self.default_entry is not None:
|
||||
return self.default_entry.allowance(url)
|
||||
# agent not found ==> access granted
|
||||
log.debug(LOG_CHECK, " ... agent not found, allow.")
|
||||
return True
|
||||
|
||||
def get_crawldelay (self, useragent):
|
||||
|
|
@@ -345,14 +349,12 @@ class Entry (object):
|
|||
"""
|
||||
if not useragent:
|
||||
return True
|
||||
# split the name token and make it lower case unicode
|
||||
useragent = useragent.split("/")[0].lower()
|
||||
useragent = useragent.lower()
|
||||
for agent in self.useragents:
|
||||
if agent == '*':
|
||||
# we have the catch-all agent
|
||||
return True
|
||||
agent = agent.lower()
|
||||
if useragent in agent:
|
||||
if agent.lower() in useragent:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
|
@@ -369,5 +371,7 @@ class Entry (object):
|
|||
for line in self.rulelines:
|
||||
log.debug(LOG_CHECK, "%s %s %s", filename, str(line), line.allowance)
|
||||
if line.applies_to(filename):
|
||||
log.debug(LOG_CHECK, " ... rule line %s", line)
|
||||
return line.allowance
|
||||
log.debug(LOG_CHECK, " ... no rule lines of %s applied to %s; allowed.", self.useragents, filename)
|
||||
return True
|
||||
|
|
|
|||
|
|
@@ -271,7 +271,7 @@ class TestRobotsTxt (unittest.TestCase):
|
|||
|
||||
def test_access7 (self):
|
||||
lines = [
|
||||
"User-agent: Example*",
|
||||
"User-agent: Example",
|
||||
"Disallow: /example",
|
||||
"",
|
||||
"User-agent: *",
|
||||
|
|
|
|||
Loading…
Reference in a new issue