linkchecker/linkcheck/robotparser2.py
Chris Mayo ac0967e251 Fix remaining flake8 violations in linkcheck/
linkcheck/better_exchook2.py:28:89: E501 line too long (90 > 88 characters)
linkcheck/better_exchook2.py:155:9: E722 do not use bare 'except'
linkcheck/better_exchook2.py:166:9: E722 do not use bare 'except'
linkcheck/better_exchook2.py:289:13: E741 ambiguous variable name 'l'
linkcheck/better_exchook2.py:299:9: E722 do not use bare 'except'
linkcheck/containers.py:48:13: E731 do not assign a lambda expression, use a def
linkcheck/ftpparse.py:123:89: E501 line too long (93 > 88 characters)
linkcheck/loader.py:46:47: E203 whitespace before ':'
linkcheck/logconf.py:45:29: E231 missing whitespace after ','
linkcheck/robotparser2.py:157:89: E501 line too long (95 > 88 characters)
linkcheck/robotparser2.py:182:89: E501 line too long (89 > 88 characters)
linkcheck/strformat.py:181:16: E203 whitespace before ':'
linkcheck/strformat.py:181:43: E203 whitespace before ':'
linkcheck/strformat.py:253:9: E731 do not assign a lambda expression, use a def
linkcheck/strformat.py:254:9: E731 do not assign a lambda expression, use a def
linkcheck/strformat.py:341:89: E501 line too long (111 > 88 characters)
linkcheck/url.py:102:32: E203 whitespace before ':'
linkcheck/url.py:277:5: E741 ambiguous variable name 'l'
linkcheck/url.py:402:5: E741 ambiguous variable name 'l'
linkcheck/checker/__init__.py:203:1: E402 module level import not at top of file
linkcheck/checker/fileurl.py:200:89: E501 line too long (103 > 88 characters)
linkcheck/checker/mailtourl.py:122:60: E203 whitespace before ':'
linkcheck/checker/mailtourl.py:157:89: E501 line too long (96 > 88 characters)
linkcheck/checker/mailtourl.py:190:89: E501 line too long (109 > 88 characters)
linkcheck/checker/mailtourl.py:200:89: E501 line too long (111 > 88 characters)
linkcheck/checker/mailtourl.py:249:89: E501 line too long (106 > 88 characters)
linkcheck/checker/unknownurl.py:226:23: W291 trailing whitespace
linkcheck/checker/urlbase.py:245:89: E501 line too long (101 > 88 characters)
linkcheck/configuration/confparse.py:236:89: E501 line too long (186 > 88 characters)
linkcheck/configuration/confparse.py:247:89: E501 line too long (111 > 88 characters)
linkcheck/configuration/__init__.py:164:9: E266 too many leading '#' for block comment
linkcheck/configuration/__init__.py:184:9: E266 too many leading '#' for block comment
linkcheck/configuration/__init__.py:190:9: E266 too many leading '#' for block comment
linkcheck/configuration/__init__.py:195:9: E266 too many leading '#' for block comment
linkcheck/configuration/__init__.py:198:9: E266 too many leading '#' for block comment
linkcheck/configuration/__init__.py:435:89: E501 line too long (90 > 88 characters)
linkcheck/director/aggregator.py:45:43: E231 missing whitespace after ','
linkcheck/director/aggregator.py:178:89: E501 line too long (106 > 88 characters)
linkcheck/logger/__init__.py:29:1: E731 do not assign a lambda expression, use a def
linkcheck/logger/__init__.py:108:13: E741 ambiguous variable name 'l'
linkcheck/logger/__init__.py:275:19: F821 undefined name '_'
linkcheck/logger/__init__.py:342:16: F821 undefined name '_'
linkcheck/logger/__init__.py:380:13: F821 undefined name '_'
linkcheck/logger/__init__.py:384:13: F821 undefined name '_'
linkcheck/logger/__init__.py:387:13: F821 undefined name '_'
linkcheck/logger/__init__.py:396:13: F821 undefined name '_'
linkcheck/network/__init__.py:1:1: W391 blank line at end of file
linkcheck/plugins/locationinfo.py:89:9: E731 do not assign a lambda expression, use a def
linkcheck/plugins/locationinfo.py:91:9: E731 do not assign a lambda expression, use a def
linkcheck/plugins/markdowncheck.py:112:89: E501 line too long (111 > 88 characters)
linkcheck/plugins/markdowncheck.py:141:9: E741 ambiguous variable name 'l'
linkcheck/plugins/markdowncheck.py:165:23: E203 whitespace before ':'
linkcheck/plugins/viruscheck.py:95:42: E203 whitespace before ':'
2020-05-30 17:01:36 +01:00

419 lines
14 KiB
Python

# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Robots.txt parser.
The robots.txt Exclusion Protocol is implemented as specified in
http://www.robotstxt.org/wc/norobots-rfc.html
"""
import time
import urllib.parse
import requests
from . import log, LOG_CHECK, configuration
__all__ = ["RobotFileParser"]
ACCEPT_ENCODING = 'x-gzip,gzip,deflate'
class RobotFileParser:
"""This class provides a set of methods to read, parse and answer
questions about a single robots.txt file."""
def __init__(self, url='', session=None, proxies=None, auth=None, timeout=None):
"""Initialize internal entry lists and store given url and
credentials."""
self.set_url(url)
if session is None:
self.session = requests.Session()
else:
self.session = session
self.proxies = proxies
self.auth = auth
self.timeout = timeout
self._reset()
def _reset(self):
"""Reset internal flags and entry lists."""
self.entries = []
self.default_entry = None
self.disallow_all = False
self.allow_all = False
self.last_checked = 0
# list of tuples (sitemap url, line number)
self.sitemap_urls = []
self.encoding = None
def mtime(self):
"""Returns the time the robots.txt file was last fetched.
This is useful for long-running web spiders that need to
check for new robots.txt files periodically.
@return: last modified in time.time() format
@rtype: number
"""
return self.last_checked
def modified(self):
"""Set the time the robots.txt file was last fetched to the
current time."""
self.last_checked = time.time()
def set_url(self, url):
"""Set the URL referring to a robots.txt file."""
self.url = url
self.host, self.path = urllib.parse.urlparse(url)[1:3]
def read(self):
"""Read the robots.txt URL and feeds it to the parser."""
self._reset()
kwargs = dict(
headers={
'User-Agent': configuration.UserAgent,
'Accept-Encoding': ACCEPT_ENCODING,
}
)
if self.auth:
kwargs["auth"] = self.auth
if self.proxies:
kwargs["proxies"] = self.proxies
if self.timeout:
kwargs["timeout"] = self.timeout
try:
response = self.session.get(self.url, **kwargs)
response.raise_for_status()
content_type = response.headers.get('content-type')
self.encoding = response.encoding
if content_type and content_type.lower().startswith('text/plain'):
self.parse(response.iter_lines(decode_unicode=True))
else:
log.debug(LOG_CHECK, "%r allow all (no text content)", self.url)
self.allow_all = True
except requests.HTTPError as x:
if x.response.status_code in (401, 403):
self.disallow_all = True
log.debug(
LOG_CHECK,
"%r disallow all (code %d)",
self.url,
x.response.status_code,
)
else:
self.allow_all = True
log.debug(LOG_CHECK, "%r allow all (HTTP error)", self.url)
except requests.exceptions.Timeout:
raise
except requests.exceptions.RequestException:
# no network or other failure
self.allow_all = True
log.debug(LOG_CHECK, "%r allow all (request error)", self.url)
def _add_entry(self, entry):
"""Add a parsed entry to entry list.
@return: None
"""
if "*" in entry.useragents:
# the default entry is considered last
self.default_entry = entry
else:
self.entries.append(entry)
def parse(self, lines):
"""Parse the input lines from a robot.txt file.
We allow that a user-agent: line is not preceded by
one or more blank lines.
@return: None
"""
log.debug(LOG_CHECK, "%r parse lines", self.url)
state = 0
linenumber = 0
entry = Entry()
for line in lines:
line = line.strip()
linenumber += 1
if not line:
if state == 1:
log.debug(
LOG_CHECK,
"%r line %d: allow or disallow directives without any "
"user-agent line",
self.url,
linenumber,
)
entry = Entry()
state = 0
elif state == 2:
self._add_entry(entry)
entry = Entry()
state = 0
# remove optional comment and strip line
i = line.find('#')
if i >= 0:
line = line[:i]
line = line.strip()
if not line:
continue
line = line.split(':', 1)
if len(line) == 2:
line[0] = line[0].strip().lower()
line[1] = urllib.parse.unquote(line[1].strip(), self.encoding)
if line[0] == "user-agent":
if state == 2:
log.debug(
LOG_CHECK,
"%r line %d: missing blank line before "
"user-agent directive",
self.url,
linenumber,
)
self._add_entry(entry)
entry = Entry()
entry.useragents.append(line[1])
state = 1
elif line[0] == "disallow":
if state == 0:
log.debug(
LOG_CHECK,
"%r line %d: missing user-agent directive before this line",
self.url,
linenumber,
)
pass
else:
entry.rulelines.append(RuleLine(line[1], False))
state = 2
elif line[0] == "allow":
if state == 0:
log.debug(
LOG_CHECK,
"%r line %d: missing user-agent directive before this line",
self.url,
linenumber,
)
pass
else:
entry.rulelines.append(RuleLine(line[1], True))
state = 2
elif line[0] == "crawl-delay":
if state == 0:
log.debug(
LOG_CHECK,
"%r line %d: missing user-agent directive before this line",
self.url,
linenumber,
)
pass
else:
try:
entry.crawldelay = max(0, int(line[1]))
state = 2
except (ValueError, OverflowError):
log.debug(
LOG_CHECK,
"%r line %d: invalid delay number %r",
self.url,
linenumber,
line[1],
)
pass
elif line[0] == "sitemap":
# Note that sitemap URLs must be absolute according to
# http://www.sitemaps.org/protocol.html#submit_robots
# But this should be checked by the calling layer.
self.sitemap_urls.append((line[1], linenumber))
else:
log.debug(
LOG_CHECK,
"%r line %d: unknown key %r",
self.url,
linenumber,
line[0],
)
pass
else:
log.debug(
LOG_CHECK,
"%r line %d: malformed line %r",
self.url,
linenumber,
line,
)
pass
if state in (1, 2):
self.entries.append(entry)
self.modified()
log.debug(LOG_CHECK, "Parsed rules:\n%s", str(self))
def can_fetch(self, useragent, url):
"""Using the parsed robots.txt decide if useragent can fetch url.
@return: True if agent can fetch url, else False
@rtype: bool
"""
log.debug(
LOG_CHECK,
"%r check allowance for:\n user agent: %r\n url: %r ...",
self.url,
useragent,
url,
)
if not isinstance(useragent, str):
useragent = useragent.encode("ascii", "ignore")
if not isinstance(url, str):
url = url.encode("ascii", "ignore")
if self.disallow_all:
log.debug(LOG_CHECK, " ... disallow all.")
return False
if self.allow_all:
log.debug(LOG_CHECK, " ... allow all.")
return True
# search for given user agent matches
# the first match counts
url = (
urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2])
or "/"
)
for entry in self.entries:
if entry.applies_to(useragent):
return entry.allowance(url)
# try the default entry last
if self.default_entry is not None:
return self.default_entry.allowance(url)
# agent not found ==> access granted
log.debug(LOG_CHECK, " ... agent not found, allow.")
return True
def get_crawldelay(self, useragent):
"""Look for a configured crawl delay.
@return: crawl delay in seconds or zero
@rtype: integer >= 0
"""
for entry in self.entries:
if entry.applies_to(useragent):
return entry.crawldelay
return 0
def __str__(self):
"""Constructs string representation, usable as contents of a
robots.txt file.
@return: robots.txt format
@rtype: string
"""
lines = [str(entry) for entry in self.entries]
if self.default_entry is not None:
lines.append(str(self.default_entry))
return "\n\n".join(lines)
class RuleLine:
"""A rule line is a single "Allow:" (allowance==1) or "Disallow:"
(allowance==0) followed by a path.
"""
def __init__(self, path, allowance):
"""Initialize with given path and allowance info."""
if path == '' and not allowance:
# an empty value means allow all
allowance = True
path = '/'
self.path = urllib.parse.quote(path)
self.allowance = allowance
def applies_to(self, path):
"""Look if given path applies to this rule.
@return: True if pathname applies to this rule, else False
@rtype: bool
"""
return self.path == "*" or path.startswith(self.path)
def __str__(self):
"""Construct string representation in robots.txt format.
@return: robots.txt format
@rtype: string
"""
return ("Allow" if self.allowance else "Disallow") + ": " + self.path
class Entry:
"""An entry has one or more user-agents and zero or more rulelines."""
def __init__(self):
"""Initialize user agent and rule list."""
self.useragents = []
self.rulelines = []
self.crawldelay = 0
def __str__(self):
"""string representation in robots.txt format.
@return: robots.txt format
@rtype: string
"""
lines = ["User-agent: %s" % agent for agent in self.useragents]
if self.crawldelay:
lines.append("Crawl-delay: %d" % self.crawldelay)
lines.extend([str(line) for line in self.rulelines])
return "\n".join(lines)
def applies_to(self, useragent):
"""Check if this entry applies to the specified agent.
@return: True if this entry applies to the agent, else False.
@rtype: bool
"""
if not useragent:
return True
useragent = useragent.lower()
for agent in self.useragents:
if agent == '*':
# we have the catch-all agent
return True
if agent.lower() in useragent:
return True
return False
def allowance(self, filename):
"""Preconditions:
- our agent applies to this entry
- filename is URL decoded
Check if given filename is allowed to acces this entry.
@return: True if allowed, else False
@rtype: bool
"""
for line in self.rulelines:
log.debug(LOG_CHECK, "%s %s %s", filename, str(line), line.allowance)
if line.applies_to(filename):
log.debug(LOG_CHECK, " ... rule line %s", line)
return line.allowance
log.debug(
LOG_CHECK,
" ... no rule lines of %s applied to %s; allowed.",
self.useragents,
filename,
)
return True