2014-02-28 23:12:34 +00:00
|
|
|
# Copyright (C) 2000-2014 Bastian Kleineidam
|
2005-01-18 01:00:45 +00:00
|
|
|
#
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
|
|
|
# (at your option) any later version.
|
|
|
|
|
#
|
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
|
#
|
2009-07-24 21:58:20 +00:00
|
|
|
# You should have received a copy of the GNU General Public License along
|
|
|
|
|
# with this program; if not, write to the Free Software Foundation, Inc.,
|
|
|
|
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
2005-01-19 15:08:02 +00:00
|
|
|
"""
|
2005-01-19 21:11:43 +00:00
|
|
|
Robots.txt parser.
|
2005-01-19 15:08:02 +00:00
|
|
|
|
|
|
|
|
The robots.txt Exclusion Protocol is implemented as specified in
|
2021-08-12 18:28:50 +00:00
|
|
|
https://www.robotstxt.org/norobots-rfc.txt
|
2005-01-19 15:08:02 +00:00
|
|
|
"""
|
2005-07-14 17:56:05 +00:00
|
|
|
import time
|
2020-05-14 19:15:28 +00:00
|
|
|
import urllib.parse
|
2018-01-05 16:16:35 +00:00
|
|
|
|
2014-02-28 23:12:34 +00:00
|
|
|
import requests
|
2018-01-05 16:16:35 +00:00
|
|
|
|
2019-10-05 18:38:57 +00:00
|
|
|
from . import log, LOG_CHECK, configuration
|
2003-09-23 21:59:17 +00:00
|
|
|
|
|
|
|
|
__all__ = ["RobotFileParser"]
|
|
|
|
|
|
2012-09-23 13:06:44 +00:00
|
|
|
ACCEPT_ENCODING = 'x-gzip,gzip,deflate'
|
2005-02-07 12:10:17 +00:00
|
|
|
|
2018-01-05 16:16:35 +00:00
|
|
|
|
2020-04-30 18:57:47 +00:00
|
|
|
class RobotFileParser:
    """This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file."""

    def __init__(self, session, url='', auth=None, timeout=None):
        """Initialize internal entry lists and store given url and
        credentials.

        @param session: object providing a .get() method used to fetch the
            robots.txt file (e.g. a requests.Session)
        @param url: URL of the robots.txt file
        @param auth: optional authentication passed to the session request
        @param timeout: optional request timeout in seconds
        """
        self.set_url(url)
        self.session = session
        self.auth = auth
        self.timeout = timeout
        self._reset()

    def _reset(self):
        """Reset internal flags and entry lists."""
        # parsed non-default entries, in file order
        self.entries = []
        # entry whose user-agents include '*'; consulted last in can_fetch()
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        # 0 means "never fetched"; see mtime()/modified()
        self.last_checked = 0
        # list of tuples (sitemap url, line number)
        self.sitemap_urls = []
        # encoding used to unquote values while parsing; set in read()
        self.encoding = None

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        @return: last modified in time.time() format
        @rtype: number
        """
        return self.last_checked

    def modified(self):
        """Set the time the robots.txt file was last fetched to the
        current time."""
        self.last_checked = time.time()

    def set_url(self, url):
        """Set the URL referring to a robots.txt file."""
        self.url = url
        # urlparse()[1:3] is (netloc, path)
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Read the robots.txt URL and feeds it to the parser.

        On 401/403 everything is disallowed; on other HTTP errors,
        non-text content or request failures everything is allowed.
        Timeouts are re-raised to the caller.
        """
        self._reset()
        kwargs = dict(
            headers={
                'User-Agent': configuration.UserAgent,
                'Accept-Encoding': ACCEPT_ENCODING,
            }
        )
        if self.auth:
            kwargs["auth"] = self.auth
        if self.timeout:
            kwargs["timeout"] = self.timeout
        try:
            response = self.session.get(self.url, **kwargs)
            response.raise_for_status()
            log.debug(LOG_CHECK, "Robots response headers: %s", response.headers)
            content_type = response.headers.get('content-type')
            # force utf-8 decoding regardless of what the server declares
            self.encoding = response.encoding = "utf-8"
            if content_type and content_type.lower().startswith('text/plain'):
                self.parse(response.iter_lines(decode_unicode=True))
            else:
                log.debug(LOG_CHECK, "%r allow all (no text content)", self.url)
                self.allow_all = True
        except requests.HTTPError as x:
            if x.response.status_code in (401, 403):
                self.disallow_all = True
                log.debug(
                    LOG_CHECK,
                    "%r disallow all (code %d)",
                    self.url,
                    x.response.status_code,
                )
            else:
                self.allow_all = True
                log.debug(LOG_CHECK, "%r allow all (HTTP error)", self.url)
        except requests.exceptions.Timeout:
            raise
        except requests.exceptions.RequestException:
            # no network or other failure
            self.allow_all = True
            log.debug(LOG_CHECK, "%r allow all (request error)", self.url)

    def _add_entry(self, entry):
        """Add a parsed entry to entry list.

        @return: None
        """
        if "*" in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robot.txt file.
        We allow that a user-agent: line is not preceded by
        one or more blank lines.

        @return: None
        """
        log.debug(LOG_CHECK, "%r parse lines", self.url)
        # state machine: 0 = expecting a user-agent line,
        # 1 = saw user-agent line(s), 2 = saw allow/disallow/delay rule(s)
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            line = line.strip()
            linenumber += 1
            if not line:
                # a blank line terminates the current entry
                if state == 1:
                    log.debug(
                        LOG_CHECK,
                        "%r line %d: allow or disallow directives without any"
                        " user-agent line",
                        self.url,
                        linenumber,
                    )
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            # line becomes a [key, value] pair after the split
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip(), self.encoding)
                if line[0] == "user-agent":
                    if state == 2:
                        # robots.txt spec wants a blank line here; tolerate it
                        log.debug(
                            LOG_CHECK,
                            "%r line %d: missing blank line before"
                            " user-agent directive",
                            self.url,
                            linenumber,
                        )
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state == 0:
                        log.debug(
                            LOG_CHECK,
                            "%r line %d: missing user-agent directive before this line",
                            self.url,
                            linenumber,
                        )
                    else:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state == 0:
                        log.debug(
                            LOG_CHECK,
                            "%r line %d: missing user-agent directive before this line",
                            self.url,
                            linenumber,
                        )
                    else:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
                elif line[0] == "crawl-delay":
                    if state == 0:
                        log.debug(
                            LOG_CHECK,
                            "%r line %d: missing user-agent directive before this line",
                            self.url,
                            linenumber,
                        )
                    else:
                        try:
                            # negative delays are clamped to zero
                            entry.crawldelay = max(0, int(line[1]))
                            state = 2
                        except (ValueError, OverflowError):
                            log.debug(
                                LOG_CHECK,
                                "%r line %d: invalid delay number %r",
                                self.url,
                                linenumber,
                                line[1],
                            )
                elif line[0] == "sitemap":
                    # Note that sitemap URLs must be absolute according to
                    # http://www.sitemaps.org/protocol.html#submit_robots
                    # But this should be checked by the calling layer.
                    self.sitemap_urls.append((line[1], linenumber))
                else:
                    log.debug(
                        LOG_CHECK,
                        "%r line %d: unknown key %r",
                        self.url,
                        linenumber,
                        line[0],
                    )
            else:
                log.debug(
                    LOG_CHECK,
                    "%r line %d: malformed line %r",
                    self.url,
                    linenumber,
                    line,
                )
        # flush the last entry if the file did not end with a blank line
        if state in (1, 2):
            self.entries.append(entry)
        self.modified()
        log.debug(LOG_CHECK, "Parsed rules:\n%s", str(self))

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt decide if useragent can fetch url.

        @return: True if agent can fetch url, else False
        @rtype: bool
        """
        log.debug(
            LOG_CHECK,
            "%r check allowance for:\n user agent: %r\n url: %r ...",
            self.url,
            useragent,
            url,
        )
        if self.disallow_all:
            log.debug(LOG_CHECK, " ... disallow all.")
            return False
        if self.allow_all:
            log.debug(LOG_CHECK, " ... allow all.")
            return True
        # search for given user agent matches
        # the first match counts
        # normalize: unquote, take the path component, re-quote; empty -> "/"
        url = (
            urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2])
            or "/"
        )
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry is not None:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        log.debug(LOG_CHECK, " ... agent not found, allow.")
        return True

    def get_crawldelay(self, useragent):
        """Look for a configured crawl delay.

        @return: crawl delay in seconds or zero
        @rtype: integer >= 0
        """
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.crawldelay
        return 0

    def __str__(self):
        """Constructs string representation, usable as contents of a
        robots.txt file.

        @return: robots.txt format
        @rtype: string
        """
        lines = [str(entry) for entry in self.entries]
        if self.default_entry is not None:
            lines.append(str(self.default_entry))
        return "\n\n".join(lines)
|
|
|
|
|
|
|
|
|
|
|
2020-04-30 18:57:47 +00:00
|
|
|
class RuleLine:
    """A rule line is a single "Allow:" (allowance==1) or "Disallow:"
    (allowance==0) followed by a path.
    """

    def __init__(self, path, allowance):
        """Initialize with given path and allowance info."""
        if not allowance and path == '':
            # an empty value means allow all
            allowance = True
            path = '/'
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, path):
        """Look if given path applies to this rule.

        @return: True if pathname applies to this rule, else False
        @rtype: bool
        """
        if self.path == "*":
            # wildcard rule matches every path
            return True
        return path.startswith(self.path)

    def __str__(self):
        """Construct string representation in robots.txt format.

        @return: robots.txt format
        @rtype: string
        """
        keyword = "Allow" if self.allowance else "Disallow"
        return keyword + ": " + self.path
|
2003-09-23 21:59:17 +00:00
|
|
|
|
|
|
|
|
|
2020-04-30 18:57:47 +00:00
|
|
|
class Entry:
    """An entry has one or more user-agents and zero or more rulelines."""

    def __init__(self):
        """Initialize user agent and rule list."""
        self.useragents = []
        self.rulelines = []
        self.crawldelay = 0

    def __str__(self):
        """string representation in robots.txt format.

        @return: robots.txt format
        @rtype: string
        """
        parts = ["User-agent: %s" % agent for agent in self.useragents]
        if self.crawldelay:
            parts.append("Crawl-delay: %d" % self.crawldelay)
        parts.extend(str(rule) for rule in self.rulelines)
        return "\n".join(parts)

    def applies_to(self, useragent):
        """Check if this entry applies to the specified agent.

        @return: True if this entry applies to the agent, else False.
        @rtype: bool
        """
        if not useragent:
            # no agent given: every entry applies
            return True
        needle = useragent.lower()
        # '*' is the catch-all agent; otherwise match case-insensitively
        # by substring containment
        return any(
            agent == '*' or agent.lower() in needle
            for agent in self.useragents
        )

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded

        Check if given filename is allowed to access this entry.

        @return: True if allowed, else False
        @rtype: bool
        """
        for rule in self.rulelines:
            log.debug(LOG_CHECK, "%s %s %s", filename, str(rule), rule.allowance)
            if rule.applies_to(filename):
                # first matching rule decides
                log.debug(LOG_CHECK, " ... rule line %s", rule)
                return rule.allowance
        log.debug(
            LOG_CHECK,
            " ... no rule lines of %s applied to %s; allowed.",
            self.useragents,
            filename,
        )
        return True
|