git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@988 e7d03fd6-7b0d-0410-9947-9c21f3af8025
calvin 2003-08-07 22:36:32 +00:00
parent 525dfdf4c6
commit e560a650cf


@@ -10,7 +10,7 @@
The robots.txt Exclusion Protocol is implemented as specified in
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
import re,urlparse,urllib
import urlparse, urllib
__all__ = ["RobotFileParser"]
@@ -21,25 +21,43 @@ def _debug(msg):
class RobotFileParser:
""" This class provides a set of methods to read, parse and answer
questions about a single robots.txt file.
"""
def __init__(self, url=''):
self.entries = []
self.default_entry = None
self.disallow_all = 0
self.allow_all = 0
self.set_url(url)
self.last_checked = 0
def mtime(self):
"""Returns the time the robots.txt file was last fetched.
This is useful for long-running web spiders that need to
check for new robots.txt files periodically.
"""
return self.last_checked
def modified(self):
"""Sets the time the robots.txt file was last fetched to the
current time.
"""
import time
self.last_checked = time.time()
def set_url(self, url):
"""Sets the URL referring to a robots.txt file."""
self.url = url
self.host, self.path = urlparse.urlparse(url)[1:3]
def read(self):
"""Reads the robots.txt URL and feeds it to the parser."""
opener = URLopener()
f = opener.open(self.url)
lines = []
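The new mtime()/modified() pair lets a long-running spider decide when its cached robots.txt has gone stale instead of re-reading it for every URL. A minimal sketch of that refresh cycle, assuming the class is importable (the module name robotparser2 is illustrative) and keeps the stock robotparser entry point can_fetch(useragent, url):

    import time
    from robotparser2 import RobotFileParser   # module name is an assumption

    MAX_AGE = 24 * 60 * 60                     # re-fetch at most once a day
    rp = RobotFileParser("http://example.com/robots.txt")

    def allowed(useragent, url):
        # mtime() starts at 0, so the first call always fetches
        if time.time() - rp.mtime() > MAX_AGE:
            rp.read()        # fetch and parse robots.txt again
            rp.modified()    # record the fetch time for the next staleness check
        return rp.can_fetch(useragent, url)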
@@ -58,6 +76,13 @@ class RobotFileParser:
_debug("parse lines")
self.parse(lines)
def _add_entry(self, entry):
if "*" in entry.useragents:
# the default entry is considered last
self.default_entry = entry
else:
self.entries.append(entry)
def parse(self, lines):
"""parse the input lines from a robot.txt file.
We allow that a user-agent: line is not preceded by
@@ -76,7 +101,7 @@ class RobotFileParser:
entry = Entry()
state = 0
elif state==2:
self.entries.append(entry)
self._add_entry(entry)
entry = Entry()
state = 0
# remove optional comment and strip line
@@ -89,13 +114,13 @@ class RobotFileParser:
line = line.split(':', 1)
if len(line) == 2:
line[0] = line[0].strip().lower()
line[1] = line[1].strip()
line[1] = urllib.unquote(line[1].strip())
if line[0] == "user-agent":
if state==2:
_debug("line %d: warning: you should insert a blank"
" line before any user-agent"
" directive" % linenumber)
self.entries.append(entry)
self._add_entry(entry)
entry = Entry()
entry.useragents.append(line[1])
state = 1
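Unquoting the value side of each field means a percent-encoded Disallow path and its literal spelling collapse to one stored form, since RuleLine re-quotes whatever it is given. A small illustration of that round trip (standalone, not part of the module):

    import urllib

    encoded = "/%7Ejoe/private/"          # percent-encoded spelling
    plain   = "/~joe/private/"            # literal spelling of the same path

    # the parser unquotes the value, RuleLine quotes it again,
    # so both spellings end up as the same stored rule path
    assert urllib.quote(urllib.unquote(encoded)) == \
           urllib.quote(urllib.unquote(plain)) == "/%7Ejoe/private/"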
@@ -132,10 +157,13 @@ class RobotFileParser:
return 1
# search for given user agent matches
# the first match counts
url = urllib.quote(urlparse.urlparse(url)[2]) or "/"
url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
for entry in self.entries:
if entry.applies_to(useragent):
return entry.allowance(url)
# try the default entry last
if self.default_entry:
return self.default_entry.allowance(url)
# agent not found ==> access granted
return 1
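Two behaviours change here: the URL under test is unquoted and then re-quoted, so it is compared in the same canonical encoding as the stored rule paths, and a "User-agent: *" entry (kept aside as default_entry by _add_entry) is only consulted after every named entry has failed to match. A short sketch of the effect, assuming the class keeps the stock can_fetch(useragent, url) method:

    rp = RobotFileParser()
    rp.parse([
        "User-agent: foobot",
        "Disallow: /private/",
        "",
        "User-agent: *",                  # the catch-all entry, tried last
        "Disallow: /tmp/",
    ])

    assert not rp.can_fetch("foobot", "http://example.com/private/x.html")
    # other agents fall through to the "*" rules
    assert not rp.can_fetch("otherbot", "http://example.com/tmp/x.html")
    assert rp.can_fetch("otherbot", "http://example.com/private/x.html")
    # the checked URL is unquoted and re-quoted before comparison,
    # so an equivalent %-encoded spelling gives the same answer
    assert not rp.can_fetch("otherbot", "http://example.com/%74mp/x.html")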
@@ -151,11 +179,14 @@ class RuleLine:
"""A rule line is a single "Allow:" (allowance==1) or "Disallow:"
(allowance==0) followed by a path."""
def __init__(self, path, allowance):
if path == '' and not allowance:
# an empty value means allow all
allowance = 1
self.path = urllib.quote(path)
self.allowance = allowance
def applies_to(self, filename):
return self.path=="*" or re.match(self.path, filename)
return self.path=="*" or filename.startswith(self.path)
def __str__(self):
return (self.allowance and "Allow" or "Disallow")+": "+self.path
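Treating the rule path as a literal prefix instead of a regular expression avoids accidental wildcard behaviour from metacharacters that survive urllib.quote (for example "."), and the empty-value case implements the rule that a bare Disallow: allows everything. A standalone sketch, assuming RuleLine is importable from this module:

    r = RuleLine("/index.html", 0)          # Disallow: /index.html
    assert r.applies_to("/index.html")      # literal prefix match
    assert not r.applies_to("/indexXhtml")  # the old re.match() treated "." as a wildcard

    # an empty Disallow value means "allow everything"
    assert RuleLine("", 0).allowance == 1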
@@ -184,8 +215,7 @@ class Entry:
# we have the catch-all agent
return 1
agent = agent.lower()
# don't forget to re.escape
if re.search(re.escape(useragent), agent):
if useragent.find(agent) != -1:
return 1
return 0
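The new test checks whether the agent token from robots.txt occurs anywhere inside the client's User-Agent string, rather than searching for the client string inside the token as the old re.search() call did. The matching rule in isolation, as an illustrative helper rather than the method itself (both sides are lower-cased here so the comparison is case-insensitive):

    def agent_matches(robots_token, client_useragent):
        # the robots.txt token must occur somewhere in the client User-Agent
        token = robots_token.lower()
        if token == "*":
            return 1
        return client_useragent.lower().find(token) != -1

    assert agent_matches("LinkChecker", "LinkChecker/1.9.1")
    assert agent_matches("googlebot", "Mozilla/5.0 (compatible; Googlebot/2.1)")
    assert not agent_matches("LinkChecker/2.0", "LinkChecker/1.9.1")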