mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-22 15:14:44 +00:00
syntax updated
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1374 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
1e0f66fdd3
commit
5ad8c827b4
1 changed files with 6 additions and 17 deletions
|
|
@ -29,11 +29,11 @@ class RobotFileParser (object):
|
|||
questions about a single robots.txt file.
|
||||
|
||||
"""
|
||||
|
||||
def __init__ (self, url=''):
|
||||
self.set_url(url)
|
||||
self._reset()
|
||||
|
||||
|
||||
def _reset (self):
|
||||
self.entries = []
|
||||
self.default_entry = None
|
||||
|
|
@ -41,7 +41,6 @@ class RobotFileParser (object):
|
|||
self.allow_all = False
|
||||
self.last_checked = 0
|
||||
|
||||
|
||||
def mtime (self):
|
||||
"""Returns the time the robots.txt file was last fetched.
|
||||
|
||||
|
|
@ -51,7 +50,6 @@ class RobotFileParser (object):
|
|||
"""
|
||||
return self.last_checked
|
||||
|
||||
|
||||
def modified (self):
|
||||
"""Sets the time the robots.txt file was last fetched to the
|
||||
current time.
|
||||
|
|
@ -60,13 +58,11 @@ class RobotFileParser (object):
|
|||
import time
|
||||
self.last_checked = time.time()
|
||||
|
||||
|
||||
def set_url (self, url):
|
||||
"""Sets the URL referring to a robots.txt file."""
|
||||
self.url = url
|
||||
self.host, self.path = urlparse.urlparse(url)[1:3]
|
||||
|
||||
|
||||
def read (self):
|
||||
"""Reads the robots.txt URL and feeds it to the parser."""
|
||||
self._reset()
|
||||
|
|
@ -108,7 +104,6 @@ class RobotFileParser (object):
|
|||
debug(BRING_IT_ON, "robots.txt parse lines")
|
||||
self.parse(lines)
|
||||
|
||||
|
||||
def _add_entry (self, entry):
|
||||
if "*" in entry.useragents:
|
||||
# the default entry is considered last
|
||||
|
|
@ -116,7 +111,6 @@ class RobotFileParser (object):
|
|||
else:
|
||||
self.entries.append(entry)
|
||||
|
||||
|
||||
def parse (self, lines):
|
||||
"""parse the input lines from a robot.txt file.
|
||||
We allow that a user-agent: line is not preceded by
|
||||
|
|
@ -180,7 +174,6 @@ class RobotFileParser (object):
|
|||
self.entries.append(entry)
|
||||
debug(BRING_IT_ON, "Parsed rules:\n%s" % str(self))
|
||||
|
||||
|
||||
def can_fetch (self, useragent, url):
|
||||
"""using the parsed robots.txt decide if useragent can fetch url"""
|
||||
debug(BRING_IT_ON, "Checking robot.txt allowance for:\n user agent: %r\n url: %r"%(useragent, url))
|
||||
|
|
@ -200,7 +193,6 @@ class RobotFileParser (object):
|
|||
# agent not found ==> access granted
|
||||
return True
|
||||
|
||||
|
||||
def __str__ (self):
|
||||
lines = [str(entry) for entry in self.entries]
|
||||
if self.default_entry is not None:
|
||||
|
|
@ -208,9 +200,10 @@ class RobotFileParser (object):
|
|||
return "\n\n".join(lines)
|
||||
|
||||
|
||||
class RuleLine:
|
||||
class RuleLine (object):
|
||||
"""A rule line is a single "Allow:" (allowance==1) or "Disallow:"
|
||||
(allowance==0) followed by a path."""
|
||||
|
||||
def __init__ (self, path, allowance):
|
||||
if path == '' and not allowance:
|
||||
# an empty value means allow all
|
||||
|
|
@ -218,28 +211,25 @@ class RuleLine:
|
|||
self.path = urllib.quote(path)
|
||||
self.allowance = allowance
|
||||
|
||||
|
||||
def applies_to (self, filename):
|
||||
return self.path=="*" or filename.startswith(self.path)
|
||||
|
||||
|
||||
def __str__ (self):
|
||||
return (self.allowance and "Allow" or "Disallow")+": "+self.path
|
||||
|
||||
|
||||
class Entry:
|
||||
class Entry (object):
|
||||
"""An entry has one or more user-agents and zero or more rulelines"""
|
||||
|
||||
def __init__ (self):
|
||||
self.useragents = []
|
||||
self.rulelines = []
|
||||
|
||||
|
||||
def __str__ (self):
|
||||
lines = ["User-agent: %r"%agent for agent in self.useragents]
|
||||
lines.extend([str(line) for line in self.rulelines])
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def applies_to (self, useragent):
|
||||
"""check if this entry applies to the specified agent"""
|
||||
# split the name token and make it lower case
|
||||
|
|
@ -255,7 +245,6 @@ class Entry:
|
|||
return True
|
||||
return False
|
||||
|
||||
|
||||
def allowance (self, filename):
|
||||
"""Preconditions:
|
||||
- our agent applies to this entry
|
||||
|
|
@ -300,7 +289,7 @@ def decode (page):
|
|||
else:
|
||||
fp = gzip.GzipFile('', 'rb', 9, StringIO.StringIO(content))
|
||||
except zlib.error, msg:
|
||||
warn(linkcheck.i18n._("%r at %s, assuming non-compressed content") % (str(msg), page.geturl()))
|
||||
warn(bk.i18n._("%r at %s, assuming non-compressed content") % (str(msg), page.geturl()))
|
||||
fp = StringIO.StringIO(content)
|
||||
# remove content-encoding header
|
||||
headers = {}
|
||||
|
|
|
|||
Loading…
Reference in a new issue