git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@988 e7d03fd6-7b0d-0410-9947-9c21f3af8025
calvin 2003-08-07 22:36:32 +00:00
parent 525dfdf4c6
commit e560a650cf


@@ -10,7 +10,7 @@
The robots.txt Exclusion Protocol is implemented as specified in
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
import re,urlparse,urllib
import urlparse, urllib
__all__ = ["RobotFileParser"]
@@ -21,25 +21,43 @@ def _debug(msg):
class RobotFileParser:
""" This class provides a set of methods to read, parse and answer
questions about a single robots.txt file.
"""
def __init__(self, url=''):
self.entries = []
self.default_entry = None
self.disallow_all = 0
self.allow_all = 0
self.set_url(url)
self.last_checked = 0
def mtime(self):
"""Returns the time the robots.txt file was last fetched.
This is useful for long-running web spiders that need to
check for new robots.txt files periodically.
"""
return self.last_checked
def modified(self):
"""Sets the time the robots.txt file was last fetched to the
current time.
"""
import time
self.last_checked = time.time()
def set_url(self, url):
"""Sets the URL referring to a robots.txt file."""
self.url = url
self.host, self.path = urlparse.urlparse(url)[1:3]
def read(self):
"""Reads the robots.txt URL and feeds it to the parser."""
opener = URLopener()
f = opener.open(self.url)
lines = []
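The new mtime()/modified() pair lets a long-running spider decide when its cached robots.txt has gone stale instead of re-reading it for every URL. A minimal sketch of that refresh cycle, assuming the class is importable (the module name robotparser2 is illustrative) and keeps the stock robotparser entry point can_fetch(useragent, url):

    import time
    from robotparser2 import RobotFileParser   # module name is an assumption

    MAX_AGE = 24 * 60 * 60                     # re-fetch at most once a day
    rp = RobotFileParser("http://example.com/robots.txt")

    def allowed(useragent, url):
        # mtime() starts at 0, so the first call always fetches
        if time.time() - rp.mtime() > MAX_AGE:
            rp.read()        # fetch and parse robots.txt again
            rp.modified()    # record the fetch time for the next staleness check
        return rp.can_fetch(useragent, url)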
@@ -58,6 +76,13 @@ class RobotFileParser:
_debug("parse lines")
self.parse(lines)
def _add_entry(self, entry):
if "*" in entry.useragents:
# the default entry is considered last
self.default_entry = entry
else:
self.entries.append(entry)
def parse(self, lines):
"""parse the input lines from a robot.txt file.
We allow that a user-agent: line is not preceded by
@@ -76,7 +101,7 @@ class RobotFileParser:
entry = Entry()
state = 0
elif state==2:
self.entries.append(entry)
self._add_entry(entry)
entry = Entry()
state = 0
# remove optional comment and strip line
@@ -89,13 +114,13 @@ class RobotFileParser:
line = line.split(':', 1)
if len(line) == 2:
line[0] = line[0].strip().lower()
line[1] = line[1].strip()
line[1] = urllib.unquote(line[1].strip())
if line[0] == "user-agent":
if state==2:
_debug("line %d: warning: you should insert a blank"
" line before any user-agent"
" directive" % linenumber)
self.entries.append(entry)
self._add_entry(entry)
entry = Entry()
entry.useragents.append(line[1])
state = 1
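Unquoting the value side of each field means a percent-encoded Disallow path and its literal spelling collapse to one stored form, since RuleLine re-quotes whatever it is given. A small illustration of that round trip (standalone, not part of the module):

    import urllib

    encoded = "/%7Ejoe/private/"          # percent-encoded spelling
    plain   = "/~joe/private/"            # literal spelling of the same path

    # the parser unquotes the value, RuleLine quotes it again,
    # so both spellings end up as the same stored rule path
    assert urllib.quote(urllib.unquote(encoded)) == \
           urllib.quote(urllib.unquote(plain)) == "/%7Ejoe/private/"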
@@ -132,10 +157,13 @@ class RobotFileParser:
return 1
# search for given user agent matches
# the first match counts
url = urllib.quote(urlparse.urlparse(url)[2]) or "/"
url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
for entry in self.entries:
if entry.applies_to(useragent):
return entry.allowance(url)
# try the default entry last
if self.default_entry:
return self.default_entry.allowance(url)
# agent not found ==> access granted
return 1
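Two behaviours change here: the URL under test is unquoted and then re-quoted, so it is compared in the same canonical encoding as the stored rule paths, and a "User-agent: *" entry (kept aside as default_entry by _add_entry) is only consulted after every named entry has failed to match. A short sketch of the effect, assuming the class keeps the stock can_fetch(useragent, url) method:

    rp = RobotFileParser()
    rp.parse([
        "User-agent: foobot",
        "Disallow: /private/",
        "",
        "User-agent: *",                  # the catch-all entry, tried last
        "Disallow: /tmp/",
    ])

    assert not rp.can_fetch("foobot", "http://example.com/private/x.html")
    # other agents fall through to the "*" rules
    assert not rp.can_fetch("otherbot", "http://example.com/tmp/x.html")
    assert rp.can_fetch("otherbot", "http://example.com/private/x.html")
    # the checked URL is unquoted and re-quoted before comparison,
    # so an equivalent %-encoded spelling gives the same answer
    assert not rp.can_fetch("otherbot", "http://example.com/%74mp/x.html")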
@@ -151,11 +179,14 @@ class RuleLine:
"""A rule line is a single "Allow:" (allowance==1) or "Disallow:"
(allowance==0) followed by a path."""
def __init__(self, path, allowance):
if path == '' and not allowance:
# an empty value means allow all
allowance = 1
self.path = urllib.quote(path)
self.allowance = allowance
def applies_to(self, filename):
return self.path=="*" or re.match(self.path, filename)
return self.path=="*" or filename.startswith(self.path)
def __str__(self):
return (self.allowance and "Allow" or "Disallow")+": "+self.path
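Treating the rule path as a literal prefix instead of a regular expression avoids accidental wildcard behaviour from metacharacters that survive urllib.quote (for example "."), and the empty-value case implements the rule that a bare Disallow: allows everything. A standalone sketch, assuming RuleLine is importable from this module:

    r = RuleLine("/index.html", 0)          # Disallow: /index.html
    assert r.applies_to("/index.html")      # literal prefix match
    assert not r.applies_to("/indexXhtml")  # the old re.match() treated "." as a wildcard

    # an empty Disallow value means "allow everything"
    assert RuleLine("", 0).allowance == 1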
@@ -184,8 +215,7 @@ class Entry:
# we have the catch-all agent
return 1
agent = agent.lower()
# don't forget to re.escape
if re.search(re.escape(useragent), agent):
if useragent.find(agent) != -1:
return 1
return 0
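The new test checks whether the agent token from robots.txt occurs anywhere inside the client's User-Agent string, rather than searching for the client string inside the token as the old re.search() call did. The matching rule in isolation, as an illustrative helper rather than the method itself (both sides are lower-cased here so the comparison is case-insensitive):

    def agent_matches(robots_token, client_useragent):
        # the robots.txt token must occur somewhere in the client User-Agent
        token = robots_token.lower()
        if token == "*":
            return 1
        return client_useragent.lower().find(token) != -1

    assert agent_matches("LinkChecker", "LinkChecker/1.9.1")
    assert agent_matches("googlebot", "Mozilla/5.0 (compatible; Googlebot/2.1)")
    assert not agent_matches("LinkChecker/2.0", "LinkChecker/1.9.1")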