mirror of https://github.com/Hopiu/linkchecker.git, synced 2026-05-04 21:04:41 +00:00
robotparser
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@219 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in: parent c9b63453d1, commit 77b88c4a4e
8 changed files with 744 additions and 40 deletions
MANIFEST.in

@@ -1,5 +1,6 @@
 include MANIFEST.in
 include README FAQ INSTALL LICENSE TODO draft-gilman-news-url-00.txt
+include norobots-rfc.html
 include linkcheckerrc linkchecker linkchecker.bat linkchecker.1 create.sql
 include lc.cgi lc.fcgi lc.sz_fcgi
 include Makefile
@@ -8,7 +9,7 @@ include debian/rules debian/changelog debian/copyright debian/control
 include debian/dirs debian/docs debian/links debian/postinst
 include debian/prerm
 include DNS/README
-include test/viewprof.py test/profiletest.py test/*.html
+include test/viewprof.py test/profiletest.py test/*.html test/robots.txt
 include rpm_build_script
 recursive-include locale *.mo
 recursive-include po *.po *.py Makefile

2 TODO

@@ -1 +1,3 @@
 Feature complete, only fixes.
+add test/robots.txt
+add norobots-rfc.html

7 debian/changelog vendored

@@ -3,9 +3,12 @@ linkchecker (1.2.13) unstable; urgency=low
   * linkcheck/HttpUrlData.py:
     - better redirection handling
     - really use host variable in "Host:" header
-  * linkcheck/robotparser2.py: better redirection handling
+    - support response code "305 Use proxy"
+  * linkcheck/robotparser2.py:
+    - better redirection handling
+    - cope with user-agent: lines without a preceding blank line

- -- Bastian Kleineidam <calvin@users.sourceforge.net> Fri, 5 Jan 2001 00:34:21 +0100
+ -- Bastian Kleineidam <calvin@users.sourceforge.net> Fri, 5 Jan 2001 10:51:59 +0100

 linkchecker (1.2.12) unstable; urgency=low

2 debian/rules vendored

@@ -47,7 +47,7 @@ install: build
 	install -d -m 755 $(DOCDIR)/examples
 	install -c -m 644 DNS/README $(DOCDIR)/README_DNS.txt
 	install -d -m 755 $(DOCDIR)/tests/linkcheck
-	install -c -m 644 test/*.html test/*.py $(DOCDIR)/tests/linkcheck
+	install -c -m 644 test/*.html test/*.py test/robots.txt $(DOCDIR)/tests/linkcheck
 	install -d -m 755 $(DOCDIR)/tests/dns
 	install -c -m 644 tests/*.py $(DOCDIR)/tests/dns
 	# install system wide configuration file in etc

linkcheck/HttpUrlData.py

@@ -86,6 +86,11 @@ class HttpUrlData(UrlData):
         Config.debug(str(status)+", "+str(statusText)+", "+str(self.mime)+"\n")
         has301status = 0
         while 1:
+            # proxy enforcement
+            if status == 305 and self.mime:
+                status, statusText, self.mime = self._getHttpRequest(
+                    proxy=self.mime.get("Location"))
+
             # follow redirections
             tries = 0
             redirected = self.urlName
@@ -155,17 +160,19 @@ class HttpUrlData(UrlData):
         self.setValid("OK")


-    def _getHttpRequest(self, method="HEAD"):
+    def _getHttpRequest(self, method="HEAD", proxy=None):
         "Put request and return (status code, status text, mime object)"
-        if self.proxy:
-            Config.debug("DEBUG: using proxy %s\n" % self.proxy)
-            host = self.proxy
+        if self.proxy and not proxy:
+            proxy = self.proxy
+        if proxy:
+            Config.debug("DEBUG: using proxy %s\n" % proxy)
+            host = proxy
         else:
             host = self.urlTuple[1]
         if self.urlConnection:
             self.closeConnection()
         self.urlConnection = self._getHTTPObject(host)
-        if self.proxy:
+        if proxy:
             path = urlparse.urlunparse(self.urlTuple)
         else:
             path = urlparse.urlunparse(('', '', self.urlTuple[2],

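The "305 Use proxy" support added above re-issues the request through the proxy named in the response's Location header. The following is a minimal standalone sketch of that retry, written against the legacy httplib API that robotparser2.py below also uses; the function name and variables are illustrative only and are not part of HttpUrlData.py.

    import httplib, urlparse

    def head_with_305_retry(url):
        # issue a HEAD request; on "305 Use proxy" repeat it through the
        # proxy given in the Location header (a sketch of the hunk above,
        # not the actual HttpUrlData implementation)
        parts = urlparse.urlparse(url)
        conn = httplib.HTTP(parts[1])
        conn.putrequest("HEAD", parts[2] or "/")
        conn.putheader("Host", parts[1])
        conn.endheaders()
        status, reason, mime = conn.getreply()
        if status == 305 and mime:
            proxy = mime.get("Location")
            conn = httplib.HTTP(proxy)
            # a proxy expects the full URL as the request path
            conn.putrequest("HEAD", url)
            conn.putheader("Host", parts[1])
            conn.endheaders()
            status, reason, mime = conn.getreply()
        return status, reason, mime
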
linkcheck/robotparser2.py

@@ -2,20 +2,6 @@

 Copyright (C) 2000 Bastian Kleineidam

-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 The robots.txt Exclusion Protocol is implemented as specified in
 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
 """
@@ -50,7 +36,6 @@ class RobotFileParser:
         import httplib
         tries = 0
         while tries<5:
-            _debug(self.host+self.path)
             connection = httplib.HTTP(self.host)
             connection.putrequest("GET", self.path)
             connection.putheader("Host", self.host)
@@ -72,7 +57,9 @@ class RobotFileParser:
         self.parse(connection.getfile().readlines())

     def parse(self, lines):
-        """parse the input lines from a robot.txt file"""
+        """parse the input lines from a robot.txt file.
+        We allow that a user-agent: line is not preceded by
+        one or more blank lines."""
         state = 0
         linenumber = 0
         entry = Entry()
@@ -82,7 +69,9 @@ class RobotFileParser:
             linenumber = linenumber + 1
             if not line:
                 if state==1:
-                    _debug("line %d: no rules found" % linenumber)
+                    _debug("line %d: warning: you should insert"
+                           " allow: or disallow: directives below any"
+                           " user-agent: line" % linenumber)
                     entry = Entry()
                     state = 0
                 elif state==2:
@@ -102,29 +91,31 @@ class RobotFileParser:
                 line[1] = string.strip(line[1])
                 if line[0] == "user-agent":
                     if state==2:
-                        _debug("line %d: user-agent in the middle of "
-                               "rules" % linenumber)
-                    else:
-                        entry.useragents.append(string.strip(line[1]))
-                        state = 1
+                        _debug("line %d: warning: you should insert a blank"
+                               " line before any user-agent"
+                               " directive" % linenumber)
+                        self.entries.append(entry)
+                        entry = Entry()
+                    entry.useragents.append(line[1])
+                    state = 1
                 elif line[0] == "disallow":
                     if state==0:
-                        _debug("line %d: disallow without user "
-                               "agents" % linenumber)
+                        _debug("line %d: error: you must insert a user-agent:"
+                               " directive before this line" % linenumber)
                     else:
                         entry.rulelines.append(RuleLine(line[1], 0))
                         state = 2
                 elif line[0] == "allow":
                     if state==0:
-                        _debug("line %d: allow without user "
-                               "agents" % linenumber)
+                        _debug("line %d: error: you must insert a user-agent:"
+                               " directive before this line" % linenumber)
                     else:
                         entry.rulelines.append(RuleLine(line[1], 1))
                 else:
-                    _debug("line %d: unknown key %s" % (linenumber,
+                    _debug("line %d: warning: unknown key %s" % (linenumber,
                            line[0]))
             else:
-                _debug("line %d: malformed line %s" % (linenumber, line))
+                _debug("line %d: error: malformed line %s"%(linenumber, line))
         if state==2:
             self.entries.append(entry)
         _debug("Parsed rules:\n%s" % str(self))
@@ -154,7 +145,10 @@ class RobotFileParser:
             ret = ret + str(entry) + "\n"
         return ret

+
 class RuleLine:
+    """A rule line is a single "Allow:" (allowance==1) or "Disallow:"
+    (allowance==0) followed by a path."""
     def __init__(self, path, allowance):
         self.path = urllib.quote(path)
         self.allowance = allowance
@@ -163,10 +157,11 @@ class RuleLine:
         return self.path=="*" or re.match(self.path, filename)

     def __str__(self):
-        return (self.allowance and "Disallow" or "Allow")+": "+self.path
+        return (self.allowance and "Allow" or "Disallow")+": "+self.path


 class Entry:
+    """An entry has one or more user-agents and zero or more rulelines"""
     def __init__(self):
         self.useragents = []
         self.rulelines = []
@@ -191,7 +186,7 @@ class Entry:
     def allowance(self, filename):
         """Preconditions:
         - our agent applies to this entry
-        - file is URL decoded"""
+        - filename is URL decoded"""
         for line in self.rulelines:
             if line.applies_to(filename):
                 return line.allowance
@@ -208,7 +203,6 @@ def _test():
         rp.read()
     else:
         rp.parse(open(sys.argv[1]).readlines())
-    print rp
     print rp.can_fetch('*', 'http://www.musi-cal.com/')
     print rp.can_fetch('Musi-Cal-Robot/1.0',
                        'http://www.musi-cal.com/cgi-bin/event-search'

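For reference, the parse() method touched by the hunks above is a three-state machine: 0 ("expect a user-agent line"), 1 ("saw user-agent, expect rules"), 2 ("saw rules"). The behaviour this commit adds is that a user-agent line met while in state 2 closes the current entry and starts a new one instead of being warned about and dropped. The sketch below is a simplified illustration of that behaviour, not the module's actual code; all names are made up.

    import string

    def split_records(lines):
        # returns a list of (useragents, rulelines) tuples
        entries = []
        entry = ([], [])
        state = 0
        for line in lines:
            line = string.strip(string.split(line, '#', 1)[0])
            if not line:
                # a blank line ends a record that already has rules
                if state == 2:
                    entries.append(entry)
                    entry = ([], [])
                state = 0
                continue
            parts = string.split(line, ':', 1)
            if len(parts) != 2:
                continue
            key = string.lower(string.strip(parts[0]))
            value = string.strip(parts[1])
            if key == "user-agent":
                if state == 2:
                    # user-agent without a preceding blank line:
                    # close the current entry and open a new one
                    entries.append(entry)
                    entry = ([], [])
                entry[0].append(value)
                state = 1
            elif key in ("disallow", "allow") and state != 0:
                entry[1].append((key == "allow", value))
                state = 2
        if state == 2:
            entries.append(entry)
        return entries
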
651 norobots-rfc.html Normal file

@@ -0,0 +1,651 @@
<html>
<body>
<head>
<title>
A Standard for Robot Exclusion
</title>
</head>
<body bgcolor=white>

<div align=right>
<font size="+1" color=maroon>
<i>
The Web Robots Pages
<a href="robots.html"><img
src="lt.gif"
border=0 WIDTH=9 HEIGHT=12></a>
</i>
</font>
</div>
<hr>
<pre>

Network Working Group                                          M. Koster
INTERNET DRAFT                                                 WebCrawler
Category: Informational                                     November 1996
Dec 4, 1996                                          Expires June 4, 1997
                        <draft-koster-robots-00.txt>

                       A Method for Web Robots Control


Status of this Memo

This document is an Internet-Draft. Internet-Drafts are
working documents of the Internet Engineering Task Force
(IETF), its areas, and its working groups. Note that other
groups may also distribute working documents as Internet-
Drafts.

Internet-Drafts are draft documents valid for a maximum of six
months and may be updated, replaced, or obsoleted by other
documents at any time. It is inappropriate to use Internet-
Drafts as reference material or to cite them other than as
``work in progress.''

To learn the current status of any Internet-Draft, please
check the ``1id-abstracts.txt'' listing contained in the
Internet- Drafts Shadow Directories on ftp.is.co.za (Africa),
nic.nordu.net (Europe), munnari.oz.au (Pacific Rim),
ds.internic.net (US East Coast), or ftp.isi.edu (US West
Coast).

Table of Contents

1. Abstract . . . . . . . . . . . . . . . . . . . . . . . . .  2
2. Introduction . . . . . . . . . . . . . . . . . . . . . . .  2
3. Specification . . . . . . . . . . . . . . . . . . . . . .   3
3.1 Access method . . . . . . . . . . . . . . . . . . . . . .  3
3.2 File Format Description . . . . . . . . . . . . . . . . .  4
3.2.1 The User-agent line . . . . . . . . . . . . . . . . . .  5
3.2.2 The Allow and Disallow lines . . . . . . . . . . . . . . 5
3.3 Formal Syntax . . . . . . . . . . . . . . . . . . . . . .  6
3.4 Expiration . . . . . . . . . . . . . . . . . . . . . . . . 8
4. Examples . . . . . . . . . . . . . . . . . . . . . . . . .  8
5. Implementor's Notes . . . . . . . . . . . . . . . . . . . . 9
5.1 Backwards Compatibility . . . . . . . . . . . . . . . . .  9
5.2 Interoperability . . . . . . . . . . . . . . . . . . . .  10
6. Security Considerations . . . . . . . . . . . . . . . . .  10
7. References . . . . . . . . . . . . . . . . . . . . . . . . 10
8. Acknowledgements . . . . . . . . . . . . . . . . . . . . . 11
9. Author's Address . . . . . . . . . . . . . . . . . . . . . 11

1. Abstract

This memo defines a method for administrators of sites on the World-
Wide Web to give instructions to visiting Web robots, most
importantly what areas of the site are to be avoided.

This document provides a more rigid specification of the Standard
for Robots Exclusion [1], which is currently in wide-spread use by
the Web community since 1994.


2. Introduction

Web Robots (also called "Wanderers" or "Spiders") are Web client
programs that automatically traverse the Web's hypertext structure
by retrieving a document, and recursively retrieving all documents
that are referenced.

Note that "recursively" here doesn't limit the definition to any
specific traversal algorithm; even if a robot applies some heuristic
to the selection and order of documents to visit and spaces out
requests over a long space of time, it qualifies to be called a
robot.

Robots are often used for maintenance and indexing purposes, by
people other than the administrators of the site being visited. In
some cases such visits may have undesirable effects which the
administrators would like to prevent, such as indexing of an
unannounced site, traversal of parts of the site which require vast
resources of the server, recursive traversal of an infinite URL
space, etc.

The technique specified in this memo allows Web site administrators
to indicate to visiting robots which parts of the site should be
avoided. It is solely up to the visiting robot to consult this
information and act accordingly. Blocking parts of the Web site
regardless of a robot's compliance with this method are outside
the scope of this memo.


3. The Specification

This memo specifies a format for encoding instructions to visiting
robots, and specifies an access method to retrieve these
instructions. Robots must retrieve these instructions before visiting
other URLs on the site, and use the instructions to determine if
other URLs on the site can be accessed.

3.1 Access method

The instructions must be accessible via HTTP [2] from the site that
the instructions are to be applied to, as a resource of Internet
Media Type [3] "text/plain" under a standard relative path on the
server: "/robots.txt".

For convenience we will refer to this resource as the "/robots.txt
file", though the resource need in fact not originate from a file-
system.

Some examples of URLs [4] for sites and URLs for corresponding
"/robots.txt" sites:

http://www.foo.com/welcome.html        http://www.foo.com/robots.txt

http://www.bar.com:8001/               http://www.bar.com:8001/robots.txt

If the server response indicates Success (HTTP 2xx Status Code,)
the robot must read the content, parse it, and follow any
instructions applicable to that robot.

If the server response indicates the resource does not exist (HTTP
Status Code 404), the robot can assume no instructions are
available, and that access to the site is not restricted by
/robots.txt.

Specific behaviors for other server responses are not required by
this specification, though the following behaviours are recommended:

- On server response indicating access restrictions (HTTP Status
  Code 401 or 403) a robot should regard access to the site
  completely restricted.

- On the request attempt resulted in temporary failure a robot
  should defer visits to the site until such time as the resource
  can be retrieved.

- On server response indicating Redirection (HTTP Status Code 3XX)
  a robot should follow the redirects until a resource can be
  found.


3.2 File Format Description

The instructions are encoded as a formatted plain text object,
described here. A complete BNF-like description of the syntax of this
format is given in section 3.3.

The format logically consists of a non-empty set or records,
separated by blank lines. The records consist of a set of lines of
the form:

<Field> ":" <value>

In this memo we refer to lines with a Field "foo" as "foo lines".

The record starts with one or more User-agent lines, specifying
which robots the record applies to, followed by "Disallow" and
"Allow" instructions to that robot. For example:

User-agent: webcrawler
User-agent: infoseek
Allow: /tmp/ok.html
Disallow: /tmp
Disallow: /user/foo

These lines are discussed separately below.

Lines with Fields not explicitly specified by this specification
may occur in the /robots.txt, allowing for future extension of the
format. Consult the BNF for restrictions on the syntax of such
extensions. Note specifically that for backwards compatibility
with robots implementing earlier versions of this specification,
breaking of lines is not allowed.

Comments are allowed anywhere in the file, and consist of optional
whitespace, followed by a comment character '#' followed by the
comment, terminated by the end-of-line.

3.2.1 The User-agent line

Name tokens are used to allow robots to identify themselves via a
simple product token. Name tokens should be short and to the
point. The name token a robot chooses for itself should be sent
as part of the HTTP User-agent header, and must be well documented.

These name tokens are used in User-agent lines in /robots.txt to
identify to which specific robots the record applies. The robot
must obey the first record in /robots.txt that contains a User-
Agent line whose value contains the name token of the robot as a
substring. The name comparisons are case-insensitive. If no such
record exists, it should obey the first record with a User-agent
line with a "*" value, if present. If no record satisfied either
condition, or no records are present at all, access is unlimited.

The name comparisons are case-insensitive.

For example, a fictional company FigTree Search Services who names
their robot "Fig Tree", send HTTP requests like:

GET / HTTP/1.0
User-agent: FigTree/0.1 Robot libwww-perl/5.04

might scan the "/robots.txt" file for records with:

User-agent: figtree

3.2.2 The Allow and Disallow lines

These lines indicate whether accessing a URL that matches the
corresponding path is allowed or disallowed. Note that these
instructions apply to any HTTP method on a URL.

To evaluate if access to a URL is allowed, a robot must attempt to
match the paths in Allow and Disallow lines against the URL, in the
order they occur in the record. The first match found is used. If no
match is found, the default assumption is that the URL is allowed.

The /robots.txt URL is always allowed, and must not appear in the
Allow/Disallow rules.

The matching process compares every octet in the path portion of
the URL and the path from the record. If a %xx encoded octet is
encountered it is unencoded prior to comparison, unless it is the
"/" character, which has special meaning in a path. The match
evaluates positively if and only if the end of the path from the
record is reached before a difference in octets is encountered.

This table illustrates some examples:

Record Path         URL path            Matches
/tmp                /tmp                yes
/tmp                /tmp.html           yes
/tmp                /tmp/a.html         yes
/tmp/               /tmp                no
/tmp/               /tmp/               yes
/tmp/               /tmp/a.html         yes

/a%3cd.html         /a%3cd.html         yes
/a%3Cd.html         /a%3cd.html         yes
/a%3cd.html         /a%3Cd.html         yes
/a%3Cd.html         /a%3Cd.html         yes

/a%2fb.html         /a%2fb.html         yes
/a%2fb.html         /a/b.html           no
/a/b.html           /a%2fb.html         no
/a/b.html           /a/b.html           yes

/%7ejoe/index.html  /~joe/index.html    yes
/~joe/index.html    /%7Ejoe/index.html  yes

3.3 Formal Syntax

This is a BNF-like description, using the conventions of RFC 822 [5],
except that "|" is used to designate alternatives. Briefly, literals
are quoted with "", parentheses "(" and ")" are used to group
elements, optional elements are enclosed in [brackets], and elements
may be preceded with <n>* to designate n or more repetitions of the
following element; n defaults to 0.

robotstxt      = *blankcomment
               | *blankcomment record *( 1*commentblank 1*record )
                 *blankcomment
blankcomment   = 1*(blank | commentline)
commentblank   = *commentline blank *(blankcomment)
blank          = *space CRLF
CRLF           = CR LF
record         = *commentline agentline *(commentline | agentline)
                 1*ruleline *(commentline | ruleline)

agentline      = "User-agent:" *space agent [comment] CRLF
ruleline       = (disallowline | allowline | extension)
disallowline   = "Disallow" ":" *space path [comment] CRLF
allowline      = "Allow" ":" *space rpath [comment] CRLF
extension      = token : *space value [comment] CRLF
value          = <any CHAR except CR or LF or "#">

commentline    = comment CRLF
comment        = *blank "#" anychar
space          = 1*(SP | HT)
rpath          = "/" path
agent          = token
anychar        = <any CHAR except CR or LF>
CHAR           = <any US-ASCII character (octets 0 - 127)>
CTL            = <any US-ASCII control character
                 (octets 0 - 31) and DEL (127)>
CR             = <US-ASCII CR, carriage return (13)>
LF             = <US-ASCII LF, linefeed (10)>
SP             = <US-ASCII SP, space (32)>
HT             = <US-ASCII HT, horizontal-tab (9)>

The syntax for "token" is taken from RFC 1945 [2], reproduced here for
convenience:

token          = 1*<any CHAR except CTLs or tspecials>

tspecials      = "(" | ")" | "<" | ">" | "@"
               | "," | ";" | ":" | "\" | <">
               | "/" | "[" | "]" | "?" | "="
               | "{" | "}" | SP | HT

The syntax for "path" is defined in RFC 1808 [6], reproduced here for
convenience:

path           = fsegment *( "/" segment )
fsegment       = 1*pchar
segment        = *pchar

pchar          = uchar | ":" | "@" | "&" | "="
uchar          = unreserved | escape
unreserved     = alpha | digit | safe | extra

escape         = "%" hex hex
hex            = digit | "A" | "B" | "C" | "D" | "E" | "F" |
                 "a" | "b" | "c" | "d" | "e" | "f"

alpha          = lowalpha | hialpha

lowalpha       = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
                 "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
                 "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
hialpha        = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
                 "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
                 "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"

digit          = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
                 "8" | "9"

safe           = "$" | "-" | "_" | "." | "+"
extra          = "!" | "*" | "'" | "(" | ")" | ","

3.4 Expiration

Robots should cache /robots.txt files, but if they do they must
periodically verify the cached copy is fresh before using its
contents.

Standard HTTP cache-control mechanisms can be used by both origin
server and robots to influence the caching of the /robots.txt file.
Specifically robots should take note of Expires header set by the
origin server.

If no cache-control directives are present robots should default to
an expiry of 7 days.

4. Examples

This section contains an example of how a /robots.txt may be used.

A fictional site may have the following URLs:

http://www.fict.org/
http://www.fict.org/index.html
http://www.fict.org/robots.txt
http://www.fict.org/server.html
http://www.fict.org/services/fast.html
http://www.fict.org/services/slow.html
http://www.fict.org/orgo.gif
http://www.fict.org/org/about.html
http://www.fict.org/org/plans.html
http://www.fict.org/%7Ejim/jim.html
http://www.fict.org/%7Emak/mak.html

The site may in the /robots.txt have specific rules for robots that
send a HTTP User-agent "UnhipBot/0.1", "WebCrawler/3.0", and
"Excite/1.0", and a set of default rules:

# /robots.txt for http://www.fict.org/
# comments to webmaster@fict.org

User-agent: unhipbot
Disallow: /

User-agent: webcrawler
User-agent: excite
Disallow:

User-agent: *
Disallow: /org/plans.html
Allow: /org/
Allow: /serv
Allow: /~mak
Disallow: /

The following matrix shows which robots are allowed to access URLs:

                                        unhipbot  webcrawler  other
                                                  & excite
http://www.fict.org/                       No        Yes       No
http://www.fict.org/index.html             No        Yes       No
http://www.fict.org/robots.txt             Yes       Yes       Yes
http://www.fict.org/server.html            No        Yes       Yes
http://www.fict.org/services/fast.html     No        Yes       Yes
http://www.fict.org/services/slow.html     No        Yes       Yes
http://www.fict.org/orgo.gif               No        Yes       No
http://www.fict.org/org/about.html         No        Yes       Yes
http://www.fict.org/org/plans.html         No        Yes       No
http://www.fict.org/%7Ejim/jim.html        No        Yes       No
http://www.fict.org/%7Emak/mak.html        No        Yes       Yes

5. Notes for Implementors

5.1 Backwards Compatibility

Previous of this specification didn't provide the Allow line. The
introduction of the Allow line causes robots to behave slightly
differently under either specification:

If a /robots.txt contains an Allow which overrides a later occurring
Disallow, a robot ignoring Allow lines will not retrieve those
parts. This is considered acceptable because there is no requirement
for a robot to access URLs it is allowed to retrieve, and it is safe,
in that no URLs a Web site administrator wants to Disallow are be
allowed. It is expected this may in fact encourage robots to upgrade
compliance to the specification in this memo.

5.2 Interoperability

Implementors should pay particular attention to the robustness in
parsing of the /robots.txt file. Web site administrators who are not
aware of the /robots.txt mechanisms often notice repeated failing
request for it in their log files, and react by putting up pages
asking "What are you looking for?".

As the majority of /robots.txt files are created with platform-
specific text editors, robots should be liberal in accepting files
with different end-of-line conventions, specifically CR and LF in
addition to CRLF.

6. Security Considerations

There are a few risks in the method described here, which may affect
either origin server or robot.

Web site administrators must realise this method is voluntary, and
is not sufficient to guarantee some robots will not visit restricted
parts of the URL space. Failure to use proper authentication or other
restriction may result in exposure of restricted information. It even
possible that the occurence of paths in the /robots.txt file may
expose the existence of resources not otherwise linked to on the
site, which may aid people guessing for URLs.

Robots need to be aware that the amount of resources spent on dealing
with the /robots.txt is a function of the file contents, which is not
under the control of the robot. For example, the contents may be
larger in size than the robot can deal with. To prevent denial-of-
service attacks, robots are therefore encouraged to place limits on
the resources spent on processing of /robots.txt.

The /robots.txt directives are retrieved and applied in separate,
possible unauthenticated HTTP transactions, and it is possible that
one server can impersonate another or otherwise intercept a
/robots.txt, and provide a robot with false information. This
specification does not preclude authentication and encryption
from being employed to increase security.

7. Acknowledgements

The author would like the subscribers to the robots mailing list for
their contributions to this specification.

8. References

[1] Koster, M., "A Standard for Robot Exclusion",
    http://info.webcrawler.com/mak/projects/robots/norobots.html,
    June 1994.

[2] Berners-Lee, T., Fielding, R., and Frystyk, H., "Hypertext
    Transfer Protocol -- HTTP/1.0." RFC 1945, MIT/LCS, May 1996.

[3] Postel, J., "Media Type Registration Procedure." RFC 1590,
    USC/ISI, March 1994.

[4] Berners-Lee, T., Masinter, L., and M. McCahill, "Uniform
    Resource Locators (URL)", RFC 1738, CERN, Xerox PARC,
    University of Minnesota, December 1994.

[5] Crocker, D., "Standard for the Format of ARPA Internet Text
    Messages", STD 11, RFC 822, UDEL, August 1982.

[6] Fielding, R., "Relative Uniform Resource Locators", RFC 1808,
    UC Irvine, June 1995.

9. Author's Address

Martijn Koster
WebCrawler
America Online
690 Fifth Street
San Francisco
CA 94107

Phone: 415-3565431
EMail: m.koster@webcrawler.com

Expires June 4, 1997
</pre>
<hr>
<div align=right>
<address>
<small>
<A href="http://info.webcrawler.com/mak/projects/robots/robots.html">The
Web Robots Pages</A>
</small>
</address>
</div>
</body>
</html>
46 test/robots.txt Normal file

@@ -0,0 +1,46 @@
# /robots.txt for http://www.musi-cal.com/
# See http://info.webcrawler.com/mak/projects/robots/norobots.html
# Skip Montanaro (skip@mojam.com)
# - adapted from the robots.txt file at http://web.nexor.co.uk/

# by default
User-agent: *
Disallow: /ccrd       # not useful to spiders
Disallow: /click      # not useful to spiders
Disallow: /search     # dynamic
Disallow: /hc         # dynamic
Disallow: /subbatch   # dynamic
Disallow: /vadd       # dynamic
Disallow: /vsearch    # dynamic
Disallow: /vedit      # dynamic
Disallow: /vdelete    # dynamic
Disallow: /cgi-bin    # dynamic
Disallow: /images/    # useless images
Disallow: /icons/     # useless images
Disallow: /concerts/  # deprecated URL form
Disallow: /conferences # defunct
Disallow: /musician   # defunct
Disallow: /~skip/volkswagen # defunct
Disallow: /%7Eskip/volkswagen # defunct

# disallow a bunch of ill-behaved user agents (doubt this will deter them...)
User-agent: ExtractorPro
Disallow: /
User-agent: EmailSiphon
Disallow: /
User-agent: EmailWolf
Disallow: /
User-agent: CherryPickerSE/1.0
Disallow: /
User-agent: CherryPickerElite/1.0
Disallow: /
User-agent: EmailCollector/1.0
Disallow: /
User-agent: EmailWolf 1.00
Disallow: /
User-agent: Crescent Internet ToolPak HTTP OLE Control v.1.0
Disallow: /
User-agent: EmailSiphon
Disallow: /
User-agent: Mozilla/2.0 (compatible; NEWT ActiveX; Win32)
Disallow: /
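A quick way to sanity-check this file is to feed it to the parser changed above. The sketch follows the _test() function in robotparser2.py; the import path and the argument-less constructor are assumptions based on the changelog entry for linkcheck/robotparser2.py, not verified API.

    from linkcheck import robotparser2

    rp = robotparser2.RobotFileParser()
    rp.parse(open("test/robots.txt").readlines())
    # the default record only blocks specific paths, so the front page is allowed
    print rp.can_fetch("*", "http://www.musi-cal.com/")                      # 1
    print rp.can_fetch("*", "http://www.musi-cal.com/cgi-bin/event-search")  # 0
    # EmailSiphon matches its own record and is shut out completely
    print rp.can_fetch("EmailSiphon", "http://www.musi-cal.com/")            # 0
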