robotparser

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@219 e7d03fd6-7b0d-0410-9947-9c21f3af8025
calvin 2001-01-05 11:42:11 +00:00
parent c9b63453d1
commit 77b88c4a4e
8 changed files with 744 additions and 40 deletions

MANIFEST.in

@@ -1,5 +1,6 @@
include MANIFEST.in
include README FAQ INSTALL LICENSE TODO draft-gilman-news-url-00.txt
include norobots-rfc.html
include linkcheckerrc linkchecker linkchecker.bat linkchecker.1 create.sql
include lc.cgi lc.fcgi lc.sz_fcgi
include Makefile
@@ -8,7 +9,7 @@ include debian/rules debian/changelog debian/copyright debian/control
include debian/dirs debian/docs debian/links debian/postinst
include debian/prerm
include DNS/README
include test/viewprof.py test/profiletest.py test/*.html
include test/viewprof.py test/profiletest.py test/*.html test/robots.txt
include rpm_build_script
recursive-include locale *.mo
recursive-include po *.po *.py Makefile

TODO

@@ -1 +1,3 @@
Feature complete, only fixes.
add test/robots.txt
add norobots-rfc.html

debian/changelog

@@ -3,9 +3,12 @@ linkchecker (1.2.13) unstable; urgency=low
* linkcheck/HttpUrlData.py:
- better redirection handling
- really use host variable in "Host:" header
* linkcheck/robotparser2.py: better redirection handling
- support response code "305 Use proxy"
* linkcheck/robotparser2.py:
- better redirection handling
- cope with user-agent: lines without a preceding blank line
-- Bastian Kleineidam <calvin@users.sourceforge.net> Fri, 5 Jan 2001 00:34:21 +0100
-- Bastian Kleineidam <calvin@users.sourceforge.net> Fri, 5 Jan 2001 10:51:59 +0100
linkchecker (1.2.12) unstable; urgency=low

debian/rules

@@ -47,7 +47,7 @@ install: build
install -d -m 755 $(DOCDIR)/examples
install -c -m 644 DNS/README $(DOCDIR)/README_DNS.txt
install -d -m 755 $(DOCDIR)/tests/linkcheck
install -c -m 644 test/*.html test/*.py $(DOCDIR)/tests/linkcheck
install -c -m 644 test/*.html test/*.py test/robots.txt $(DOCDIR)/tests/linkcheck
install -d -m 755 $(DOCDIR)/tests/dns
install -c -m 644 tests/*.py $(DOCDIR)/tests/dns
# install system wide configuration file in etc

linkcheck/HttpUrlData.py

@@ -86,6 +86,11 @@ class HttpUrlData(UrlData):
Config.debug(str(status)+", "+str(statusText)+", "+str(self.mime)+"\n")
has301status = 0
while 1:
# proxy enforcement
if status == 305 and self.mime:
status, statusText, self.mime = self._getHttpRequest(
proxy=self.mime.get("Location"))
# follow redirections
tries = 0
redirected = self.urlName
@@ -155,17 +160,19 @@ class HttpUrlData(UrlData):
self.setValid("OK")
def _getHttpRequest(self, method="HEAD"):
def _getHttpRequest(self, method="HEAD", proxy=None):
"Put request and return (status code, status text, mime object)"
if self.proxy:
Config.debug("DEBUG: using proxy %s\n" % self.proxy)
host = self.proxy
if self.proxy and not proxy:
proxy = self.proxy
if proxy:
Config.debug("DEBUG: using proxy %s\n" % proxy)
host = proxy
else:
host = self.urlTuple[1]
if self.urlConnection:
self.closeConnection()
self.urlConnection = self._getHTTPObject(host)
if self.proxy:
if proxy:
path = urlparse.urlunparse(self.urlTuple)
else:
path = urlparse.urlunparse(('', '', self.urlTuple[2],
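
The hunks above add "305 Use proxy" support: when a response carries status 305, _getHttpRequest is called again with the proxy taken from the Location header. A minimal standalone sketch of that retry step, assuming only a callable with the same (status, statusText, mime) return shape as _getHttpRequest:

    def retry_via_proxy_on_305(get_request):
        # get_request(proxy=None) is assumed to return
        # (status, statusText, mime_headers), like _getHttpRequest above
        status, text, mime = get_request()
        if status == 305 and mime:
            # the server demands a proxy: repeat the request through the
            # host named in the Location header of the 305 response
            status, text, mime = get_request(proxy=mime.get("Location"))
        return status, text, mime
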

linkcheck/robotparser2.py

@@ -2,20 +2,6 @@
Copyright (C) 2000 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
The robots.txt Exclusion Protocol is implemented as specified in
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
@@ -50,7 +36,6 @@ class RobotFileParser:
import httplib
tries = 0
while tries<5:
_debug(self.host+self.path)
connection = httplib.HTTP(self.host)
connection.putrequest("GET", self.path)
connection.putheader("Host", self.host)
@@ -72,7 +57,9 @@ class RobotFileParser:
self.parse(connection.getfile().readlines())
def parse(self, lines):
"""parse the input lines from a robot.txt file"""
"""parse the input lines from a robot.txt file.
We allow that a user-agent: line is not preceded by
one or more blank lines."""
state = 0
linenumber = 0
entry = Entry()
@@ -82,7 +69,9 @@ class RobotFileParser:
linenumber = linenumber + 1
if not line:
if state==1:
_debug("line %d: no rules found" % linenumber)
_debug("line %d: warning: you should insert"
" allow: or disallow: directives below any"
" user-agent: line" % linenumber)
entry = Entry()
state = 0
elif state==2:
@@ -102,29 +91,31 @@ class RobotFileParser:
line[1] = string.strip(line[1])
if line[0] == "user-agent":
if state==2:
_debug("line %d: user-agent in the middle of "
"rules" % linenumber)
else:
entry.useragents.append(string.strip(line[1]))
state = 1
_debug("line %d: warning: you should insert a blank"
" line before any user-agent"
" directive" % linenumber)
self.entries.append(entry)
entry = Entry()
entry.useragents.append(line[1])
state = 1
elif line[0] == "disallow":
if state==0:
_debug("line %d: disallow without user "
"agents" % linenumber)
_debug("line %d: error: you must insert a user-agent:"
" directive before this line" % linenumber)
else:
entry.rulelines.append(RuleLine(line[1], 0))
state = 2
elif line[0] == "allow":
if state==0:
_debug("line %d: allow without user "
"agents" % linenumber)
_debug("line %d: error: you must insert a user-agent:"
" directive before this line" % linenumber)
else:
entry.rulelines.append(RuleLine(line[1], 1))
else:
_debug("line %d: unknown key %s" % (linenumber,
_debug("line %d: warning: unknown key %s" % (linenumber,
line[0]))
else:
_debug("line %d: malformed line %s" % (linenumber, line))
_debug("line %d: error: malformed line %s"%(linenumber, line))
if state==2:
self.entries.append(entry)
_debug("Parsed rules:\n%s" % str(self))
@@ -154,7 +145,10 @@ class RobotFileParser:
ret = ret + str(entry) + "\n"
return ret
class RuleLine:
"""A rule line is a single "Allow:" (allowance==1) or "Disallow:"
(allowance==0) followed by a path."""
def __init__(self, path, allowance):
self.path = urllib.quote(path)
self.allowance = allowance
@@ -163,10 +157,11 @@ class RuleLine:
return self.path=="*" or re.match(self.path, filename)
def __str__(self):
return (self.allowance and "Disallow" or "Allow")+": "+self.path
return (self.allowance and "Allow" or "Disallow")+": "+self.path
class Entry:
"""An entry has one or more user-agents and zero or more rulelines"""
def __init__(self):
self.useragents = []
self.rulelines = []
@@ -191,7 +186,7 @@ class Entry:
def allowance(self, filename):
"""Preconditions:
- our agent applies to this entry
- file is URL decoded"""
- filename is URL decoded"""
for line in self.rulelines:
if line.applies_to(filename):
return line.allowance
@@ -208,7 +203,6 @@ def _test():
rp.read()
else:
rp.parse(open(sys.argv[1]).readlines())
print rp
print rp.can_fetch('*', 'http://www.musi-cal.com/')
print rp.can_fetch('Musi-Cal-Robot/1.0',
'http://www.musi-cal.com/cgi-bin/event-search'
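
With these changes robotparser2 accepts files whose user-agent: lines are not separated by blank lines and reports clearer warnings. A hypothetical usage sketch against the test/robots.txt added below; the import path is an assumption about the package layout:

    from linkcheck import robotparser2

    rp = robotparser2.RobotFileParser()
    rp.parse(open("test/robots.txt").readlines())
    # the default "*" record below disallows /ccrd
    print(rp.can_fetch("*", "http://www.musi-cal.com/ccrd/foo.html"))
    # ExtractorPro has its own record with "Disallow: /"
    print(rp.can_fetch("ExtractorPro", "http://www.musi-cal.com/"))
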

norobots-rfc.html (new file)

@@ -0,0 +1,651 @@
<html>
<head>
<title>
A Standard for Robot Exclusion
</title>
</head>
<body bgcolor=white>
<div align=right>
<font size="+1" color=maroon>
<i>
The Web Robots Pages
<a href="robots.html"><img
src="lt.gif"
border=0 WIDTH=9 HEIGHT=12></a>
</i>
</font>
</div>
<hr>
<pre>
Network Working Group M. Koster
INTERNET DRAFT WebCrawler
Category: Informational November 1996
Dec 4, 1996 Expires June 4, 1997
&lt;draft-koster-robots-00.txt&gt;
A Method for Web Robots Control
Status of this Memo
This document is an Internet-Draft. Internet-Drafts are
working documents of the Internet Engineering Task Force
(IETF), its areas, and its working groups. Note that other
groups may also distribute working documents as Internet-
Drafts.
Internet-Drafts are draft documents valid for a maximum of six
months and may be updated, replaced, or obsoleted by other
documents at any time. It is inappropriate to use Internet-
Drafts as reference material or to cite them other than as
``work in progress.''
To learn the current status of any Internet-Draft, please
check the ``1id-abstracts.txt'' listing contained in the
Internet- Drafts Shadow Directories on ftp.is.co.za (Africa),
nic.nordu.net (Europe), munnari.oz.au (Pacific Rim),
ds.internic.net (US East Coast), or ftp.isi.edu (US West
Coast).
Koster draft-koster-robots-00.txt [Page 1]
INTERNET DRAFT A Method for Robots Control December 4, 1996
Table of Contents
1. Abstract . . . . . . . . . . . . . . . . . . . . . . . . . 2
2. Introduction . . . . . . . . . . . . . . . . . . . . . . . 2
3. Specification . . . . . . . . . . . . . . . . . . . . . . . 3
3.1 Access method . . . . . . . . . . . . . . . . . . . . . . . 3
3.2 File Format Description . . . . . . . . . . . . . . . . . . 4
3.2.1 The User-agent line . . . . . . . . . . . . . . . . . . . . 5
3.2.2 The Allow and Disallow lines . . . . . . . . . . . . . . . 5
3.3 Formal Syntax . . . . . . . . . . . . . . . . . . . . . . . 6
3.4 Expiration . . . . . . . . . . . . . . . . . . . . . . . . 8
4. Examples . . . . . . . . . . . . . . . . . . . . . . . . . 8
5. Implementor's Notes . . . . . . . . . . . . . . . . . . . . 9
5.1 Backwards Compatibility . . . . . . . . . . . . . . . . . . 9
5.2 Interoperability . . .. . . . . . . . . . . . . . . . . . . 10
6. Security Considerations . . . . . . . . . . . . . . . . . . 10
7. References . . . . . . . . . . . . . . . . . . . . . . . . 10
8. Acknowledgements . . . . . . . . . . . . . . . . . . . . . 11
9. Author's Address . . . . . . . . . . . . . . . . . . . . . 11
1. Abstract
This memo defines a method for administrators of sites on the World-
Wide Web to give instructions to visiting Web robots, most
importantly what areas of the site are to be avoided.
This document provides a more rigid specification of the Standard
for Robots Exclusion [1], which has been in widespread use by
the Web community since 1994.
2. Introduction
Web Robots (also called "Wanderers" or "Spiders") are Web client
programs that automatically traverse the Web's hypertext structure
by retrieving a document, and recursively retrieving all documents
that are referenced.
Note that "recursively" here doesn't limit the definition to any
specific traversal algorithm; even if a robot applies some heuristic
to the selection and order of documents to visit and spaces out
requests over a long space of time, it qualifies to be called a
robot.
Robots are often used for maintenance and indexing purposes, by
people other than the administrators of the site being visited. In
some cases such visits may have undesirable effects which the
Koster draft-koster-robots-00.txt [Page 2]
INTERNET DRAFT A Method for Robots Control December 4, 1996
administrators would like to prevent, such as indexing of an
unannounced site, traversal of parts of the site which require vast
resources of the server, recursive traversal of an infinite URL
space, etc.
The technique specified in this memo allows Web site administrators
to indicate to visiting robots which parts of the site should be
avoided. It is solely up to the visiting robot to consult this
information and act accordingly. Blocking parts of the Web site
regardless of a robot's compliance with this method is outside
the scope of this memo.
3. The Specification
This memo specifies a format for encoding instructions to visiting
robots, and specifies an access method to retrieve these
instructions. Robots must retrieve these instructions before visiting
other URLs on the site, and use the instructions to determine if
other URLs on the site can be accessed.
3.1 Access method
The instructions must be accessible via HTTP [2] from the site that
the instructions are to be applied to, as a resource of Internet
Media Type [3] "text/plain" under a standard relative path on the
server: "/robots.txt".
For convenience we will refer to this resource as the "/robots.txt
file", though the resource need in fact not originate from a file-
system.
Some examples of URLs [4] for sites and URLs for corresponding
"/robots.txt" sites:
http://www.foo.com/welcome.html http://www.foo.com/robots.txt
http://www.bar.com:8001/ http://www.bar.com:8001/robots.txt
If the server response indicates Success (HTTP 2xx Status Code),
the robot must read the content, parse it, and follow any
instructions applicable to that robot.
If the server response indicates the resource does not exist (HTTP
Status Code 404), the robot can assume no instructions are
available, and that access to the site is not restricted by
/robots.txt.
Koster draft-koster-robots-00.txt [Page 3]
INTERNET DRAFT A Method for Robots Control December 4, 1996
Specific behaviors for other server responses are not required by
this specification, though the following behaviours are recommended:
- On server response indicating access restrictions (HTTP Status
Code 401 or 403) a robot should regard access to the site
completely restricted.
- If the request attempt resulted in temporary failure, a robot
should defer visits to the site until such time as the resource
can be retrieved.
- On server response indicating Redirection (HTTP Status Code 3XX)
a robot should follow the redirects until a resource can be
found.
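
A compact sketch of the dispatch these recommendations imply; the returned labels are invented placeholders, not part of the draft:

    def robots_txt_action(status):
        # map an HTTP status for GET /robots.txt to the recommended behaviour
        if 200 <= status < 300:
            return "parse and obey"        # Success
        if status == 404:
            return "no restrictions"       # resource does not exist
        if status in (401, 403):
            return "treat site as closed"  # access restricted
        if 300 <= status < 400:
            return "follow redirect"       # until a resource can be found
        return "retry later"               # temporary failure: defer visits
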
3.2 File Format Description
The instructions are encoded as a formatted plain text object,
described here. A complete BNF-like description of the syntax of this
format is given in section 3.3.
The format logically consists of a non-empty set of records,
separated by blank lines. The records consist of a set of lines of
the form:
&lt;Field&gt; ":" &lt;value&gt;
In this memo we refer to lines with a Field "foo" as "foo lines".
The record starts with one or more User-agent lines, specifying
which robots the record applies to, followed by "Disallow" and
"Allow" instructions to that robot. For example:
User-agent: webcrawler
User-agent: infoseek
Allow: /tmp/ok.html
Disallow: /tmp
Disallow: /user/foo
These lines are discussed separately below.
Lines with Fields not explicitly specified by this specification
may occur in the /robots.txt, allowing for future extension of the
format. Consult the BNF for restrictions on the syntax of such
extensions. Note specifically that for backwards compatibility
with robots implementing earlier versions of this specification,
breaking of lines is not allowed.
Koster draft-koster-robots-00.txt [Page 4]
INTERNET DRAFT A Method for Robots Control December 4, 1996
Comments are allowed anywhere in the file, and consist of optional
whitespace, followed by a comment character '#' followed by the
comment, terminated by the end-of-line.
3.2.1 The User-agent line
Name tokens are used to allow robots to identify themselves via a
simple product token. Name tokens should be short and to the
point. The name token a robot chooses for itself should be sent
as part of the HTTP User-agent header, and must be well documented.
These name tokens are used in User-agent lines in /robots.txt to
identify to which specific robots the record applies. The robot
must obey the first record in /robots.txt that contains a User-
Agent line whose value contains the name token of the robot as a
substring. The name comparisons are case-insensitive. If no such
record exists, it should obey the first record with a User-agent
line with a "*" value, if present. If no record satisfies either
condition, or no records are present at all, access is unlimited.
For example, a fictional company FigTree Search Services, which names
its robot "Fig Tree" and sends HTTP requests like:
GET / HTTP/1.0
User-agent: FigTree/0.1 Robot libwww-perl/5.04
might scan the "/robots.txt" file for records with:
User-agent: figtree
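
The record-selection rule above is easy to get backwards, so here is a short sketch of it; the (user_agent_values, rule_lines) record shape is invented for illustration:

    def select_record(records, robot_token):
        # records: list of (user_agent_values, rule_lines); first match wins
        token = robot_token.lower()
        for agents, rules in records:
            if any(token in agent.lower() for agent in agents):
                return rules           # a User-agent value contains our token
        for agents, rules in records:
            if "*" in agents:
                return rules           # fall back to the first "*" record
        return None                    # no record applies: access is unlimited
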
3.2.2 The Allow and Disallow lines
These lines indicate whether accessing a URL that matches the
corresponding path is allowed or disallowed. Note that these
instructions apply to any HTTP method on a URL.
To evaluate if access to a URL is allowed, a robot must attempt to
match the paths in Allow and Disallow lines against the URL, in the
order they occur in the record. The first match found is used. If no
match is found, the default assumption is that the URL is allowed.
The /robots.txt URL is always allowed, and must not appear in the
Allow/Disallow rules.
The matching process compares every octet in the path portion of
the URL and the path from the record. If a %xx encoded octet is
Koster draft-koster-robots-00.txt [Page 5]
INTERNET DRAFT A Method for Robots Control December 4, 1996
encountered, it is decoded prior to comparison, unless it is the
"/" character, which has special meaning in a path. The match
evaluates positively if and only if the end of the path from the
record is reached before a difference in octets is encountered.
This table illustrates some examples:
Record Path URL path Matches
/tmp /tmp yes
/tmp /tmp.html yes
/tmp /tmp/a.html yes
/tmp/ /tmp no
/tmp/ /tmp/ yes
/tmp/ /tmp/a.html yes
/a%3cd.html /a%3cd.html yes
/a%3Cd.html /a%3cd.html yes
/a%3cd.html /a%3Cd.html yes
/a%3Cd.html /a%3Cd.html yes
/a%2fb.html /a%2fb.html yes
/a%2fb.html /a/b.html no
/a/b.html /a%2fb.html no
/a/b.html /a/b.html yes
/%7ejoe/index.html /~joe/index.html yes
/~joe/index.html /%7Ejoe/index.html yes
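
One possible rendering of this octet-level comparison in code, decoding %xx escapes on both sides except %2F; the draft prescribes the behaviour, not this implementation:

    import re

    def _decode(path):
        # undo %xx escapes, but keep %2F: an encoded "/" must not match a real one
        return re.sub(r"%(?!2[fF])([0-9A-Fa-f]{2})",
                      lambda m: chr(int(m.group(1), 16)), path)

    def rule_matches(record_path, url_path):
        # match if the whole record path is consumed before any octet differs
        return _decode(url_path).startswith(_decode(record_path))

For instance, rule_matches("/tmp/", "/tmp") is false and rule_matches("/%7ejoe/index.html", "/~joe/index.html") is true, matching the table.
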
3.3 Formal Syntax
This is a BNF-like description, using the conventions of RFC 822 [5],
except that "|" is used to designate alternatives. Briefly, literals
are quoted with "", parentheses "(" and ")" are used to group
elements, optional elements are enclosed in [brackets], and elements
may be preceded with &lt;n&gt;* to designate n or more repetitions of the
following element; n defaults to 0.
robotstxt = *blankcomment
| *blankcomment record *( 1*commentblank 1*record )
*blankcomment
blankcomment = 1*(blank | commentline)
commentblank = *commentline blank *(blankcomment)
blank = *space CRLF
CRLF = CR LF
record = *commentline agentline *(commentline | agentline)
1*ruleline *(commentline | ruleline)
Koster draft-koster-robots-00.txt [Page 6]
INTERNET DRAFT A Method for Robots Control December 4, 1996
agentline = "User-agent:" *space agent [comment] CRLF
ruleline = (disallowline | allowline | extension)
disallowline = "Disallow" ":" *space path [comment] CRLF
allowline = "Allow" ":" *space rpath [comment] CRLF
extension = token : *space value [comment] CRLF
value = &lt;any CHAR except CR or LF or "#"&gt;
commentline = comment CRLF
comment = *blank "#" anychar
space = 1*(SP | HT)
rpath = "/" path
agent = token
anychar = &lt;any CHAR except CR or LF&gt;
CHAR = &lt;any US-ASCII character (octets 0 - 127)&gt;
CTL = &lt;any US-ASCII control character
(octets 0 - 31) and DEL (127)&gt;
CR = &lt;US-ASCII CR, carriage return (13)&gt;
LF = &lt;US-ASCII LF, linefeed (10)&gt;
SP = &lt;US-ASCII SP, space (32)&gt;
HT = &lt;US-ASCII HT, horizontal-tab (9)&gt;
The syntax for "token" is taken from RFC 1945 [2], reproduced here for
convenience:
token = 1*&lt;any CHAR except CTLs or tspecials&gt;
tspecials = "(" | ")" | "&lt;" | "&gt;" | "@"
| "," | ";" | ":" | "\" | &lt;"&gt;
| "/" | "[" | "]" | "?" | "="
| "{" | "}" | SP | HT
The syntax for "path" is defined in RFC 1808 [6], reproduced here for
convenience:
path = fsegment *( "/" segment )
fsegment = 1*pchar
segment = *pchar
pchar = uchar | ":" | "@" | "&amp;" | "="
uchar = unreserved | escape
unreserved = alpha | digit | safe | extra
escape = "%" hex hex
hex = digit | "A" | "B" | "C" | "D" | "E" | "F" |
"a" | "b" | "c" | "d" | "e" | "f"
alpha = lowalpha | hialpha
Koster draft-koster-robots-00.txt [Page 7]
INTERNET DRAFT A Method for Robots Control December 4, 1996
lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
"j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
"s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
hialpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
"J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
"S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
"8" | "9"
safe = "$" | "-" | "_" | "." | "+"
extra = "!" | "*" | "'" | "(" | ")" | ","
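
As a reading aid for the grammar, one plausible way to split a single line into field and value (it does not enforce the full token and path syntax above):

    def parse_line(raw):
        # comments run from "#" to end of line; blank results are skipped
        line = raw.split("#", 1)[0].strip()
        if not line or ":" not in line:
            return None
        field, value = line.split(":", 1)
        return field.strip().lower(), value.strip()
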
3.4 Expiration
Robots should cache /robots.txt files, but if they do they must
periodically verify the cached copy is fresh before using its
contents.
Standard HTTP cache-control mechanisms can be used by both origin
server and robots to influence the caching of the /robots.txt file.
Specifically, robots should take note of the Expires header set by the
origin server.
If no cache-control directives are present robots should default to
an expiry of 7 days.
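
A trivial sketch of that caching rule in epoch seconds; the helper name and argument shape are made up:

    SEVEN_DAYS = 7 * 24 * 3600

    def robots_txt_valid_until(fetched_at, expires_at=None):
        # prefer the origin server's Expires time, else fall back to 7 days
        return expires_at if expires_at is not None else fetched_at + SEVEN_DAYS
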
4. Examples
This section contains an example of how a /robots.txt may be used.
A fictional site may have the following URLs:
http://www.fict.org/
http://www.fict.org/index.html
http://www.fict.org/robots.txt
http://www.fict.org/server.html
http://www.fict.org/services/fast.html
http://www.fict.org/services/slow.html
http://www.fict.org/orgo.gif
http://www.fict.org/org/about.html
http://www.fict.org/org/plans.html
http://www.fict.org/%7Ejim/jim.html
http://www.fict.org/%7Emak/mak.html
The site's /robots.txt may have specific rules for robots that
send an HTTP User-agent "UnhipBot/0.1", "WebCrawler/3.0", and
Koster draft-koster-robots-00.txt [Page 8]
INTERNET DRAFT A Method for Robots Control December 4, 1996
"Excite/1.0", and a set of default rules:
# /robots.txt for http://www.fict.org/
# comments to webmaster@fict.org
User-agent: unhipbot
Disallow: /
User-agent: webcrawler
User-agent: excite
Disallow:
User-agent: *
Disallow: /org/plans.html
Allow: /org/
Allow: /serv
Allow: /~mak
Disallow: /
The following matrix shows which robots are allowed to access URLs:
unhipbot webcrawler other
&amp; excite
http://www.fict.org/ No Yes No
http://www.fict.org/index.html No Yes No
http://www.fict.org/robots.txt Yes Yes Yes
http://www.fict.org/server.html No Yes Yes
http://www.fict.org/services/fast.html No Yes Yes
http://www.fict.org/services/slow.html No Yes Yes
http://www.fict.org/orgo.gif No Yes No
http://www.fict.org/org/about.html No Yes Yes
http://www.fict.org/org/plans.html No Yes No
http://www.fict.org/%7Ejim/jim.html No Yes No
http://www.fict.org/%7Emak/mak.html No Yes Yes
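
The "other" column of the matrix can be reproduced with a first-match-wins scan over the "*" record, as sketched below (paths are assumed already %-decoded; this is an illustration, not the draft's wording):

    default_rules = [("/org/plans.html", False), ("/org/", True),
                     ("/serv", True), ("/~mak", True), ("/", False)]

    def allowed(url_path, rules):
        # the first rule whose path is a prefix of the URL path decides
        for prefix, allow in rules:
            if url_path.startswith(prefix):
                return allow
        return True   # no rule matched: allowed by default

    # allowed("/org/about.html", default_rules)  -> True   ("other": Yes)
    # allowed("/org/plans.html", default_rules)  -> False  ("other": No)
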
5. Notes for Implementors
5.1 Backwards Compatibility
Previous versions of this specification didn't provide the Allow line. The
introduction of the Allow line causes robots to behave slightly
differently under either specification:
If a /robots.txt contains an Allow which overrides a later occurring
Disallow, a robot ignoring Allow lines will not retrieve those
parts. This is considered acceptable because there is no requirement
for a robot to access URLs it is allowed to retrieve, and it is safe,
in that no URLs a Web site administrator wants to Disallow will be
allowed. It is expected this may in fact encourage robots to upgrade
compliance to the specification in this memo.
Koster draft-koster-robots-00.txt [Page 9]
INTERNET DRAFT A Method for Robots Control December 4, 1996
5.2 Interoperability
Implementors should pay particular attention to robustness when
parsing the /robots.txt file. Web site administrators who are not
aware of the /robots.txt mechanism often notice repeated failing
requests for it in their log files, and react by putting up pages
asking "What are you looking for?".
As the majority of /robots.txt files are created with platform-
specific text editors, robots should be liberal in accepting files
with different end-of-line conventions, specifically CR and LF in
addition to CRLF.
6. Security Considerations
There are a few risks in the method described here, which may affect
either origin server or robot.
Web site administrators must realise this method is voluntary, and
is not sufficient to guarantee some robots will not visit restricted
parts of the URL space. Failure to use proper authentication or other
restriction may result in exposure of restricted information. It is even
possible that the occurrence of paths in the /robots.txt file may
expose the existence of resources not otherwise linked to on the
site, which may aid people guessing for URLs.
Robots need to be aware that the amount of resources spent on dealing
with the /robots.txt is a function of the file contents, which is not
under the control of the robot. For example, the contents may be
larger in size than the robot can deal with. To prevent denial-of-
service attacks, robots are therefore encouraged to place limits on
the resources spent on processing of /robots.txt.
The /robots.txt directives are retrieved and applied in separate,
possibly unauthenticated HTTP transactions, and it is possible that
one server can impersonate another or otherwise intercept a
/robots.txt, and provide a robot with false information. This
specification does not preclude authentication and encryption
from being employed to increase security.
7. Acknowledgements
The author would like to thank the subscribers to the robots mailing list for
their contributions to this specification.
Koster draft-koster-robots-00.txt [Page 10]
INTERNET DRAFT A Method for Robots Control December 4, 1996
8. References
[1] Koster, M., "A Standard for Robot Exclusion",
http://info.webcrawler.com/mak/projects/robots/norobots.html,
June 1994.
[2] Berners-Lee, T., Fielding, R., and Frystyk, H., "Hypertext
Transfer Protocol -- HTTP/1.0." RFC 1945, MIT/LCS, May 1996.
[3] Postel, J., "Media Type Registration Procedure." RFC 1590,
USC/ISI, March 1994.
[4] Berners-Lee, T., Masinter, L., and M. McCahill, "Uniform
Resource Locators (URL)", RFC 1738, CERN, Xerox PARC,
University of Minnesota, December 1994.
[5] Crocker, D., "Standard for the Format of ARPA Internet Text
Messages", STD 11, RFC 822, UDEL, August 1982.
[6] Fielding, R., "Relative Uniform Resource Locators", RFC 1808,
UC Irvine, June 1995.
9. Author's Address
Martijn Koster
WebCrawler
America Online
690 Fifth Street
San Francisco
CA 94107
Phone: 415-3565431
EMail: m.koster@webcrawler.com
Expires June 4, 1997
Koster draft-koster-robots-00.txt [Page 11]
</pre>
<hr>
<div align=right>
<address>
<small>
<A href="http://info.webcrawler.com/mak/projects/robots/robots.html">The
Web Robots Pages</A>
</small>
</address>
</div>
</body>
</html>

test/robots.txt (new file)

@@ -0,0 +1,46 @@
# /robots.txt for http://www.musi-cal.com/
# See http://info.webcrawler.com/mak/projects/robots/norobots.html
# Skip Montanaro (skip@mojam.com)
# - adapted from the robots.txt file at http://web.nexor.co.uk/
# by default
User-agent: *
Disallow: /ccrd # not useful to spiders
Disallow: /click # not useful to spiders
Disallow: /search # dynamic
Disallow: /hc # dynamic
Disallow: /subbatch # dynamic
Disallow: /vadd # dynamic
Disallow: /vsearch # dynamic
Disallow: /vedit # dynamic
Disallow: /vdelete # dynamic
Disallow: /cgi-bin # dynamic
Disallow: /images/ # useless images
Disallow: /icons/ # useless images
Disallow: /concerts/ # deprecated URL form
Disallow: /conferences # defunct
Disallow: /musician # defunct
Disallow: /~skip/volkswagen # defunct
Disallow: /%7Eskip/volkswagen # defunct
# disallow a bunch of ill-behaved user agents (doubt this will deter them...)
User-agent: ExtractorPro
Disallow: /
User-agent: EmailSiphon
Disallow: /
User-agent: EmailWolf
Disallow: /
User-agent: CherryPickerSE/1.0
Disallow: /
User-agent: CherryPickerElite/1.0
Disallow: /
User-agent: EmailCollector/1.0
Disallow: /
User-agent: EmailWolf 1.00
Disallow: /
User-agent: Crescent Internet ToolPak HTTP OLE Control v.1.0
Disallow: /
User-agent: EmailSiphon
Disallow: /
User-agent: Mozilla/2.0 (compatible; NEWT ActiveX; Win32)
Disallow: /