mirror of https://github.com/Hopiu/linkchecker.git, synced 2026-05-04 21:04:41 +00:00
robotparser
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@219 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in: parent c9b63453d1, commit 77b88c4a4e
8 changed files with 744 additions and 40 deletions
MANIFEST.in

@@ -1,5 +1,6 @@
 include MANIFEST.in
 include README FAQ INSTALL LICENSE TODO draft-gilman-news-url-00.txt
+include norobots-rfc.html
 include linkcheckerrc linkchecker linkchecker.bat linkchecker.1 create.sql
 include lc.cgi lc.fcgi lc.sz_fcgi
 include Makefile
@@ -8,7 +9,7 @@ include debian/rules debian/changelog debian/copyright debian/control
 include debian/dirs debian/docs debian/links debian/postinst
 include debian/prerm
 include DNS/README
-include test/viewprof.py test/profiletest.py test/*.html
+include test/viewprof.py test/profiletest.py test/*.html test/robots.txt
 include rpm_build_script
 recursive-include locale *.mo
 recursive-include po *.po *.py Makefile

2 TODO

@@ -1 +1,3 @@
 Feature complete, only fixes.
+add test/robots.txt
+add norobots-rfc.html

7 debian/changelog vendored

@@ -3,9 +3,12 @@ linkchecker (1.2.13) unstable; urgency=low
   * linkcheck/HttpUrlData.py:
     - better redirection handling
     - really use host variable in "Host:" header
-  * linkcheck/robotparser2.py: better redirection handling
+    - support response code "305 Use proxy"
+  * linkcheck/robotparser2.py:
+    - better redirection handling
+    - cope with user-agent: lines without a preceding blank line

- -- Bastian Kleineidam <calvin@users.sourceforge.net> Fri, 5 Jan 2001 00:34:21 +0100
+ -- Bastian Kleineidam <calvin@users.sourceforge.net> Fri, 5 Jan 2001 10:51:59 +0100

 linkchecker (1.2.12) unstable; urgency=low

2 debian/rules vendored

@@ -47,7 +47,7 @@ install: build
 	install -d -m 755 $(DOCDIR)/examples
 	install -c -m 644 DNS/README $(DOCDIR)/README_DNS.txt
 	install -d -m 755 $(DOCDIR)/tests/linkcheck
-	install -c -m 644 test/*.html test/*.py $(DOCDIR)/tests/linkcheck
+	install -c -m 644 test/*.html test/*.py test/robots.txt $(DOCDIR)/tests/linkcheck
 	install -d -m 755 $(DOCDIR)/tests/dns
 	install -c -m 644 tests/*.py $(DOCDIR)/tests/dns
 	# install system wide configuration file in etc

linkcheck/HttpUrlData.py

@@ -86,6 +86,11 @@ class HttpUrlData(UrlData):
         Config.debug(str(status)+", "+str(statusText)+", "+str(self.mime)+"\n")
         has301status = 0
         while 1:
+            # proxy enforcement
+            if status == 305 and self.mime:
+                status, statusText, self.mime = self._getHttpRequest(
+                    proxy=self.mime.get("Location"))
+
             # follow redirections
             tries = 0
             redirected = self.urlName
@@ -155,17 +160,19 @@ class HttpUrlData(UrlData):
         self.setValid("OK")


-    def _getHttpRequest(self, method="HEAD"):
+    def _getHttpRequest(self, method="HEAD", proxy=None):
         "Put request and return (status code, status text, mime object)"
-        if self.proxy:
-            Config.debug("DEBUG: using proxy %s\n" % self.proxy)
-            host = self.proxy
+        if self.proxy and not proxy:
+            proxy = self.proxy
+        if proxy:
+            Config.debug("DEBUG: using proxy %s\n" % proxy)
+            host = proxy
         else:
             host = self.urlTuple[1]
         if self.urlConnection:
             self.closeConnection()
         self.urlConnection = self._getHTTPObject(host)
-        if self.proxy:
+        if proxy:
             path = urlparse.urlunparse(self.urlTuple)
         else:
             path = urlparse.urlunparse(('', '', self.urlTuple[2],

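The "305 Use proxy" support added above re-issues the request through the proxy named in the response's Location header. The following is a minimal standalone sketch of that retry, written against the legacy httplib API that robotparser2.py below also uses; the function name and variables are illustrative only and are not part of HttpUrlData.py.

    import httplib, urlparse

    def head_with_305_retry(url):
        # issue a HEAD request; on "305 Use proxy" repeat it through the
        # proxy given in the Location header (a sketch of the hunk above,
        # not the actual HttpUrlData implementation)
        parts = urlparse.urlparse(url)
        conn = httplib.HTTP(parts[1])
        conn.putrequest("HEAD", parts[2] or "/")
        conn.putheader("Host", parts[1])
        conn.endheaders()
        status, reason, mime = conn.getreply()
        if status == 305 and mime:
            proxy = mime.get("Location")
            conn = httplib.HTTP(proxy)
            # a proxy expects the full URL as the request path
            conn.putrequest("HEAD", url)
            conn.putheader("Host", parts[1])
            conn.endheaders()
            status, reason, mime = conn.getreply()
        return status, reason, mime
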
linkcheck/robotparser2.py

@@ -2,20 +2,6 @@

 Copyright (C) 2000 Bastian Kleineidam

-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 The robots.txt Exclusion Protocol is implemented as specified in
 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
 """
@@ -50,7 +36,6 @@ class RobotFileParser:
         import httplib
         tries = 0
         while tries<5:
-            _debug(self.host+self.path)
             connection = httplib.HTTP(self.host)
             connection.putrequest("GET", self.path)
             connection.putheader("Host", self.host)
@@ -72,7 +57,9 @@ class RobotFileParser:
         self.parse(connection.getfile().readlines())

     def parse(self, lines):
-        """parse the input lines from a robot.txt file"""
+        """parse the input lines from a robot.txt file.
+        We allow that a user-agent: line is not preceded by
+        one or more blank lines."""
         state = 0
         linenumber = 0
         entry = Entry()
@@ -82,7 +69,9 @@ class RobotFileParser:
             linenumber = linenumber + 1
             if not line:
                 if state==1:
-                    _debug("line %d: no rules found" % linenumber)
+                    _debug("line %d: warning: you should insert"
+                           " allow: or disallow: directives below any"
+                           " user-agent: line" % linenumber)
                     entry = Entry()
                     state = 0
                 elif state==2:
@@ -102,29 +91,31 @@ class RobotFileParser:
                 line[1] = string.strip(line[1])
                 if line[0] == "user-agent":
                     if state==2:
-                        _debug("line %d: user-agent in the middle of "
-                               "rules" % linenumber)
-                    else:
-                        entry.useragents.append(string.strip(line[1]))
-                        state = 1
+                        _debug("line %d: warning: you should insert a blank"
+                               " line before any user-agent"
+                               " directive" % linenumber)
+                        self.entries.append(entry)
+                        entry = Entry()
+                    entry.useragents.append(line[1])
+                    state = 1
                 elif line[0] == "disallow":
                     if state==0:
-                        _debug("line %d: disallow without user "
-                               "agents" % linenumber)
+                        _debug("line %d: error: you must insert a user-agent:"
+                               " directive before this line" % linenumber)
                     else:
                         entry.rulelines.append(RuleLine(line[1], 0))
                         state = 2
                 elif line[0] == "allow":
                     if state==0:
-                        _debug("line %d: allow without user "
-                               "agents" % linenumber)
+                        _debug("line %d: error: you must insert a user-agent:"
+                               " directive before this line" % linenumber)
                     else:
                         entry.rulelines.append(RuleLine(line[1], 1))
                 else:
-                    _debug("line %d: unknown key %s" % (linenumber,
+                    _debug("line %d: warning: unknown key %s" % (linenumber,
                            line[0]))
             else:
-                _debug("line %d: malformed line %s" % (linenumber, line))
+                _debug("line %d: error: malformed line %s"%(linenumber, line))
         if state==2:
             self.entries.append(entry)
         _debug("Parsed rules:\n%s" % str(self))
@@ -154,7 +145,10 @@ class RobotFileParser:
             ret = ret + str(entry) + "\n"
         return ret

+
 class RuleLine:
+    """A rule line is a single "Allow:" (allowance==1) or "Disallow:"
+    (allowance==0) followed by a path."""
     def __init__(self, path, allowance):
         self.path = urllib.quote(path)
         self.allowance = allowance
@@ -163,10 +157,11 @@ class RuleLine:
         return self.path=="*" or re.match(self.path, filename)

     def __str__(self):
-        return (self.allowance and "Disallow" or "Allow")+": "+self.path
+        return (self.allowance and "Allow" or "Disallow")+": "+self.path


 class Entry:
+    """An entry has one or more user-agents and zero or more rulelines"""
     def __init__(self):
         self.useragents = []
         self.rulelines = []
@@ -191,7 +186,7 @@ class Entry:
     def allowance(self, filename):
         """Preconditions:
         - our agent applies to this entry
-        - file is URL decoded"""
+        - filename is URL decoded"""
         for line in self.rulelines:
             if line.applies_to(filename):
                 return line.allowance
@@ -208,7 +203,6 @@ def _test():
         rp.read()
     else:
         rp.parse(open(sys.argv[1]).readlines())
-    print rp
     print rp.can_fetch('*', 'http://www.musi-cal.com/')
     print rp.can_fetch('Musi-Cal-Robot/1.0',
                        'http://www.musi-cal.com/cgi-bin/event-search'

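For reference, the parse() method touched by the hunks above is a three-state machine: 0 ("expect a user-agent line"), 1 ("saw user-agent, expect rules"), 2 ("saw rules"). The behaviour this commit adds is that a user-agent line met while in state 2 closes the current entry and starts a new one instead of being warned about and dropped. The sketch below is a simplified illustration of that behaviour, not the module's actual code; all names are made up.

    import string

    def split_records(lines):
        # returns a list of (useragents, rulelines) tuples
        entries = []
        entry = ([], [])
        state = 0
        for line in lines:
            line = string.strip(string.split(line, '#', 1)[0])
            if not line:
                # a blank line ends a record that already has rules
                if state == 2:
                    entries.append(entry)
                    entry = ([], [])
                state = 0
                continue
            parts = string.split(line, ':', 1)
            if len(parts) != 2:
                continue
            key = string.lower(string.strip(parts[0]))
            value = string.strip(parts[1])
            if key == "user-agent":
                if state == 2:
                    # user-agent without a preceding blank line:
                    # close the current entry and open a new one
                    entries.append(entry)
                    entry = ([], [])
                entry[0].append(value)
                state = 1
            elif key in ("disallow", "allow") and state != 0:
                entry[1].append((key == "allow", value))
                state = 2
        if state == 2:
            entries.append(entry)
        return entries
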
651 norobots-rfc.html Normal file

@@ -0,0 +1,651 @@
<html>
<body>
<head>
<title>
A Standard for Robot Exclusion
</title>
</head>
<body bgcolor=white>

<div align=right>
<font size="+1" color=maroon>
<i>
The Web Robots Pages
<a href="robots.html"><img
src="lt.gif"
border=0 WIDTH=9 HEIGHT=12></a>
</i>
</font>
</div>
<hr>
<pre>

Network Working Group                                          M. Koster
INTERNET DRAFT                                                 WebCrawler
Category: Informational                                     November 1996
Dec 4, 1996                                          Expires June 4, 1997
                        <draft-koster-robots-00.txt>

                       A Method for Web Robots Control


Status of this Memo

This document is an Internet-Draft. Internet-Drafts are
working documents of the Internet Engineering Task Force
(IETF), its areas, and its working groups. Note that other
groups may also distribute working documents as Internet-
Drafts.

Internet-Drafts are draft documents valid for a maximum of six
months and may be updated, replaced, or obsoleted by other
documents at any time. It is inappropriate to use Internet-
Drafts as reference material or to cite them other than as
``work in progress.''

To learn the current status of any Internet-Draft, please
check the ``1id-abstracts.txt'' listing contained in the
Internet- Drafts Shadow Directories on ftp.is.co.za (Africa),
nic.nordu.net (Europe), munnari.oz.au (Pacific Rim),
ds.internic.net (US East Coast), or ftp.isi.edu (US West
Coast).

Table of Contents

1. Abstract . . . . . . . . . . . . . . . . . . . . . . . . .  2
2. Introduction . . . . . . . . . . . . . . . . . . . . . . .  2
3. Specification . . . . . . . . . . . . . . . . . . . . . .   3
3.1 Access method . . . . . . . . . . . . . . . . . . . . . .  3
3.2 File Format Description . . . . . . . . . . . . . . . . .  4
3.2.1 The User-agent line . . . . . . . . . . . . . . . . . .  5
3.2.2 The Allow and Disallow lines . . . . . . . . . . . . . . 5
3.3 Formal Syntax . . . . . . . . . . . . . . . . . . . . . .  6
3.4 Expiration . . . . . . . . . . . . . . . . . . . . . . . . 8
4. Examples . . . . . . . . . . . . . . . . . . . . . . . . .  8
5. Implementor's Notes . . . . . . . . . . . . . . . . . . . . 9
5.1 Backwards Compatibility . . . . . . . . . . . . . . . . .  9
5.2 Interoperability . . . . . . . . . . . . . . . . . . . .  10
6. Security Considerations . . . . . . . . . . . . . . . . .  10
7. References . . . . . . . . . . . . . . . . . . . . . . . . 10
8. Acknowledgements . . . . . . . . . . . . . . . . . . . . . 11
9. Author's Address . . . . . . . . . . . . . . . . . . . . . 11

1. Abstract

This memo defines a method for administrators of sites on the World-
Wide Web to give instructions to visiting Web robots, most
importantly what areas of the site are to be avoided.

This document provides a more rigid specification of the Standard
for Robots Exclusion [1], which is currently in wide-spread use by
the Web community since 1994.


2. Introduction

Web Robots (also called "Wanderers" or "Spiders") are Web client
programs that automatically traverse the Web's hypertext structure
by retrieving a document, and recursively retrieving all documents
that are referenced.

Note that "recursively" here doesn't limit the definition to any
specific traversal algorithm; even if a robot applies some heuristic
to the selection and order of documents to visit and spaces out
requests over a long space of time, it qualifies to be called a
robot.

Robots are often used for maintenance and indexing purposes, by
people other than the administrators of the site being visited. In
some cases such visits may have undesirable effects which the
administrators would like to prevent, such as indexing of an
unannounced site, traversal of parts of the site which require vast
resources of the server, recursive traversal of an infinite URL
space, etc.

The technique specified in this memo allows Web site administrators
to indicate to visiting robots which parts of the site should be
avoided. It is solely up to the visiting robot to consult this
information and act accordingly. Blocking parts of the Web site
regardless of a robot's compliance with this method are outside
the scope of this memo.


3. The Specification

This memo specifies a format for encoding instructions to visiting
robots, and specifies an access method to retrieve these
instructions. Robots must retrieve these instructions before visiting
other URLs on the site, and use the instructions to determine if
other URLs on the site can be accessed.

3.1 Access method

The instructions must be accessible via HTTP [2] from the site that
the instructions are to be applied to, as a resource of Internet
Media Type [3] "text/plain" under a standard relative path on the
server: "/robots.txt".

For convenience we will refer to this resource as the "/robots.txt
file", though the resource need in fact not originate from a file-
system.

Some examples of URLs [4] for sites and URLs for corresponding
"/robots.txt" sites:

http://www.foo.com/welcome.html        http://www.foo.com/robots.txt

http://www.bar.com:8001/               http://www.bar.com:8001/robots.txt

If the server response indicates Success (HTTP 2xx Status Code,)
the robot must read the content, parse it, and follow any
instructions applicable to that robot.

If the server response indicates the resource does not exist (HTTP
Status Code 404), the robot can assume no instructions are
available, and that access to the site is not restricted by
/robots.txt.

Specific behaviors for other server responses are not required by
this specification, though the following behaviours are recommended:

- On server response indicating access restrictions (HTTP Status
  Code 401 or 403) a robot should regard access to the site
  completely restricted.

- On the request attempt resulted in temporary failure a robot
  should defer visits to the site until such time as the resource
  can be retrieved.

- On server response indicating Redirection (HTTP Status Code 3XX)
  a robot should follow the redirects until a resource can be
  found.


3.2 File Format Description

The instructions are encoded as a formatted plain text object,
described here. A complete BNF-like description of the syntax of this
format is given in section 3.3.

The format logically consists of a non-empty set or records,
separated by blank lines. The records consist of a set of lines of
the form:

<Field> ":" <value>

In this memo we refer to lines with a Field "foo" as "foo lines".

The record starts with one or more User-agent lines, specifying
which robots the record applies to, followed by "Disallow" and
"Allow" instructions to that robot. For example:

User-agent: webcrawler
User-agent: infoseek
Allow: /tmp/ok.html
Disallow: /tmp
Disallow: /user/foo

These lines are discussed separately below.

Lines with Fields not explicitly specified by this specification
may occur in the /robots.txt, allowing for future extension of the
format. Consult the BNF for restrictions on the syntax of such
extensions. Note specifically that for backwards compatibility
with robots implementing earlier versions of this specification,
breaking of lines is not allowed.

Comments are allowed anywhere in the file, and consist of optional
whitespace, followed by a comment character '#' followed by the
comment, terminated by the end-of-line.

3.2.1 The User-agent line

Name tokens are used to allow robots to identify themselves via a
simple product token. Name tokens should be short and to the
point. The name token a robot chooses for itself should be sent
as part of the HTTP User-agent header, and must be well documented.

These name tokens are used in User-agent lines in /robots.txt to
identify to which specific robots the record applies. The robot
must obey the first record in /robots.txt that contains a User-
Agent line whose value contains the name token of the robot as a
substring. The name comparisons are case-insensitive. If no such
record exists, it should obey the first record with a User-agent
line with a "*" value, if present. If no record satisfied either
condition, or no records are present at all, access is unlimited.

The name comparisons are case-insensitive.

For example, a fictional company FigTree Search Services who names
their robot "Fig Tree", send HTTP requests like:

GET / HTTP/1.0
User-agent: FigTree/0.1 Robot libwww-perl/5.04

might scan the "/robots.txt" file for records with:

User-agent: figtree

3.2.2 The Allow and Disallow lines

These lines indicate whether accessing a URL that matches the
corresponding path is allowed or disallowed. Note that these
instructions apply to any HTTP method on a URL.

To evaluate if access to a URL is allowed, a robot must attempt to
match the paths in Allow and Disallow lines against the URL, in the
order they occur in the record. The first match found is used. If no
match is found, the default assumption is that the URL is allowed.

The /robots.txt URL is always allowed, and must not appear in the
Allow/Disallow rules.

The matching process compares every octet in the path portion of
the URL and the path from the record. If a %xx encoded octet is
encountered it is unencoded prior to comparison, unless it is the
"/" character, which has special meaning in a path. The match
evaluates positively if and only if the end of the path from the
record is reached before a difference in octets is encountered.

This table illustrates some examples:

Record Path         URL path            Matches
/tmp                /tmp                yes
/tmp                /tmp.html           yes
/tmp                /tmp/a.html         yes
/tmp/               /tmp                no
/tmp/               /tmp/               yes
/tmp/               /tmp/a.html         yes

/a%3cd.html         /a%3cd.html         yes
/a%3Cd.html         /a%3cd.html         yes
/a%3cd.html         /a%3Cd.html         yes
/a%3Cd.html         /a%3Cd.html         yes

/a%2fb.html         /a%2fb.html         yes
/a%2fb.html         /a/b.html           no
/a/b.html           /a%2fb.html         no
/a/b.html           /a/b.html           yes

/%7ejoe/index.html  /~joe/index.html    yes
/~joe/index.html    /%7Ejoe/index.html  yes

3.3 Formal Syntax

This is a BNF-like description, using the conventions of RFC 822 [5],
except that "|" is used to designate alternatives. Briefly, literals
are quoted with "", parentheses "(" and ")" are used to group
elements, optional elements are enclosed in [brackets], and elements
may be preceded with <n>* to designate n or more repetitions of the
following element; n defaults to 0.

robotstxt      = *blankcomment
               | *blankcomment record *( 1*commentblank 1*record )
                 *blankcomment
blankcomment   = 1*(blank | commentline)
commentblank   = *commentline blank *(blankcomment)
blank          = *space CRLF
CRLF           = CR LF
record         = *commentline agentline *(commentline | agentline)
                 1*ruleline *(commentline | ruleline)

agentline      = "User-agent:" *space agent [comment] CRLF
ruleline       = (disallowline | allowline | extension)
disallowline   = "Disallow" ":" *space path [comment] CRLF
allowline      = "Allow" ":" *space rpath [comment] CRLF
extension      = token : *space value [comment] CRLF
value          = <any CHAR except CR or LF or "#">

commentline    = comment CRLF
comment        = *blank "#" anychar
space          = 1*(SP | HT)
rpath          = "/" path
agent          = token
anychar        = <any CHAR except CR or LF>
CHAR           = <any US-ASCII character (octets 0 - 127)>
CTL            = <any US-ASCII control character
                 (octets 0 - 31) and DEL (127)>
CR             = <US-ASCII CR, carriage return (13)>
LF             = <US-ASCII LF, linefeed (10)>
SP             = <US-ASCII SP, space (32)>
HT             = <US-ASCII HT, horizontal-tab (9)>

The syntax for "token" is taken from RFC 1945 [2], reproduced here for
convenience:

token          = 1*<any CHAR except CTLs or tspecials>

tspecials      = "(" | ")" | "<" | ">" | "@"
               | "," | ";" | ":" | "\" | <">
               | "/" | "[" | "]" | "?" | "="
               | "{" | "}" | SP | HT

The syntax for "path" is defined in RFC 1808 [6], reproduced here for
convenience:

path           = fsegment *( "/" segment )
fsegment       = 1*pchar
segment        = *pchar

pchar          = uchar | ":" | "@" | "&" | "="
uchar          = unreserved | escape
unreserved     = alpha | digit | safe | extra

escape         = "%" hex hex
hex            = digit | "A" | "B" | "C" | "D" | "E" | "F" |
                 "a" | "b" | "c" | "d" | "e" | "f"

alpha          = lowalpha | hialpha

lowalpha       = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
                 "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
                 "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
hialpha        = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
                 "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
                 "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"

digit          = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
                 "8" | "9"

safe           = "$" | "-" | "_" | "." | "+"
extra          = "!" | "*" | "'" | "(" | ")" | ","

3.4 Expiration

Robots should cache /robots.txt files, but if they do they must
periodically verify the cached copy is fresh before using its
contents.

Standard HTTP cache-control mechanisms can be used by both origin
server and robots to influence the caching of the /robots.txt file.
Specifically robots should take note of Expires header set by the
origin server.

If no cache-control directives are present robots should default to
an expiry of 7 days.

4. Examples

This section contains an example of how a /robots.txt may be used.

A fictional site may have the following URLs:

http://www.fict.org/
http://www.fict.org/index.html
http://www.fict.org/robots.txt
http://www.fict.org/server.html
http://www.fict.org/services/fast.html
http://www.fict.org/services/slow.html
http://www.fict.org/orgo.gif
http://www.fict.org/org/about.html
http://www.fict.org/org/plans.html
http://www.fict.org/%7Ejim/jim.html
http://www.fict.org/%7Emak/mak.html

The site may in the /robots.txt have specific rules for robots that
send a HTTP User-agent "UnhipBot/0.1", "WebCrawler/3.0", and
"Excite/1.0", and a set of default rules:

# /robots.txt for http://www.fict.org/
# comments to webmaster@fict.org

User-agent: unhipbot
Disallow: /

User-agent: webcrawler
User-agent: excite
Disallow:

User-agent: *
Disallow: /org/plans.html
Allow: /org/
Allow: /serv
Allow: /~mak
Disallow: /

The following matrix shows which robots are allowed to access URLs:

                                        unhipbot  webcrawler  other
                                                  & excite
http://www.fict.org/                       No        Yes       No
http://www.fict.org/index.html             No        Yes       No
http://www.fict.org/robots.txt             Yes       Yes       Yes
http://www.fict.org/server.html            No        Yes       Yes
http://www.fict.org/services/fast.html     No        Yes       Yes
http://www.fict.org/services/slow.html     No        Yes       Yes
http://www.fict.org/orgo.gif               No        Yes       No
http://www.fict.org/org/about.html         No        Yes       Yes
http://www.fict.org/org/plans.html         No        Yes       No
http://www.fict.org/%7Ejim/jim.html        No        Yes       No
http://www.fict.org/%7Emak/mak.html        No        Yes       Yes

5. Notes for Implementors

5.1 Backwards Compatibility

Previous of this specification didn't provide the Allow line. The
introduction of the Allow line causes robots to behave slightly
differently under either specification:

If a /robots.txt contains an Allow which overrides a later occurring
Disallow, a robot ignoring Allow lines will not retrieve those
parts. This is considered acceptable because there is no requirement
for a robot to access URLs it is allowed to retrieve, and it is safe,
in that no URLs a Web site administrator wants to Disallow are be
allowed. It is expected this may in fact encourage robots to upgrade
compliance to the specification in this memo.

5.2 Interoperability

Implementors should pay particular attention to the robustness in
parsing of the /robots.txt file. Web site administrators who are not
aware of the /robots.txt mechanisms often notice repeated failing
request for it in their log files, and react by putting up pages
asking "What are you looking for?".

As the majority of /robots.txt files are created with platform-
specific text editors, robots should be liberal in accepting files
with different end-of-line conventions, specifically CR and LF in
addition to CRLF.

6. Security Considerations

There are a few risks in the method described here, which may affect
either origin server or robot.

Web site administrators must realise this method is voluntary, and
is not sufficient to guarantee some robots will not visit restricted
parts of the URL space. Failure to use proper authentication or other
restriction may result in exposure of restricted information. It even
possible that the occurence of paths in the /robots.txt file may
expose the existence of resources not otherwise linked to on the
site, which may aid people guessing for URLs.

Robots need to be aware that the amount of resources spent on dealing
with the /robots.txt is a function of the file contents, which is not
under the control of the robot. For example, the contents may be
larger in size than the robot can deal with. To prevent denial-of-
service attacks, robots are therefore encouraged to place limits on
the resources spent on processing of /robots.txt.

The /robots.txt directives are retrieved and applied in separate,
possible unauthenticated HTTP transactions, and it is possible that
one server can impersonate another or otherwise intercept a
/robots.txt, and provide a robot with false information. This
specification does not preclude authentication and encryption
from being employed to increase security.

7. Acknowledgements

The author would like the subscribers to the robots mailing list for
their contributions to this specification.

8. References

[1] Koster, M., "A Standard for Robot Exclusion",
    http://info.webcrawler.com/mak/projects/robots/norobots.html,
    June 1994.

[2] Berners-Lee, T., Fielding, R., and Frystyk, H., "Hypertext
    Transfer Protocol -- HTTP/1.0." RFC 1945, MIT/LCS, May 1996.

[3] Postel, J., "Media Type Registration Procedure." RFC 1590,
    USC/ISI, March 1994.

[4] Berners-Lee, T., Masinter, L., and M. McCahill, "Uniform
    Resource Locators (URL)", RFC 1738, CERN, Xerox PARC,
    University of Minnesota, December 1994.

[5] Crocker, D., "Standard for the Format of ARPA Internet Text
    Messages", STD 11, RFC 822, UDEL, August 1982.

[6] Fielding, R., "Relative Uniform Resource Locators", RFC 1808,
    UC Irvine, June 1995.

9. Author's Address

Martijn Koster
WebCrawler
America Online
690 Fifth Street
San Francisco
CA 94107

Phone: 415-3565431
EMail: m.koster@webcrawler.com

Expires June 4, 1997
</pre>
<hr>
<div align=right>
<address>
<small>
<A href="http://info.webcrawler.com/mak/projects/robots/robots.html">The
Web Robots Pages</A>
</small>
</address>
</div>
</body>
</html>
46 test/robots.txt Normal file

@@ -0,0 +1,46 @@
# /robots.txt for http://www.musi-cal.com/
# See http://info.webcrawler.com/mak/projects/robots/norobots.html
# Skip Montanaro (skip@mojam.com)
# - adapted from the robots.txt file at http://web.nexor.co.uk/

# by default
User-agent: *
Disallow: /ccrd       # not useful to spiders
Disallow: /click      # not useful to spiders
Disallow: /search     # dynamic
Disallow: /hc         # dynamic
Disallow: /subbatch   # dynamic
Disallow: /vadd       # dynamic
Disallow: /vsearch    # dynamic
Disallow: /vedit      # dynamic
Disallow: /vdelete    # dynamic
Disallow: /cgi-bin    # dynamic
Disallow: /images/    # useless images
Disallow: /icons/     # useless images
Disallow: /concerts/  # deprecated URL form
Disallow: /conferences # defunct
Disallow: /musician   # defunct
Disallow: /~skip/volkswagen # defunct
Disallow: /%7Eskip/volkswagen # defunct

# disallow a bunch of ill-behaved user agents (doubt this will deter them...)
User-agent: ExtractorPro
Disallow: /
User-agent: EmailSiphon
Disallow: /
User-agent: EmailWolf
Disallow: /
User-agent: CherryPickerSE/1.0
Disallow: /
User-agent: CherryPickerElite/1.0
Disallow: /
User-agent: EmailCollector/1.0
Disallow: /
User-agent: EmailWolf 1.00
Disallow: /
User-agent: Crescent Internet ToolPak HTTP OLE Control v.1.0
Disallow: /
User-agent: EmailSiphon
Disallow: /
User-agent: Mozilla/2.0 (compatible; NEWT ActiveX; Win32)
Disallow: /
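A quick way to sanity-check this file is to feed it to the parser changed above. The sketch follows the _test() function in robotparser2.py; the import path and the argument-less constructor are assumptions based on the changelog entry for linkcheck/robotparser2.py, not verified API.

    from linkcheck import robotparser2

    rp = robotparser2.RobotFileParser()
    rp.parse(open("test/robots.txt").readlines())
    # the default record only blocks specific paths, so the front page is allowed
    print rp.can_fetch("*", "http://www.musi-cal.com/")                      # 1
    print rp.can_fetch("*", "http://www.musi-cal.com/cgi-bin/event-search")  # 0
    # EmailSiphon matches its own record and is shut out completely
    print rp.can_fetch("EmailSiphon", "http://www.musi-cal.com/")            # 0
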