updated for python2.3

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@995 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2003-08-11 11:19:38 +00:00
parent f39f25d950
commit 648df944b0
15 changed files with 27 additions and 372 deletions

View file

@ -1,6 +1,7 @@
1.8.23
* merged robotparser.py with newest official version
Changed: linkcheck/robotparser.py
1.9.23
* Require Python 2.3
- removed timeoutsocket.py and robotparser.py
- use True/False values
Closes: SF bug 784977
* start CGI output immediately
Changed: lc.cgi, lc.fcgi, lc.sz_fcgi, linkcheck/lc_cgi.py

View file

@ -3,7 +3,7 @@
Requirements
------------
Python >= 2.2.1 from http://www.python.org/
Python >= 2.3 from http://www.python.org/
For HTTPS support you need to compile Python with the SSL _socket
module.

View file

@ -1,5 +1,5 @@
# This Makefile is only used by developers.
PYTHON=python2.2
PYTHON=python2.3
VERSION=$(shell $(PYTHON) setup.py --version)
PACKAGE=linkchecker
NAME=$(shell $(PYTHON) setup.py --name)
@ -46,11 +46,11 @@ dist: locale config
md5sum dist/* > $(MD5SUMS)
for f in dist/*; do gpg --detach-sign --armor $$f; done
# to build in the current directory (assumes python 2.2)
# to build in the current directory (assumes python 2.3)
localbuild:
$(MAKE) -C linkcheck/parser
$(PYTHON) setup.py build
cp -f build/lib.linux-i686-2.2/linkcheck/parser/htmlsax.so linkcheck/parser
cp -f build/lib.linux-i686-2.3/linkcheck/parser/htmlsax.so linkcheck/parser
# produce the .deb Debian package

2
TODO
View file

@ -4,12 +4,10 @@
- the HTML parser should be even more forgiving with badly formatted html
possible Python 2.3 improvements (ie needs Python >= 2.3)
- get rid of timeoutsocket.py, the default socket has timeouts
- use optparse instead of getopt with more flexible commandline help
- replace the debug() function with the logging module
we'll see how we can insert multiple debug levels into this thing
- use Bool object type
- get rid of the patched robotparser.py
- use new csv module
- use the Set type instead of hashmaps (did I use hashmaps for sets here?)

2
debian/changelog vendored
View file

@ -1,4 +1,4 @@
linkchecker (1.8.23-1) unstable; urgency=low
linkchecker (1.9.0-1) unstable; urgency=low
* New upstream release.

4
debian/control vendored
View file

@ -2,12 +2,12 @@ Source: linkchecker
Section: web
Priority: optional
Maintainer: Bastian Kleineidam <calvin@debian.org>
Build-Depends: python2.2-dev (>= 2.2.1), debhelper (>= 4.1.0), gettext
Build-Depends: python-dev (>= 2.3), python-dev (<< 2.4), debhelper (>= 4.1.0), gettext
Standards-Version: 3.6.0
Package: linkchecker
Architecture: any
Depends: python2.2 (>= 2.2.1)
Depends: ${python:Depends}
Conflicts: linkchecker-ssl
Suggests: httpd
Description: check HTML documents for broken links

View file

@ -1,34 +0,0 @@
#!/bin/sh
#
# Written 1998 by Gregor Hoffleit <flight@debian.org>.
# used by Bastian Kleineidam for LinkChecker
#
# Debian maintainer script (postinst): byte-compile the installed
# Python modules after the package is configured.
set -e
# Target Python version and the derived interpreter name (e.g. python2.2).
PYVER=2.2
PYTHON=python
PYTHONXY=${PYTHON}${PYVER}
SITEPACKAGES=/usr/lib/$PYTHONXY/site-packages
# Package directories to compile recursively with compileall.
DIRLIST=linkcheck
# Standalone modules to compile individually with py_compile.
FILELIST=_linkchecker_configdata.py
COMMAND="'import sys,py_compile;py_compile.compile(sys.argv[1])'"
case "$1" in
configure|abort-upgrade|abort-remove|abort-deconfigure)
# Compile each directory twice: optimized (.pyo) and normal (.pyc).
for i in $DIRLIST; do
$PYTHONXY -O /usr/lib/$PYTHONXY/compileall.py -q $SITEPACKAGES/$i
$PYTHONXY /usr/lib/$PYTHONXY/compileall.py -q $SITEPACKAGES/$i
done
# use /bin/sh -c, otherwise I get a SyntaxError from Python
for i in $FILELIST; do
/bin/sh -c "$PYTHONXY -O -c $COMMAND $SITEPACKAGES/$i"
/bin/sh -c "$PYTHONXY -c $COMMAND $SITEPACKAGES/$i"
done
;;
*)
echo "postinst called with unknown argument \`$1'" >&2
exit 1
;;
esac
#DEBHELPER#
exit 0

View file

@ -1,19 +0,0 @@
#!/bin/sh
#
# Written 1998 by Gregor Hoffleit <flight@debian.org>.
# used by Bastian Kleineidam for LinkChecker
#
# Debian maintainer script (prerm): remove the compiled .pyc/.pyo files
# that postinst created, then drop the now-empty package directory.
set -e
PACKAGE=linkchecker
PYVER=2.2
PYTHON=python
PYTHONXY=${PYTHON}${PYVER}
#DEBHELPER#
# For every installed .py file, delete the matching .pyc and .pyo.
dpkg --listfiles $PACKAGE |
awk '$0~/\.py$/ {print $0"c\n" $0"o"}' |
xargs rm -f >&2
# Ignore failure if the directory is already gone or not empty.
rmdir /usr/lib/$PYTHONXY/site-packages/linkcheck 2>/dev/null || true
exit 0

3
debian/rules vendored
View file

@ -4,7 +4,7 @@
PACKAGE=linkchecker
ROOT=$(CURDIR)/debian/$(PACKAGE)
DOCDIR = $(ROOT)/usr/share/doc/$(PACKAGE)
PYTHON=python2.2
PYTHON=python
CGIDIR=$(ROOT)/usr/lib/cgi-bin/lconline
HTMLDIR=$(ROOT)/var/www/lconline
@ -66,6 +66,7 @@ binary-arch: build install
dh_installchangelogs
dh_link
dh_strip
dh_python
dh_compress
dh_fixperms
dh_installdeb

View file

@ -29,7 +29,7 @@ AnsiType = {
'invert': '7',
}
# color numbers
# color numbers (the capitalized colors are bright)
AnsiColor = {
'default': '0',
'black': '30',
@ -72,6 +72,7 @@ def esc_ansicolor (color):
AnsiReset = esc_ansicolor("default")
def colorize (text, color=None):
"return text colorized if TERM is set"
if (color is not None) and os.environ.get('TERM'):

View file

@ -16,10 +16,9 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
import urlparse, sys, time, re, httplib
import urlparse, sys, time, re, httplib, robotparser
from urllib import quote, unquote
import Config, StringUtil, i18n
from linkcheck import robotparser
from debug import *
# XXX not dynamic
if get_debuglevel() > 0:

View file

@ -1,7 +1,7 @@
# use beta flex from http://lex.sf.net/ for reentrant
# bison parser support
FLEX=flex
PYTHON=python2.2
PYTHON=python2.3
all: htmllex.c htmlparse.c

View file

@ -1,298 +0,0 @@
# -*- coding: iso-8859-1 -*-
""" robotparser.py
Copyright (C) 2000 Bastian Kleineidam
You can choose between two licenses when using this package:
1) GNU GPLv2
2) PSF license for Python 2.2
The robots.txt Exclusion Protocol is implemented as specified in
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
import time
import urllib
import urllib.parse
import urllib.request
import urlparse
__all__ = ["RobotFileParser"]
debug = 0
def _debug(msg):
if debug: print msg
class RobotFileParser:
    """Read, parse and answer questions about a single robots.txt file.

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
    """

    def __init__(self, url=''):
        # Entries in file order; the catch-all "*" entry is kept apart
        # in default_entry so it is always consulted last.
        self.entries = []
        self.default_entry = None
        # Set by read() from the HTTP status: 401/403 forbid everything,
        # any other status >= 400 allows everything.
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Return the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.
        """
        return self.last_checked

    def modified(self):
        """Set the time the robots.txt file was last fetched to the
        current time."""
        self.last_checked = time.time()

    def set_url(self, url):
        """Set the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Read the robots.txt URL and feed it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            # Authorization required / forbidden: be conservative.
            self.disallow_all = True
            _debug("disallow all")
        elif self.errcode >= 400:
            # Missing or broken robots.txt: everything is allowed.
            self.allow_all = True
            _debug("allow all")
        elif self.errcode == 200 and lines:
            _debug("parse lines")
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # state: 0 = expecting user-agent, 1 = saw user-agent,
        # 2 = saw at least one rule line for the current entry.
        state = 0
        linenumber = 0
        entry = Entry()
        for line in lines:
            linenumber += 1
            if not line:
                # A blank line terminates the current entry.
                if state == 1:
                    _debug("line %d: warning: you should insert"
                           " allow: or disallow: directives below any"
                           " user-agent: line" % linenumber)
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        _debug("line %d: warning: you should insert a blank"
                               " line before any user-agent"
                               " directive" % linenumber)
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state == 0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 0))
                        state = 2
                elif line[0] == "allow":
                    if state == 0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 1))
                else:
                    _debug("line %d: warning: unknown key %s" % (linenumber,
                           line[0]))
            else:
                _debug("line %d: error: malformed line %s" % (linenumber, line))
        if state == 2:
            self.entries.append(entry)
        _debug("Parsed rules:\n%s" % str(self))

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        _debug("Checking robot.txt allowance for:\n user agent: %s\n url: %s" %
               (useragent, url))
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Search for given user agent matches; the first match counts.
        url = urllib.parse.quote(
            urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        return "".join(str(entry) + "\n" for entry in self.entries)
class RuleLine:
    """A rule line is a single "Allow:" (allowance true) or "Disallow:"
    (allowance false) directive followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty Disallow value means allow all
            allowance = True
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        # "*" matches everything; otherwise it is a prefix match.
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path
class Entry:
    """An entry has one or more user-agents and zero or more rulelines."""

    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        parts = ["User-agent: %s\n" % agent for agent in self.useragents]
        parts.extend("%s\n" % line for line in self.rulelines)
        return "".join(parts)

    def applies_to(self, useragent):
        """Check if this entry applies to the specified agent."""
        # split off the version token and compare case-insensitively
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            if useragent.find(agent.lower()) != -1:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded
        """
        for line in self.rulelines:
            _debug((filename, str(line), line.allowance))
            if line.applies_to(filename):
                return line.allowance
        # no rule matched: allowed by default
        return True
class URLopener(urllib.request.FancyURLopener):
    """FancyURLopener that records the last HTTP error code in
    ``self.errcode`` instead of only delegating, and that never
    prompts interactively for credentials."""

    def __init__(self, *args):
        super().__init__(*args)
        # assume success until an error handler fires
        self.errcode = 200

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        # remember the code, then fall back to the default handling
        self.errcode = errcode
        return urllib.request.FancyURLopener.http_error_default(
            self, url, fp, errcode, errmsg, headers)

    def prompt_user_passwd(self, host, realm):
        """Do not ask interactively for any password; return None, None."""
        return None, None
def _check(a,b):
if not b:
ac = "access denied"
else:
ac = "access allowed"
if a!=b:
print "failed"
else:
print "ok (%s)" % ac
print
def _test():
    """Exercise RobotFileParser against live robots.txt files.

    Requires network access; prints an ok/failed line per check via
    _check().
    """
    global debug
    rp = RobotFileParser()
    debug = 1
    # robots.txt that exists, gotten to by redirection
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()
    # test for re.escape
    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
    # this should match the first rule, which is a disallow
    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
    # various cherry pickers — every variant must be disallowed
    search_url = ('http://www.musi-cal.com/cgi-bin/event-search'
                  '?city=San+Francisco')
    for picker in ('CherryPickerSE', 'CherryPickerSE/1.0',
                   'CherryPickerSE/1.5'):
        _check(rp.can_fetch(picker, search_url), 0)
    # case sensitivity
    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
    # substring test
    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
    # tests for catch-all * agent
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
    # robots.txt that does not exist
    rp.set_url('http://www.lycos.com/robots.txt')
    rp.read()
    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)


if __name__ == '__main__':
    _test()

View file

@ -19,8 +19,8 @@
# imports and checks
import sys
if not hasattr(sys, 'version_info') or sys.version_info<(2, 2, 1, 'final', 0):
raise SystemExit, "This program requires Python 2.2.1 or later."
if not hasattr(sys, 'version_info') or sys.version_info<(2, 3, 0, 'final', 0):
raise SystemExit, "This program requires Python 2.3 or later."
import getopt, re, os, pprint, linkcheck
import linkcheck.timeoutsocket

View file

@ -146,7 +146,7 @@ myname = "Bastian Kleineidam"
myemail = "calvin@users.sourceforge.net"
setup (name = "linkchecker",
version = "1.8.23",
version = "1.9.0",
description = "check HTML documents for broken links",
author = myname,
author_email = myemail,
@ -199,5 +199,11 @@ o a (Fast)CGI web interface (requires HTTP server)
'lc.cgi','lc.fcgi','lc.sz_fcgi','linkchecker.bat']),
('share/man/man1', ['linkchecker.1']),
],
#classifiers = ['Topic :: Internet :: WWW/HTTP :: Site Management :: Link Checking',]
classifiers = [
'Topic :: Internet :: WWW/HTTP :: Site Management :: Link Checking',
'Development Status :: 5 - Production/Stable',
'License :: OSI Approved :: GNU General Public License (GPL)',
'Programming Language :: Python',
'Programming Language :: C',
],
)