regression test suite

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@229 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2001-02-18 20:29:52 +00:00
parent 83a0846fef
commit 5d1c6ef00e
24 changed files with 609 additions and 306 deletions

View file

@ -6,6 +6,7 @@ dist
foo
MANIFEST
VERSION
LinkCheckerConf.py
linkcheckerConf.py
js
locale
Packages.gz

View file

@ -68,11 +68,7 @@ uploadpull: distclean dist package files VERSION
ssh -C -t shell1.sourceforge.net "cd /home/groups/$(PACKAGE) && make pull"
test:
rm -f test/*.result
@for i in test/*.html; do \
echo "Testing $$i. Results are in $$i.result"; \
./$(PACKAGE) -r1 -ucalvin -pcalvin -otext -N"news.rz.uni-sb.de" -v -a $$i > $$i.result 2>&1; \
done
python2 test/regrtest.py
locale:
$(MAKE) -C po

5
debian/changelog vendored
View file

@ -1,9 +1,10 @@
linkchecker (1.3.0) unstable; urgency=low
* require Python 2.0
* require and use Python >= 2.0
* fix agent matching in robotparser2.py
* added more LinkPatterns (ripped from HTML::Tagset.pm)
-- Bastian Kleineidam <calvin@users.sourceforge.net> Thu, 1 Feb 2001 01:51:27 +0100
-- Bastian Kleineidam <calvin@users.sourceforge.net> Fri, 9 Feb 2001 10:51:24 +0100
linkchecker (1.2.14) unstable; urgency=low

2
debian/control vendored
View file

@ -2,7 +2,7 @@ Source: linkchecker
Section: web
Priority: optional
Maintainer: Bastian Kleineidam <calvin@users.sourceforge.net>
Build-Depends: python2-base (>= 2.0), python2-base (<= 2.0), python2-dev (>= 1.5.2), python2-dev (<= 2.0), debhelper
Build-Depends: python2-base (>= 2.0), python2-base (<= 2.0), python2-dev (>= 1.5.2), python2-dev (<= 2.0), debhelper (>= 3.0.0)
Build-Depends-Indep: gettext
Standards-Version: 3.2.1

5
debian/rules vendored
View file

@ -60,7 +60,6 @@ binary-indep: build install
# Build architecture-dependent files here.
binary-arch: build install
# dh_testversion
dh_testdir
dh_testroot
# dh_installdebconf
@ -71,7 +70,7 @@ binary-arch: build install
# dh_installpam
# dh_installinit
# dh_installcron
dh_installmanpages
dh_installman linkchecker.1
# dh_installinfo
# dh_undocumented linkchecker.1
dh_installchangelogs
@ -79,8 +78,6 @@ binary-arch: build install
dh_strip
dh_compress
dh_fixperms
# You may want to make some executables suid here.
# dh_suidregister
# dh_makeshlibs
dh_installdeb
# dh_perl

View file

@ -54,6 +54,7 @@ Loggers = {
"csv": Logging.CSVLogger,
"blacklist": Logging.BlacklistLogger,
"xml": Logging.XMLLogger,
"test": Logging.TestLogger,
}
# for easy printing: a comma separated logger list
LoggerKeys = reduce(lambda x, y: x+", "+y, Loggers.keys())
@ -102,9 +103,10 @@ class Configuration(UserDict.UserDict):
'joe@')]
self["proxy"] = getproxies()
self["recursionlevel"] = 1
self["robotstxt"] = 0
self["robotstxt"] = 1
self["strict"] = 0
self["fileoutput"] = []
self["loggingfields"] = "all"
# Logger configurations
self["text"] = {
"filename": "linkchecker-out.txt",
@ -152,6 +154,7 @@ class Configuration(UserDict.UserDict):
self['xml'] = {
"filename": "linkchecker-out.xml",
}
self['test'] = {} # no args for test logger
# default values
self['log'] = self.newLogger('text')
self["quiet"] = 0
@ -412,8 +415,9 @@ class Configuration(UserDict.UserDict):
try: self["warnings"] = cfgparser.getboolean(section, "warnings")
except ConfigParser.Error: pass
try:
filelist = string.split(cfgparser.get(section, "fileoutput"))
filelist = string.split(cfgparser.get(section, "fileoutput"), ",")
for arg in filelist:
arg = string.strip(arg)
# no file output for the blacklist Logger
if Loggers.has_key(arg) and arg != "blacklist":
self['fileoutput'].append(
@ -424,6 +428,10 @@ class Configuration(UserDict.UserDict):
for opt in cfgparser.options(key):
try: self[key][opt] = cfgparser.get(key, opt)
except ConfigParser.Error: pass
try:
self['loggingfields'] = map(string.strip, string.split(
cfgparser.get(section, 'loggingfields'), ","))
except ConfigParser.Error: pass
section="checking"
try:

View file

@ -44,22 +44,22 @@ import Config, StringUtil
import linkcheck
_ = linkcheck._
# keywords
KeyWords = ["Real URL",
"Result",
"Base",
"Name",
"Parent URL",
"Info",
"Warning",
"D/L Time",
"Check Time",
"URL",
]
MaxIndent = max(map(lambda x: len(_(x)), KeyWords))+1
LogFields = {
"realurl": "Real URL",
"result": "Result",
"base": "Base",
"name": "Name",
"parenturl": "Parent URL",
"info": "Info",
"warning": "Warning",
"downloadtime": "D/L Time",
"checktime": "Check Time",
"url": "URL",
}
MaxIndent = max(map(lambda x: len(_(x)), LogFields.values()))+1
Spaces = {}
for key in KeyWords:
Spaces[key] = " "*(MaxIndent - len(_(key)))
for key,value in LogFields.items():
Spaces[key] = " "*(MaxIndent - len(_(value)))
EntityTable = {
'<': '&lt;',
@ -99,7 +99,13 @@ class StandardLogger:
self.fd = args['fd']
else:
self.fd = sys.stdout
self.logfields = None # all fields
if args.has_key('logfields'):
if type(args['logfields']) == ListType:
self.logfields = args
def logfield(self, name):
return self.logfields and name in self.logfields
def init(self):
self.starttime = time.time()
@ -111,14 +117,15 @@ class StandardLogger:
def newUrl(self, urlData):
self.fd.write("\n"+_("URL")+Spaces["URL"]+urlData.urlName)
if urlData.cached:
self.fd.write(_(" (cached)\n"))
else:
self.fd.write("\n")
if urlData.name:
self.fd.write(_("Name")+Spaces["Name"]+urlData.name+"\n")
if urlData.parentName:
if self.logfield('url'):
self.fd.write("\n"+_(LogFields['url'])+Spaces['url']+urlData.urlName)
if urlData.cached:
self.fd.write(_(" (cached)\n"))
else:
self.fd.write("\n")
if urlData.name and self.logfield('name'):
self.fd.write(_(LogFields["name"])+Spaces["name"]+urlData.name+"\n")
if urlData.parentName and self.logfield('parentname'):
self.fd.write(_("Parent URL")+Spaces["Parent URL"]+
urlData.parentName+_(", line ")+
str(urlData.line)+"\n")
@ -739,3 +746,33 @@ class CSVLogger(StandardLogger):
self.fd.flush()
self.fd = None
class TestLogger:
""" Output for regression test """
# Emits each checked URL as plain "key value" lines on stdout so the
# regression driver (test/regrtest.py) can diff the run against a stored
# expected-output file in test/output/.
def __init__(self, **args):
# Accepts **args only for interface parity with the other loggers;
# this logger has no configuration of its own.
pass
def init(self):
# Called once before logging starts; nothing to set up here.
pass
def newUrl(self, urlData):
# Print one "key value" line per populated attribute of urlData.
# The "url" line is unconditional and starts each record.
print 'url',urlData.urlName
if urlData.cached:
print "cached"
if urlData.name:
print "name",urlData.name
if urlData.parentName:
# line number is only meaningful when there is a parent document
print "parenturl",urlData.parentName
print "line",urlData.line
if urlData.baseRef:
print "baseurl",urlData.baseRef
if urlData.infoString:
print "info",urlData.infoString
if urlData.warningString:
print "warning",urlData.warningString
if urlData.valid:
print "valid",urlData.validString
else:
print "error",urlData.errorString
def endOfOutput(self, linknumber=-1):
# Called after the last URL; tests want no summary footer, so the
# linknumber argument is ignored.
pass

View file

@ -137,7 +137,7 @@ class RobotFileParser:
return 1
# search for given user agent matches
# the first match counts
url = urllib.quote(urlparse.urlparse(url)[2])
url = urllib.quote(urlparse.urlparse(url)[2]) or "/"
for entry in self.entries:
if entry.applies_to(useragent):
return entry.allowance(url)
@ -222,6 +222,8 @@ def _test():
rp.parse(open(sys.argv[1]).readlines())
# test for re.escape
_check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
# empty url path
_check(rp.can_fetch('*', 'http://www.musi-cal.com'), 1)
# this should match the first rule, which is a disallow
_check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
# various cherry pickers

View file

@ -27,102 +27,106 @@ import linkcheck
from linkcheck import _,StringUtil
Usage = _("USAGE\tlinkchecker [options] file-or-url...\n"
"\n"
"OPTIONS\n"
"For single-letter option arguments the space is not a necessity. So\n"
"'-o colored' is the same as '-ocolored'.\n"
"-a, --anchors\n"
" Check anchor references. Default is don't check anchors.\n"
"-d, --denyallow\n"
" Swap checking order to extern/intern. Default checking order\n"
" is intern/extern.\n"
"-D, --debug\n"
" Print additional debugging information.\n"
"-e regex, --extern=regex\n"
" Assume urls that match the given expression as extern.\n"
" Only intern HTML links are checked recursively.\n"
"-f file, --config=file\n"
" Use file as configuration file. LinkChecker first searches\n"
" ~/.linkcheckerrc and then /etc/linkcheckerrc\n"
" (under Windows <path-to-program>\\linkcheckerrc).\n"
"-F type, --file-output=type\n"
" Same as output, but write to a file linkchecker-out.<type>.\n"
" If the file already exists, it is overwritten. You can specify\n"
" this option more than once. There is no file output for the\n"
" blacklist logger. Default is no file output.\n"
"-i regex, --intern=regex\n"
" Assume URLs that match the given expression as intern.\n"
" LinkChecker descends recursively only to intern URLs, not to extern.\n"
"-h, --help\n"
" Help me! Print usage information for this program.\n"
"-N server, --nntp-server=server\n"
" Specify an NNTP server for 'news:...' links. Default is the\n"
" environment variable NNTP_SERVER. If no host is given,\n"
" only the syntax of the link is checked.\n"
"-o type, --output=type\n"
" Specify output type as %s.\n"
" Default type is text.\n"
"-p pwd, --password=pwd\n"
" Try password pwd for HTML and FTP authorization.\n"
" Default password is 'joe@'. See also -u.\n"
"-q, --quiet\n"
" Quiet operation. This is only useful with -F.\n"
"-r depth, --recursion-level=depth\n"
" Check recursively all links up to given depth (depth >= 0).\n"
" Default depth is 1.\n"
"-R, --robots-txt\n"
" Obey the robots exclusion standard.\n"
"-s, --strict\n"
" Check only syntax of extern links, do not try to connect to them.\n"
"-t num, --threads=num\n"
" Generate no more than num threads. Default number of threads is 5.\n"
" To disable threading specify a non-positive number.\n"
"-u name, --user=name\n"
" Try username name for HTML and FTP authorization.\n"
" Default is 'anonymous'. See also -p.\n"
"-V, --version\n"
" Print version and exit.\n"
"-v, --verbose\n"
" Log all checked URLs (implies -w). Default is to log only invalid\n"
" URLs.\n"
"-w, --warnings\n"
" Log warnings.\n"
"-W regex, --warning-regex=regex\n"
" Define a regular expression which prints a warning if it matches\n"
" any content of the checked link.\n"
" This applies of course only to pages which are valid, so we can\n"
" get their content.\n"
" Use this to check for pages that contain some form of error\n"
" message, for example 'This page has moved' or 'Oracle\n"
" Application Server error'.\n"
" This option implies -w.\n") % linkcheck.Config.LoggerKeys
Usage = _("""USAGE\tlinkchecker [options] file-or-url...
Notes = _("NOTES\n"
"o LinkChecker assumes an http:// resp. ftp:// link when a commandline URL\n"
" starts with 'www.' resp. 'ftp.'\n"
" You can also give local files as arguments.\n"
"o If you have your system configured to automatically establish a\n"
" connection to the internet (e.g. with diald), it will connect when\n"
" checking links not pointing to your local host.\n"
" Use the -s and -i options to prevent this.\n"
"o Javascript links are currently ignored.\n"
"o If your platform does not support threading, LinkChecker uses -t0.\n"
"o You can supply multiple user/password pairs in a configuration file.\n"
"o Cookies are not accepted by LinkChecker.\n"
"o To use proxies set $http_proxy, $https_proxy on Unix or Windows.\n"
" On a Mac use the Internet Config.\n"
"o When checking 'news:' links the given NNTP host doesn't need to be the\n"
" same as the host of the user browsing your pages!\n")
OPTIONS
For single-letter option arguments the space is not a necessity. So
'-o colored' is the same as '-ocolored'.
-a, --anchors
Check anchor references. Default is don't check anchors.
-d, --denyallow
Swap checking order to extern/intern. Default checking order
is intern/extern.
-D, --debug
Print additional debugging information.
-e regex, --extern=regex
Assume urls that match the given expression as extern.
Only intern HTML links are checked recursively.
-f file, --config=file
Use file as configuration file. LinkChecker first searches
~/.linkcheckerrc and then /etc/linkcheckerrc
(under Windows <path-to-program>\\linkcheckerrc).
-F type[/filename], --file-output=type[/filename]
Same as output, but write to a file linkchecker-out.<type>
or <filename> if specified. If the file already exists, it
is overwritten. You can specify this option more than once.
There is no file output for the blacklist logger. Default is
no file output.
-i regex, --intern=regex
Assume URLs that match the given expression as intern.
LinkChecker descends recursively only to intern URLs, not to extern.
-h, --help
Help me! Print usage information for this program.
-N server, --nntp-server=server
Specify an NNTP server for 'news:...' links. Default is the
environment variable NNTP_SERVER. If no host is given,
only the syntax of the link is checked.
-o type, --output=type
Specify output type as %s.
Default type is text.
-p pwd, --password=pwd
Try password pwd for HTML and FTP authorization.
Default password is 'joe@'. See also -u.
-q, --quiet
Quiet operation. This is only useful with -F.
-r depth, --recursion-level=depth
Check recursively all links up to given depth (depth >= 0).
Default depth is 1.
-R, --robots-txt
Obey the robots exclusion standard.
-s, --strict
Check only syntax of extern links, do not try to connect to them.
-t num, --threads=num
Generate no more than num threads. Default number of threads is 5.
To disable threading specify a non-positive number.
-u name, --user=name
Try username name for HTML and FTP authorization.
Default is 'anonymous'. See also -p.
-V, --version
Print version and exit.
-v, --verbose
Log all checked URLs (implies -w). Default is to log only invalid
URLs.
-w, --warnings
Log warnings.
-W regex, --warning-regex=regex
Define a regular expression which prints a warning if it matches
any content of the checked link.
This applies of course only to pages which are valid, so we can
get their content.
Use this to check for pages that contain some form of error
message, for example 'This page has moved' or 'Oracle
Application Server error'.
This option implies -w.\n") % linkcheck.Config.LoggerKeys
"""
Examples = _("EXAMPLES\n"
"o linkchecker -v -ohtml -r2 -s -itreasure.calvinsplayground.de \\\n"
" http://treasure.calvinsplayground.de/~calvin/ > sample.html\n"
"o Local files and syntactic sugar on the command line:\n"
" linkchecker c:\\temp\\test.html\n"
" linkchecker ../bla.html\n"
" linkchecker www.myhomepage.de\n"
" linkchecker -r0 ftp.linux.org\n")
Notes = _("""NOTES
o LinkChecker assumes an http:// resp. ftp:// link when a commandline URL
starts with 'www.' resp. 'ftp.'
You can also give local files as arguments.
o If you have your system configured to automatically establish a
connection to the internet (e.g. with diald), it will connect when
checking links not pointing to your local host.
Use the -s and -i options to prevent this.
o Javascript links are currently ignored.
o If your platform does not support threading, LinkChecker uses -t0.
o You can supply multiple user/password pairs in a configuration file.
o Cookies are not accepted by LinkChecker.
o To use proxies set $http_proxy, $https_proxy on Unix or Windows.
On a Mac use the Internet Config.
o When checking 'news:' links the given NNTP host doesn't need to be the
same as the host of the user browsing your pages!
""")
Examples = _("""EXAMPLES
o linkchecker -v -ohtml -r2 -s -itreasure.calvinsplayground.de \\
http://treasure.calvinsplayground.de/~calvin/ > sample.html
o Local files and syntactic sugar on the command line:
linkchecker c:\\temp\\test.html
linkchecker ../bla.html
linkchecker www.myhomepage.de
linkchecker -r0 ftp.linux.org
""")
def printVersion():
print linkcheck.Config.AppInfo
@ -207,9 +211,13 @@ for opt,arg in options:
"'-o, --output'")
elif opt=="-F" or opt=="--file-output":
if linkcheck.Config.Loggers.has_key(arg) and arg != "blacklist":
config['fileoutput'].append(
config.newLogger(arg, {'fileoutput':1}))
ns = {'fileoutput':1}
try:
type, ns['filename'] = string.split(arg, '/', 1)
if not ns['filename']: raise ValueError
except ValueError: type = arg
if linkcheck.Config.Loggers.has_key(type) and type != "blacklist":
config['fileoutput'].append(config.newLogger(type, ns))
else:
printUsage((_("Illegal argument '%s' for option ") % arg) +\
"'-F, --file-output'")

View file

@ -29,84 +29,85 @@ a (Fast)CGI web interface (requires HTTP server)
For single-letter option arguments the space is not a necessity.
So \fI-o colored\fP is the same as \fI-ocolored\fP.
.TP
\fB-a, --anchors\fP
\fB-a\fP, \fB--anchors\fP
Check anchor references. Default is don't check anchors.
.TP
\fB-d, --denyallow\fP
\fB-d\fP, \fB--denyallow\fP
Swap checking order to extern/intern. Default checking order is
intern/extern.
.TP
\fB-D, --debug\fP
\fB-D\fP, \fB--debug\fP
Print debugging information.
.TP
\fB-e \fIregex\fB, --extern=\fIregex\fP
\fB-e \fIregex\fP, \fB--extern=\fIregex\fP
Assume urls that match the given regular expression as extern.
Only intern HTML links are checked recursively.
.TP
\fB-f \fIfile\fB, --config=\fIfile\fP
\fB-f \fIfile\fP, \fB--config=\fIfile\fP
Use \fIfile\fP as configuration file. LinkChecker first searches for
~/.linkcheckerrc and then /etc/linkcheckerrc on Unix systems.
Under Windows systems we read <path-to-program>\\linkcheckerrc.
.TP
\fB-F \fItype\fB, --file-output=\fItype\fP
Same as output, but write to a file \fIlinkchecker-out.<type>\fP.
If the file already exists, it is overwritten. You can specify this
option more than once. There is no file output for the blacklist
logger. Default is no file output.
\fB-F \fItype\fP[\fI/filename\fP], \fB--file-output=\fItype\fP[\fI/filename\fP]
Same as output, but write to a file \fIlinkchecker-out.<type>\fP
or \fIfilename\fP if specified. If the file already exists, it is
overwritten. You can specify this option more than once. There
is no file output for the blacklist logger. Default is no file
output.
.TP
\fB-i \fIregex\fB, --intern=\fIregex\fP
\fB-i \fIregex\fP, \fB--intern=\fIregex\fP
Assume URLs that match the given regular expression as intern.
LinkChecker descends recursively only to intern URLs, not to extern.
.TP
\fB-h, --help\fP
\fB-h\fP, \fB--help\fP
Help me! Print usage information for this program.
.TP
\fB-N \fIserver\fB, --nntp-server=\fIserver\fP
\fB-N \fIserver\fP, \fB--nntp-server=\fIserver\fP
Specify an NNTP server for 'news:...' links. Default is the
environment variable NNTP_SERVER. If no host is given,
only the syntax of the link is checked.
.TP
\fB-o \fItype\fB, --output=\fItype\fP
\fB-o \fItype\fP, \fB--output=\fItype\fP
Specify output type as \fItext\fP, \fIcolored\fP, \fIhtml\fP, \fIsql\fP,
\fIcsv\fP, \fIgml\fP, \fIxml\fP or \fIblacklist\fP.
Default type is \fItext\fP.
.TP
\fB-p \fIpwd\fB, --password=\fIpwd\fP
\fB-p \fIpwd\fP, \fB--password=\fIpwd\fP
Try the password \fIpwd\fP for HTML and FTP authorization.
The default password is \fIguest@\fP. See also \fB-u\fP.
.TP
\fB-q, --quiet\fP
\fB-q\fP, \fB--quiet\fP
Quiet operation. This is only useful with \fB-F\fP.
.TP
\fB-r \fIdepth\fB, --recursion-level=\fIdepth\fP
\fB-r \fIdepth\fP, \fB--recursion-level=\fIdepth\fP
Check recursively all links up to given \fIdepth\fP (depth >= 0).
Default depth is 1.
.TP
\fB-R, --robots-txt\fP
\fB-R\fP, \fB--robots-txt\fP
Obey the robots exclusion standard.
.TP
\fB-s, --strict\fP
\fB-s\fP, \fB--strict\fP
Check only the syntax of extern links, do not try to connect to them.
.TP
\fB-t \fInum\fB, --threads=\fInum\fP
\fB-t \fInum\fP, \fB--threads=\fInum\fP
Generate no more than \fInum\fP threads. Default number of threads is 5.
To disable threading specify a non-positive number.
.TP
\fB-u \fIname\fB, --user=\fIname\fP
\fB-u \fIname\fP, \fB--user=\fIname\fP
Try username \fIname\fP for HTML and FTP authorization.
Default is \fIanonymous\fP. See also \fB-p\fP.
.TP
\fB-V, --version\fP
\fB-V\fP, \fB--version\fP
Print version and exit.
.TP
\fB-v, --verbose\fP
\fB-v\fP, \fB--verbose\fP
Log all checked URLs (implies \fB-w\fP). Default is to log only invalid
URLs.
.TP
\fB-w, --warnings\fP
\fB-w\fP, \fB--warnings\fP
Log warnings.
.TP
\fB-W \fIregex\fB, --warning-regex=\fIregex\fP
\fB-W \fIregex\fP, \fB--warning-regex=\fIregex\fP
Define a regular expression which prints a warning if it matches any
content of the checked link.
This applies of course only to pages which are valid, so we can get

View file

@ -1,91 +1,98 @@
# sample resource file
# to modify, just uncomment the line
# sample resource file with default values
# see linkchecker -h for help on these options
# commandline options override these settings!
[output]
# turn on/off debug messages
#debug=0
debug=0
# use the color logger
#log=colored
log=text
# turn on/off --verbose
#verbose=0
verbose=0
# turn on/off --warnings
#warnings=0
warnings=0
# turn on/off --quiet
#quiet=0
quiet=0
# additional file output
#fileoutput = text colored html gml sql
fileoutput=
#fileoutput = text, colored, html, gml, sql XXX
# what fields should each logger print out?
fields = all
# field = url, parent url, base url
# fields names: XXX
# url
# parent url
# base url
# each Logger can have separate configuration parameters
# standard text logger
[text]
#filename=linkchecker-out.txt
filename=linkchecker-out.txt
# GML logger
[gml]
#filename=linkchecker-out.gml
filename=linkchecker-out.gml
# CSV logger
[csv]
#filename=linkchecker-out.csv
#separator=;
filename=linkchecker-out.csv
separator=;
# SQL logger
[sql]
#filename=linkchecker-out.sql
#dbname=linksdb
#commandsep=;
filename=linkchecker-out.sql
dbname=linksdb
commandsep=;
# HTML logger
[html]
#filename=linkchecker-out.html
filename=linkchecker-out.html
# colors for the various parts
#colorbackground="#ffffff"
#colorurl=blue
#colorborder=
#colorlink=
#tablewarning=
#tableok=
#tableerror=
colorbackground="#fff7e5"
colorurl="#dcd5cf"
colorborder="#000000"
colorlink="#191c83"
tablewarning=<td bgcolor="#e0954e">
tableerror=<td bgcolor="#db4930">
tableok=<td bgcolor="#3ba557">
# ANSI color logger
[colored]
#filename=linkchecker-out.ansi
# colors for the various parts
#colorparent=
#colorurl=
#colorname=
#colorreal=
#colorbase=
#colorvalid=
#colorinvalid=
#colorinfo=
#colorwarning=
#colordltime=
#colorreset=
filename=linkchecker-out.ansi
# colors for the various parts (\x1b = ESC)
colorparent="\x1b[37m"
colorurl="\x1b[0m"
colorname="\x1b[0m"
colorreal="\x1b[36m"
colorbase="\x1b[35m"
colorvalid="\x1b[1;32m"
colorinvalid="\x1b[1;31m"
colorinfo="\x1b[0m"
colorwarning="\x1b[1;33m"
colordltime="\x1b[0m"
colorreset="\x1b[0m"
# blacklist logger
[blacklist]
#filename=~/.blacklist
filename=~/.blacklist
# checking options
[checking]
# number of threads
#threads=5
threads=5
# check anchors?
#anchors=0
#recursionlevel=1
anchors=0
recursionlevel=1
# obey robots.txt exclusion?
#robotstxt=0
robotstxt=1
# overall strict checking. You can specify for each extern URL
# separately if its strict or not. See the [filtering] section
#strict=0
strict=0
# supply a regular expression for which warnings are printed if found
# in any HTML files.
#warningregex="Request failed"
warningregex=
# Basic NNTP server. Overrides NNTP_SERVER environment variable.
#nntpserver=news.uni-stuttgart.de
nntpserver=
# filtering options (see FAQ)
# for each extern link we can specify if it is strict or not
@ -98,7 +105,7 @@
# internlinks=calvinsplayground\.de
# check only syntax of all mail addresses
# extern3=^mailto: 1
#denyallow=0
denyallow=0
# You can provide different user/password pairs for different link types.
# Entries are a triple (link regular expression, username, password),

View file

@ -1,3 +1,2 @@
*.result
*.prof
*.pyc

1
test/__init__.py Normal file
View file

@ -0,0 +1 @@
# Dummy file to make this directory a package.

View file

@ -1,8 +0,0 @@
<!-- base without href -->
<base target="_top">
<!-- meta url -->
<META HTTP-equiv="refresh" content="0; url=http://www.calvinandhobbes.com/">
<!-- spaces between key and value -->
<a href
=
"file:/etc">

View file

@ -1,5 +0,0 @@
<!-- base with href -->
<base href="file:/etc/">
<!-- good file -->
<a href="passwd">

View file

@ -1,5 +0,0 @@
<!-- frame src urls -->
<frameset border="0" frameborder="0" framespacing="0">
<frame name="top" src="test1.html" frameborder="0">
<frame name="bottom" src="test2.html" frameborder="0">
</frameset>

View file

@ -1,20 +0,0 @@
<!-- extra mail checking -->
<html><head></head>
<body>
<!-- legal -->
<a href=mailto:calvin@LocalHost?subject=Hallo&to=michi>1</a>
<a href="mailto:Dude <calvin@studcs.uni-sb.de> , Killer <calvin@cs.uni-sb.de>?subject=bla">2</a>
<a href="mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>?bcc=jsmith%40company.com">3</a>
<a href="mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>">4</a>
<a href="mailto:">6</a>
<a href="mailto:o'hara@cs.uni-sb.de">5</a>
<a href="mailto:?to=calvin@studcs.uni-sb.de?subject=blubb">...</a>
<a href="mailto:jan@jan-dittberner.de?subject=test">...</a>
<!-- illegal -->
<!-- contains non-quoted characters -->
<a href="mailto:a@d?subject=äöü">5</a>
<a href="mailto:calvin@cs.uni-sb.de?subject=Halli hallo">_</a>
<!-- ? extension forbidden in <> construct -->
<a href="mailto:Bastian Kleineidam <calvin@host1?foo=bar>">3</a>
</body>
</html>

View file

@ -1,19 +0,0 @@
<!-- news testing -->
<a href="news:comp.os.linux.misc">
<!-- snews -->
<a href="snews:de.comp.os.unix.linux.misc">
<!-- no group -->
<a href="news:">
<!-- illegal syntax -->
<a href="news:§$%&/´`(§%">
<!-- nttp scheme with host -->
<a href="nntp://news.rz.uni-sb.de/comp.lang.python">
<!-- article span -->
<a href="nntp://news.rz.uni-sb.de/comp.lang.python/1-5">
<!-- article number -->
<a href="nntp://news.rz.uni-sb.de/EFGJG4.7A@deshaw.com">
<!-- host but no group -->
<a href="nntp://news.rz.uni-sb.de/">
<!-- article span -->
<a href="news:comp.lang.python/1-5">

10
test/output/test_base Normal file
View file

@ -0,0 +1,10 @@
test_base
url file:///home/calvin/projects/linkchecker/test/output/base1.html
realurl file:/home/calvin/projects/linkchecker/test/output/base1.html
error Error: [Errno 2] No such file or directory: '/home/calvin/projects/linkchecker/test/output/base1.html'
url file:///home/calvin/projects/linkchecker/test/output/base2.html
realurl file:/home/calvin/projects/linkchecker/test/output/base2.html
error Error: [Errno 2] No such file or directory: '/home/calvin/projects/linkchecker/test/output/base2.html'
url file:///home/calvin/projects/linkchecker/test/output/base3.html
realurl file:/home/calvin/projects/linkchecker/test/output/base3.html
error Error: [Errno 2] No such file or directory: '/home/calvin/projects/linkchecker/test/output/base3.html'

248
test/regrtest.py Executable file
View file

@ -0,0 +1,248 @@
#!/usr/bin/env python2
# this file is _not_ the original Python2 regression test suite.
"""Bastis Regression test.
This will find all modules whose name is "test_*" in the test
directory, and run them. Various command line options provide
additional facilities.
Command line options:
-v, --verbose
run tests in verbose mode with output to stdout
-q, --quiet
don't print anything except if a test fails
-g, --generate
write the output file for a test instead of comparing it
-x, --exclude
arguments are tests to *exclude*
-r, --random
randomize test execution order
If non-option arguments are present, they are names for tests to run,
unless -x is given, in which case they are names for tests not to run.
If no test names are given, all tests are run.
-v is incompatible with -g and does not compare test output files.
"""
import sys,getopt,os,string
import test_support
def main(tests=None, testdir=None, verbose=0, quiet=0, generate=0,
exclude=0, randomize=0):
"""Execute a test suite.
This also parses command-line options and modifies its behavior
accordingly.
tests -- a list of strings containing test names (optional)
testdir -- the directory in which to look for tests (optional)
Users other than the Python test suite will certainly want to
specify testdir; if it's omitted, the directory containing the
Python test suite is searched for.
If the tests argument is omitted, the tests listed on the
command-line will be used. If that's empty, too, then all *.py
files beginning with test_ will be used.
The other seven default arguments (verbose, quiet, generate, exclude,
single, randomize, and findleaks) allow programmers calling main()
directly to set the values that would normally be set by flags on the
command line.
"""
# Parse the command line; any getopt failure reports usage and aborts.
try:
opts, args = getopt.getopt(sys.argv[1:],
'vgqxsrl',
['verbose',
'generate',
'quiet',
'exclude',
'random',])
except getopt.error, msg:
# NOTE(review): error() and usage() are not defined anywhere in this
# module -- confirm they exist, otherwise this handler raises NameError.
error(msg)
usage()
return -1
# Translate flags into the keyword-argument defaults.
for opt, val in opts:
if opt in ('-v','--verbose'): verbose = verbose + 1
if opt in ('-q','--quiet'): quiet = 1; verbose = 0
if opt in ('-g','--generate'): generate = 1
if opt in ('-x','--exclude'): exclude = 1
if opt in ('-r','--random'): randomize = 1
# -g writes expected output, -v streams to stdout: mutually exclusive.
if generate and verbose:
print "-g and -v don't go together!"
return 2
good = []
bad = []
skipped = []
for i in range(len(args)):
# Strip trailing ".py" from arguments
if args[i][-3:] == '.py':
args[i] = args[i][:-3]
# Work on copies so the module-level lists are never mutated.
stdtests = STDTESTS[:]
nottests = NOTTESTS[:]
if exclude:
# With -x the positional args become the exclusion list instead.
for arg in args:
if arg in stdtests:
stdtests.remove(arg)
nottests[:0] = args
args = []
tests = tests or args or findtests(testdir, stdtests, nottests)
if randomize:
# NOTE(review): `random` is not imported at module top (only
# sys, getopt, os, string, test_support) -- -r would raise NameError.
random.shuffle(tests)
test_support.verbose = verbose # Tell tests to be moderately quiet
# Remember the modules loaded before the run so new ones can be unloaded.
save_modules = sys.modules.keys()
for test in tests:
if not quiet:
print test
# runtest returns >0 ok, 0 failed, <0 skipped.
ok = runtest(test, generate, verbose, quiet, testdir)
if ok > 0:
good.append(test)
elif ok == 0:
bad.append(test)
else:
skipped.append(test)
# Unload the newly imported modules (best effort finalization)
for module in sys.modules.keys():
if module not in save_modules and module.startswith("test."):
test_support.unload(module)
# Summary: counts of passed/failed/skipped tests.
if good and not quiet:
if not bad and not skipped and len(good) > 1:
print "All",
print count(len(good), "test"), "OK."
if bad:
print count(len(bad), "test"), "failed:",
print string.join(bad)
if skipped and not quiet:
print count(len(skipped), "test"), "skipped:",
print string.join(skipped)
# Exit status: truthy (1) when anything failed, 0 otherwise.
return len(bad) > 0
# Tests that always run first, in this fixed order.
STDTESTS = [
    'test_base',
    # 'test_frames',
]

# Modules matching "test_*" that are support code, not tests.
NOTTESTS = [
    'test_support',
]

def findtests(testdir=None, stdtests=STDTESTS, nottests=NOTTESTS):
    """Return a list of all applicable test modules.

    testdir  -- directory to scan; defaults to the directory containing
                this script (see findtestdir())
    stdtests -- module names that always come first, in the given order
    nottests -- module names to exclude from the scan

    The result is stdtests followed by every other module whose file
    name matches "test_*.py", sorted alphabetically.
    """
    if not testdir:
        testdir = findtestdir()
    tests = []
    for name in os.listdir(testdir):
        # startswith/endswith instead of slice comparisons: same behavior,
        # clearer intent, no magic index constants.
        if name.startswith("test_") and name.endswith(".py"):
            modname = name[:-3]
            if modname not in stdtests and modname not in nottests:
                tests.append(modname)
    tests.sort()
    return stdtests + tests
def runtest(test, generate, verbose, quiet, testdir = None):
"""Run a single test.
test -- the name of the test
generate -- if true, generate output, instead of running the test
and comparing it to a previously created output file
verbose -- if true, print more messages
quiet -- if true, don't print 'skipped' messages (probably redundant)
testdir -- test directory
Returns 1 on success, 0 on failure/crash, -1 when the test was skipped.
"""
test_support.unload(test)
if not testdir: testdir = findtestdir()
# Expected-output files live in <testdir>/output/<testname>.
outputdir = os.path.join(testdir, "output")
outputfile = os.path.join(outputdir, test)
# Choose where the test's stdout goes: a fresh file (-g), the real
# stdout (-v), or a Compare object that diffs against the stored file.
try:
if generate:
cfp = open(outputfile, "w")
elif verbose:
cfp = sys.stdout
else:
cfp = Compare(outputfile)
except IOError:
# Missing/unopenable output file: run the test without comparison.
cfp = None
print "Warning: can't open", outputfile
try:
save_stdout = sys.stdout
try:
if cfp:
sys.stdout = cfp
print test # Output file starts with test name
# Importing the module IS running the test.
__import__(test, globals(), locals(), [])
if cfp and not (generate or verbose):
cfp.close()
finally:
# Always restore stdout, even when the test raises.
sys.stdout = save_stdout
except (ImportError, test_support.TestSkipped), msg:
if not quiet:
print "test", test,
print "skipped -- ", msg
return -1
except KeyboardInterrupt:
# Let Ctrl-C abort the whole run, not just this test.
raise
except test_support.TestFailed, msg:
print "test", test, "failed --", msg
return 0
except:
# Any other exception counts as a crash.
# NOTE(review): `type` shadows the builtin here, and `traceback` is
# not imported at module top -- the -v branch below would raise
# NameError; confirm before relying on it.
type, value = sys.exc_info()[:2]
print "test", test, "crashed --", str(type) + ":", value
if verbose:
traceback.print_exc(file=sys.stdout)
return 0
else:
return 1
def findtestdir():
    """Return the directory containing this test script.

    Uses sys.argv[0] when run as a script, __file__ when imported.
    Falls back to the current directory when the path has no
    directory component.
    """
    if __name__ == '__main__':
        this_file = sys.argv[0]
    else:
        this_file = __file__
    # Renamed from `file` to avoid shadowing the builtin.
    # dirname() is "" for a bare filename; substitute os.curdir then.
    return os.path.dirname(this_file) or os.curdir
def count(n, word):
    """Return "<n> <word>", appending a plural "s" unless n is 1."""
    if n == 1:
        suffix = ""
    else:
        suffix = "s"
    return "%d %s%s" % (n, word, suffix)
class Compare:
# File-like object that, instead of writing, reads the expected output
# file in lockstep and raises TestFailed on the first mismatch. Installed
# as sys.stdout by runtest() so a test's print output is verified as it
# is produced.
def __init__(self, filename):
# filename -- path of the stored expected-output file (opened read-only)
self.fp = open(filename, 'r')
def write(self, data):
# Compare `data` against the next len(data) bytes of the expected file.
expected = self.fp.read(len(data))
if data <> expected:
raise test_support.TestFailed, \
'Writing: '+`data`+', expected: '+`expected`
def writelines(self, listoflines):
# Delegate each line to write() so comparison stays in order.
map(self.write, listoflines)
def flush(self):
# No buffering, nothing to flush.
pass
def close(self):
# The test must have consumed the whole expected file; any remainder
# means it produced too little output.
leftover = self.fp.read()
if leftover:
raise test_support.TestFailed, 'Unread: '+`leftover`
self.fp.close()
def isatty(self):
# Pretend to be a plain file so tests don't switch to interactive mode.
return 0
if __name__ == '__main__':
    # Propagate the suite result to the shell.  NOTE(review): main() is
    # defined elsewhere in this file and presumably returns a shell-style
    # exit status -- confirm against its definition.
    sys.exit(main())

View file

@ -1,23 +0,0 @@
Just some HTTP links
<a href="http://www.garantiertnixgutt.bla">bad url</a>
<a href="http://www.heise.de">ok</a>
<a href="http:/www.heise.de">one slash</a>
<a href="http:www.heise.de">no slash</a>
<a href="http://">no url</a>
<a href="http:/">no url, one slash</a>
<a href="http:">no url, no slash</a>
<a href="http://www.blubb.de/stalter&sohn">unquoted ampersand</a>
<a name="iswas">anchor for test2.html</a>
<a href=http://slashdot.org/>unquoted</a>
<a href="http://treasure.calvinsplayground.de/~calvin/software/#isnix"
>invalid anchor</a>
<a href="http://treasure.calvinsplayground.de/~calvin/isnich/"
>authorization (user=calvin, pass=calvin)</a>
<a href="https://www.heise.de">https</a>
<a href="HtTP://WWW.hEIsE.DE">should be cached</a>
<a href="HTTP://WWW.HEISE.DE">should be cached</a>
<!-- <a href=http://nocheckin> no check because of comment -->
<a href=illegalquote1">no beginning quote</a>
<a href="illegalquote2>no ending quote</a>
<!-- check the parser at end of file -->
<a href="g

View file

@ -1,19 +0,0 @@
<!-- meta url -->
<meta http-equiv="refresh" content="5; url=http://localhost">
<a href="hutzli:nixgutt"> <!-- bad scheme -->
<a href="javascript:loadthis()"> <!-- javascript (ignore) -->
<a href="file:///etc/group"> <!-- good file -->
<a href="file://etc/group"> <!-- bad file -->
<a href="file:/etc/group"> <!-- good file -->
<a href="file:etc/group"> <!-- bad file -->
<a href="file:/etc/"> <!-- good dir -->
<a href="test1.html"> <!-- relative url -->
<a href="test1.html#isnix"> <!-- bad anchor -->
<a href="test1.html#iswas"> <!-- good anchor -->
<a href="telnet:localhost"> <!-- telnet to localhost -->
<a href="telnet:"> <!-- telnet without host -->
<a href="ftp:/treasure.calvinsplayground.de/pub"> <!-- ftp one slash -->
<a href="ftp://treasure.calvinsplayground.de/pub"> <!-- ftp two slashes -->
<a href="ftp://treasure.calvinsplayground.de//pub"> <!-- ftp two dir slashes -->
<a href="ftp://treasure.calvinsplayground.de////////pub"> <!-- ftp many dir slashes -->
<a href="ftp:///treasure.calvinsplayground.de/pub"> <!-- ftp three slashes -->

14
test/test_base.py Normal file
View file

@ -0,0 +1,14 @@
# Regression-test driver: check the local base*.html fixtures (which
# presumably exercise <base> tag handling -- confirm against the fixtures)
# with recursion depth 1 and anchor checking enabled.
import os,sys
# Make the in-tree linkcheck package importable when the suite is started
# from the project root (as `make test` -> `python2 test/regrtest.py` does).
sys.path.append(os.getcwd())
import linkcheck
config = linkcheck.Config.Configuration()
config['recursionlevel'] = 1  # follow links one level deep
config['log'] = config.newLogger('test')  # logger type 'test' -- presumably plain output for comparison; confirm in Config
config["anchors"] = 1  # NOTE(review): looks like this also verifies #fragment anchors -- confirm
config["verbose"] = 1
config.disableThreading()  # presumably for deterministic output order -- confirm
htmldir = "test/html"
for file in ('base1.html','base2.html','base3.html'):
    url = os.path.join(htmldir, file)
    config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0))
# Run the check over all queued URLs with the configuration above.
linkcheck.checkUrls(config)

72
test/test_support.py Normal file
View file

@ -0,0 +1,72 @@
"""Supporting definitions for the Python regression test."""
# Root of the test-support exception hierarchy; TestFailed and
# TestSkipped below derive from it.
class Error(Exception):
    """Base class for regression test exceptions."""
class TestFailed(Error):
    """Test failed.

    Raised to signal that a test ran but produced a wrong result,
    as opposed to being skipped (see TestSkipped).
    """
class TestSkipped(Error):
    """Test skipped.

    This can be raised to indicate that a test was deliberately
    skipped, but not because a feature wasn't available. For
    example, if some resource can't be used, such as the network
    appears to be unavailable, this should be raised instead of
    TestFailed.
    """
verbose = 1 # Flag set to 0 by regrtest.py
def unload(name):
    """Remove module *name* from sys.modules so that a subsequent
    import re-executes the module from scratch."""
    import sys
    try:
        del sys.modules[name]
    except KeyError:
        # Module was never imported (or already unloaded) -- nothing to do.
        pass
def forget(modname):
    """Fully forget module *modname*: unload it from sys.modules and
    delete its compiled .pyc file from every directory on sys.path, so
    a later import recompiles the module from source."""
    unload(modname)
    import sys, os
    compiled = modname + '.pyc'  # same basename in every directory
    for d in sys.path:
        try:
            os.unlink(os.path.join(d, compiled))
        except os.error:
            # No .pyc there (or not removable); keep scanning the path.
            pass
FUZZ = 1e-6
def fcmp(x, y): # fuzzy comparison function
    """Compare x and y like cmp(), but treat floats as equal when they
    differ by less than a relative FUZZ, and compare same-type
    tuples/lists element-wise (recursively), then by length.

    NOTE: relies on the Python 2 builtins coerce() and cmp().
    """
    # Float path: coerce to a common numeric type and compare within a
    # fuzz proportional to the magnitudes involved.
    if type(x) == type(0.0) or type(y) == type(0.0):
        try:
            x, y = coerce(x, y)
            fuzz = (abs(x) + abs(y)) * FUZZ
            if abs(x-y) <= fuzz:
                return 0
        except:
            # coerce() can fail for incompatible types; fall through
            # to the plain cmp() below.
            pass
    elif type(x) == type(y) and type(x) in (type(()), type([])):
        # Same-type sequences: the first differing element decides...
        for i in range(min(len(x), len(y))):
            outcome = fcmp(x[i], y[i])
            if outcome <> 0:
                return outcome
        # ...otherwise the shorter sequence compares as smaller.
        return cmp(len(x), len(y))
    return cmp(x, y)
TESTFN = '@test' # Filename used for testing
# NOTE(review): `unlink` is not used in this module; presumably it is
# re-exported so tests can `from test_support import unlink` to clean up
# TESTFN -- confirm against callers.
from os import unlink
def findfile(file, here=__file__):
    """Locate *file* and return a usable path to it.

    Absolute paths are returned untouched.  A relative name is looked
    up first in the directory containing *here* (this module by
    default), then along sys.path; if nothing exists, the name is
    returned unchanged.
    """
    import os
    import sys
    if os.path.isabs(file):
        return file
    search_dirs = [os.path.dirname(here)] + sys.path
    for directory in search_dirs:
        candidate = os.path.join(directory, file)
        if os.path.exists(candidate):
            return candidate
    return file