git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@419 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2002-05-05 15:02:58 +00:00
parent 94abc8d989
commit fd0b4d2bcb
19 changed files with 182 additions and 126 deletions

22
FAQ
View file

@ -19,9 +19,9 @@ A: The difference between LinkChecker and web browsers is the type of HTTP
Q: How can I tell LinkChecker which proxy to use?
A: LinkChecker works transparently with proxies. In a Unix or Windows
environment, set the http_proxy, ftp_proxy or gopher_proxy environment
variables to a URL that identifies the proxy server before starting
LinkChecker. For example
environment, set the http_proxy, https_proxy, ftp_proxy or gopher_proxy
environment variables to a URL that identifies the proxy server before
starting LinkChecker. For example
# http_proxy="http://www.someproxy.com:3128"
# export http_proxy
@ -94,3 +94,19 @@ A: Cookies can not store more information as is in the HTTP request itself,
This could be used to "track" subsequent requests to this server.
Cookies are only stored in memory. After LinkChecker finishes, they
are lost. So the tracking is restricted to the checking time.
Q: I want to have my own logging class. How can I use it in LinkChecker?
A: Currently, only a Python API lets you define new logging classes.
Define your own logging class as a subclass of linkcheck.Logging.Logger
or any other standard logging class in that module.
Then call the addLogger function in Config.Configuration to register
your new Logger.
After this, append a new logger instance to the fileoutput.
import linkcheck, MyLogger
log_format = 'mylog'
log_args = {'fileoutput': log_format, 'filename': 'foo.txt'}
cfg = linkcheck.Config.Configuration()
cfg.addLogger(log_format, MyLogger.MyLogger)
cfg['fileoutput'].append(cfg.newLogger(log_format, log_args))

9
debian/changelog vendored
View file

@ -1,8 +1,13 @@
linkchecker (1.5.0) unstable; urgency=low
* More syntax checking for host:port network locations
* More syntax checking for host:port network locations (from 1.4.4)
* NntpUrlData.py: prevent endless loops with busy NNTP servers sending
504 or 505 error status. (from 1.4.5)
* Cookie support
* Python API for custom loggers
* Move TestLogger into test_support
-- Bastian Kleineidam <calvin@debian.org> Sat, 4 May 2002 00:21:45 +0200
-- Bastian Kleineidam <calvin@debian.org> Sun, 5 May 2002 14:11:01 +0200
linkchecker (1.4.3) unstable; urgency=low

View file

@ -38,6 +38,7 @@ Freeware = AppName+""" comes with ABSOLUTELY NO WARRANTY!
This is free software, and you are welcome to redistribute it
under certain conditions. Look at the file `LICENSE' within this
distribution."""
# default logger classes
Loggers = {
"text": Logging.StandardLogger,
"html": Logging.HtmlLogger,
@ -47,7 +48,6 @@ Loggers = {
"csv": Logging.CSVLogger,
"blacklist": Logging.BlacklistLogger,
"xml": Logging.XMLLogger,
"test": Logging.TestLogger,
}
# for easy printing: a comma separated logger list
LoggerKeys = reduce(lambda x, y: x+", "+y, Loggers.keys())
@ -93,7 +93,7 @@ class Configuration (UserDict.UserDict):
"""Initialize the default options"""
UserDict.UserDict.__init__(self)
self.reset()
# we use this variable to delay the calling of
# we use "reduceCount" to delay the calling of
# Threader.reduceThreads() because we would call it too often.
# Therefore we count this variable up to 5 and then we call
# reduceThreads(). Ok, this is a hack but ItWorksForMe(tm).
@ -166,7 +166,6 @@ class Configuration (UserDict.UserDict):
self['xml'] = {
"filename": "linkchecker-out.xml",
}
self['test'] = {} # no args for test logger
self['log'] = self.newLogger('text')
self["quiet"] = 0
self["warningregex"] = None
@ -304,6 +303,12 @@ class Configuration (UserDict.UserDict):
args.update(dict)
return apply(Loggers[logtype], (), args)
def addLogger(self, logtype, loggerClass, logargs=None):
    """Register a new logger type.

    Adds loggerClass to the global Loggers registry under the key
    logtype, and stores its default constructor arguments in the
    configuration so newLogger(logtype) can instantiate it later.

    logargs defaults to a fresh empty dict per call; the previous
    shared mutable default ({}) meant every logger registered
    without explicit args shared ONE dict, so mutating the stored
    args of one would silently change them all.
    """
    global Loggers
    if logargs is None:
        logargs = {}
    Loggers[logtype] = loggerClass
    self[logtype] = logargs
def incrementLinknumber_NoThreads (self):
self['linknumber'] += 1

View file

@ -1,6 +1,6 @@
# __init__.py for DNS class.
class Error(Exception):
class Error (Exception):
    """Base exception raised by the DNS API."""

    def __str__ (self):
        # The message is fixed; the exception class itself carries the meaning.
        return 'DNS API error'

View file

@ -46,7 +46,7 @@ EntityTable = {
'\'': '&apos;',
}
def quote(s):
def quote (s):
res = list(s)
for i in range(len(res)):
c = res[i]
@ -54,35 +54,35 @@ def quote(s):
return ''.join(res)
# return formatted time
def _strtime(t):
def _strtime (t):
return time.strftime("%d.%m.%Y %H:%M:%S", time.localtime(t))
class Logger:
def __init__(self, **args):
def __init__ (self, **args):
self.logfields = None # all fields
if args.has_key('fields'):
if "all" not in args['fields']:
self.logfields = args['fields']
def logfield(self, name):
def logfield (self, name):
    """Return true if the field *name* should appear in the log output."""
    if self.logfields is not None:
        return name in self.logfields
    # A logfields of None means "log all fields".
    return 1
def init(self):
def init (self):
raise Exception, "abstract function"
def newUrl(self, urlData):
def newUrl (self, urlData):
raise Exception, "abstract function"
def endOfOutput(self, linknumber=-1):
def endOfOutput (self, linknumber=-1):
raise Exception, "abstract function"
class StandardLogger(Logger):
class StandardLogger (Logger):
"""Standard text logger.
Every Logger has to implement the following functions:
@ -116,7 +116,7 @@ __init__(self, **args)
Unknown keywords will be ignored.
"""
def __init__(self, **args):
def __init__ (self, **args):
apply(Logger.__init__, (self,), args)
self.errors = 0
#self.warnings = 0
@ -128,7 +128,7 @@ __init__(self, **args)
self.fd = sys.stdout
def init(self):
def init (self):
if self.fd is None: return
self.starttime = time.time()
if self.logfield('intro'):
@ -139,7 +139,7 @@ __init__(self, **args)
self.fd.flush()
def newUrl(self, urlData):
def newUrl (self, urlData):
if self.fd is None: return
if self.logfield('url'):
self.fd.write("\n"+linkcheck._(LogFields['url'])+Spaces['url']+urlData.urlName)
@ -186,7 +186,7 @@ __init__(self, **args)
self.fd.flush()
def endOfOutput(self, linknumber=-1):
def endOfOutput (self, linknumber=-1):
if self.fd is None: return
if self.logfield('outro'):
self.fd.write(linkcheck._("\nThats it. "))
@ -232,10 +232,10 @@ HTML_HEADER = """<!DOCTYPE html PUBLIC "-//W3C//DTD html 4.0//EN">
<body bgcolor=%s link=%s vlink=%s alink=%s>
"""
class HtmlLogger(StandardLogger):
class HtmlLogger (StandardLogger):
"""Logger with HTML output"""
def __init__(self, **args):
def __init__ (self, **args):
apply(StandardLogger.__init__, (self,), args)
self.colorbackground = args['colorbackground']
self.colorurl = args['colorurl']
@ -245,7 +245,7 @@ class HtmlLogger(StandardLogger):
self.tableerror = args['tableerror']
self.tableok = args['tableok']
def init(self):
def init (self):
if self.fd is None: return
self.starttime = time.time()
self.fd.write(HTML_HEADER%(Config.App, self.colorbackground,
@ -258,7 +258,7 @@ class HtmlLogger(StandardLogger):
self.fd.flush()
def newUrl(self, urlData):
def newUrl (self, urlData):
if self.fd is None: return
self.fd.write('<table align=left border=0 cellspacing=0'
' cellpadding=1 bgcolor='+self.colorborder+' summary=Border'
@ -321,7 +321,7 @@ class HtmlLogger(StandardLogger):
self.fd.flush()
def endOfOutput(self, linknumber=-1):
def endOfOutput (self, linknumber=-1):
if self.fd is None: return
if self.logfield("outro"):
self.fd.write(linkcheck._("\nThats it. "))
@ -362,10 +362,10 @@ class HtmlLogger(StandardLogger):
self.fd = None
class ColoredLogger(StandardLogger):
class ColoredLogger (StandardLogger):
"""ANSI colorized output"""
def __init__(self, **args):
def __init__ (self, **args):
esc="\x1b[%sm"
apply(StandardLogger.__init__, (self,), args)
self.colorparent = esc % args['colorparent']
@ -382,7 +382,7 @@ class ColoredLogger(StandardLogger):
self.currentPage = None
self.prefix = 0
def newUrl(self, urlData):
def newUrl (self, urlData):
if self.fd is None: return
if self.logfield("parenturl"):
if urlData.parentName:
@ -474,7 +474,7 @@ class ColoredLogger(StandardLogger):
self.fd.flush()
def endOfOutput(self, linknumber=-1):
def endOfOutput (self, linknumber=-1):
if self.fd is None: return
if self.logfield("outro"):
if self.prefix:
@ -483,16 +483,16 @@ class ColoredLogger(StandardLogger):
class GMLLogger(StandardLogger):
class GMLLogger (StandardLogger):
"""GML means Graph Modeling Language. Use a GML tool to see
your sitemap graph.
"""
def __init__(self, **args):
def __init__ (self, **args):
apply(StandardLogger.__init__, (self,), args)
self.nodes = {}
self.nodeid = 0
def init(self):
def init (self):
if self.fd is None: return
self.starttime = time.time()
if self.logfield("intro"):
@ -505,7 +505,7 @@ class GMLLogger(StandardLogger):
self.fd.flush()
def newUrl(self, urlData):
def newUrl (self, urlData):
"""write one node and all possible edges"""
if self.fd is None: return
node = urlData
@ -527,7 +527,7 @@ class GMLLogger(StandardLogger):
self.writeEdges()
def writeEdges(self):
def writeEdges (self):
"""write all edges we can find in the graph in a brute-force
manner. Better would be a mapping of parent urls.
"""
@ -545,7 +545,7 @@ class GMLLogger(StandardLogger):
self.fd.flush()
def endOfOutput(self, linknumber=-1):
def endOfOutput (self, linknumber=-1):
if self.fd is None: return
self.fd.write("]\n")
if self.logfield("outro"):
@ -566,15 +566,15 @@ class GMLLogger(StandardLogger):
class XMLLogger(StandardLogger):
class XMLLogger (StandardLogger):
"""XML output mirroring the GML structure. Easy to parse with any XML
tool."""
def __init__(self, **args):
def __init__ (self, **args):
apply(StandardLogger.__init__, (self,), args)
self.nodes = {}
self.nodeid = 0
def init(self):
def init (self):
if self.fd is None: return
self.starttime = time.time()
self.fd.write('<?xml version="1.0"?>\n')
@ -589,7 +589,7 @@ class XMLLogger(StandardLogger):
self.fd.write('<GraphXML>\n<graph isDirected="true">\n')
self.fd.flush()
def newUrl(self, urlData):
def newUrl (self, urlData):
"""write one node and all possible edges"""
if self.fd is None: return
node = urlData
@ -615,7 +615,7 @@ class XMLLogger(StandardLogger):
self.fd.write(" </node>\n")
self.writeEdges()
def writeEdges(self):
def writeEdges (self):
"""write all edges we can find in the graph in a brute-force
manner. Better would be a mapping of parent urls.
"""
@ -636,7 +636,7 @@ class XMLLogger(StandardLogger):
self.fd.write(" </edge>\n")
self.fd.flush()
def endOfOutput(self, linknumber=-1):
def endOfOutput (self, linknumber=-1):
if self.fd is None: return
self.fd.write("</graph>\n</GraphXML>\n")
if self.logfield("outro"):
@ -658,15 +658,15 @@ class XMLLogger(StandardLogger):
class SQLLogger(StandardLogger):
class SQLLogger (StandardLogger):
""" SQL output for PostgreSQL, not tested"""
def __init__(self, **args):
def __init__ (self, **args):
apply(StandardLogger.__init__, (self,), args)
self.dbname = args['dbname']
self.separator = args['separator']
def init(self):
def init (self):
if self.fd is None: return
self.starttime = time.time()
if self.logfield("intro"):
@ -677,7 +677,7 @@ class SQLLogger(StandardLogger):
Config.Email))
self.fd.flush()
def newUrl(self, urlData):
def newUrl (self, urlData):
if self.fd is None: return
self.fd.write("insert into %s(urlname,recursionlevel,parentname,"
"baseref,errorstring,validstring,warningstring,infoString,"
@ -702,7 +702,7 @@ class SQLLogger(StandardLogger):
self.separator))
self.fd.flush()
def endOfOutput(self, linknumber=-1):
def endOfOutput (self, linknumber=-1):
if self.fd is None: return
if self.logfield("outro"):
self.stoptime = time.time()
@ -721,28 +721,28 @@ class SQLLogger(StandardLogger):
self.fd = None
class BlacklistLogger(Logger):
class BlacklistLogger (Logger):
"""Updates a blacklist of wrong links. If a link on the blacklist
is working (again), it is removed from the list. So after n days
we have only links on the list which failed for n days.
"""
def __init__(self, **args):
def __init__ (self, **args):
apply(Logger.__init__, (self,), args)
self.errors = 0
self.blacklist = {}
self.filename = args['filename']
def init(self):
def init (self):
pass
def newUrl(self, urlData):
def newUrl (self, urlData):
if urlData.valid:
self.blacklist[urlData.getCacheKey()] = None
elif not urlData.cached:
self.errors = 1
self.blacklist[urlData.getCacheKey()] = urlData
def endOfOutput(self, linknumber=-1):
def endOfOutput (self, linknumber=-1):
"""write the blacklist"""
fd = open(self.filename, "w")
for url in self.blacklist.keys():
@ -750,15 +750,15 @@ class BlacklistLogger(Logger):
fd.write(url+"\n")
class CSVLogger(StandardLogger):
class CSVLogger (StandardLogger):
""" CSV output. CSV consists of one line per entry. Entries are
separated by a semicolon.
"""
def __init__(self, **args):
def __init__ (self, **args):
apply(StandardLogger.__init__, (self,), args)
self.separator = args['separator']
def init(self):
def init (self):
if self.fd is None: return
self.starttime = time.time()
if self.logfield("intro"):
@ -785,7 +785,7 @@ class CSVLogger(StandardLogger):
"# cached;\n")
self.fd.flush()
def newUrl(self, urlData):
def newUrl (self, urlData):
if self.fd is None: return
self.fd.write(
"%s%s%d%s%s%s%s%s%s%s%s%s%s%s%s%s%d%s%s%s%d%s%s%s%d%s%d%s%d\n" % (
@ -807,7 +807,7 @@ class CSVLogger(StandardLogger):
self.fd.flush()
def endOfOutput(self, linknumber=-1):
def endOfOutput (self, linknumber=-1):
if self.fd is None: return
self.stoptime = time.time()
if self.logfield("outro"):
@ -824,29 +824,3 @@ class CSVLogger(StandardLogger):
self.fd.flush()
self.fd = None
class TestLogger(Logger):
""" Output for regression test """
def init(self):
pass
def newUrl(self, urlData):
print 'url',urlData.urlName
if urlData.cached:
print "cached"
if urlData.name:
print "name",urlData.name
if urlData.baseRef:
print "baseurl",urlData.baseRef
if urlData.infoString:
print "info",urlData.infoString
if urlData.warningString:
print "warning",urlData.warningString
if urlData.valid:
print "valid"
else:
print "error"
def endOfOutput(self, linknumber=-1):
pass

View file

@ -388,21 +388,34 @@ class UrlData:
if not (self.config["externlinks"] or self.config["internlinks"]):
return 0
# deny and allow external checking
Config.debug(HURT_ME_PLENTY, "Url", self.url)
if self.config["denyallow"]:
for pat, strict in self.config["externlinks"]:
if pat.search(self.url):
return (1, strict)
for pat in self.config["internlinks"]:
if pat.search(self.url):
return 0
for entry in self.config["externlinks"]:
Config.debug(HURT_ME_PLENTY, "Extern entry", entry)
match = entry['pattern'].search(self.url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
return (1, entry['strict'])
for entry in self.config["internlinks"]:
Config.debug(HURT_ME_PLENTY, "Intern entry", entry)
match = entry['pattern'].search(self.url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
return 1
return 0
else:
for pat in self.config["internlinks"]:
if pat.search(self.url):
for entry in self.config["internlinks"]:
Config.debug(HURT_ME_PLENTY, "Intern entry", entry)
match = entry['pattern'].search(self.url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
return 0
for pat, strict in self.config["externlinks"]:
if pat.search(self.url):
return (1, strict)
for entry in self.config["externlinks"]:
Config.debug(HURT_ME_PLENTY, "Extern entry", entry)
match = entry['pattern'].search(self.url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
return (1, entry['strict'])
return (1,0)
raise linkcheck.error, "internal error in UrlData._getExtern"

View file

@ -1,13 +1,13 @@
"""Supporting definitions for the Python regression test."""
import linkcheck
class Error(Exception):
class Error (Exception):
"""Base class for regression test exceptions."""
class TestFailed(Error):
class TestFailed (Error):
"""Test failed."""
class TestSkipped(Error):
class TestSkipped (Error):
"""Test skipped.
This can be raised to indicate that a test was deliberately
@ -21,14 +21,14 @@ class TestSkipped(Error):
verbose = 1 # Flag set to 0 by regrtest.py
def unload(name):
def unload (name):
    """Remove the named module from sys.modules; absent names are ignored."""
    import sys
    # pop with a default is equivalent to del guarded by KeyError.
    sys.modules.pop(name, None)
def forget(modname):
def forget (modname):
unload(modname)
import sys, os
for dirname in sys.path:
@ -39,7 +39,7 @@ def forget(modname):
FUZZ = 1e-6
def fcmp(x, y): # fuzzy comparison function
def fcmp (x, y): # fuzzy comparison function
if type(x) == type(0.0) or type(y) == type(0.0):
try:
x, y = coerce(x, y)
@ -59,7 +59,7 @@ def fcmp(x, y): # fuzzy comparison function
TESTFN = '@test' # Filename used for testing
from os import unlink
def findfile(file, here=__file__):
def findfile (file, here=__file__):
import os
if os.path.isabs(file):
return file
@ -70,3 +70,29 @@ def findfile(file, here=__file__):
fn = os.path.join(dn, file)
if os.path.exists(fn): return fn
return file
class TestLogger (linkcheck.Logging.Logger):
    """ Output for regression test """
    def init (self):
        # No header/intro output is wanted in regression test logs.
        pass
    def newUrl (self, urlData):
        # Emit a deterministic, line-oriented dump of the checked URL so a
        # regression run can be diffed against a stored expected-output file.
        # Only fields that are set on urlData produce a line.
        print 'url',urlData.urlName
        if urlData.cached:
            print "cached"
        if urlData.name:
            print "name",urlData.name
        if urlData.baseRef:
            print "baseurl",urlData.baseRef
        if urlData.infoString:
            print "info",urlData.infoString
        if urlData.warningString:
            print "warning",urlData.warningString
        if urlData.valid:
            print "valid"
        else:
            print "error"
    def endOfOutput (self, linknumber=-1):
        # Nothing to flush or summarize at the end of a test run.
        pass

View file

@ -125,8 +125,8 @@ o If you have your system configured to automatically establish a
o Javascript links are currently ignored.
o If your platform does not support threading, LinkChecker uses -t0.
o You can supply multiple user/password pairs in a configuration file.
o Cookies are not accepted by LinkChecker.
o To use proxies set $http_proxy, $https_proxy on Unix or Windows.
o To use proxies set $http_proxy, $https_proxy, $ftp_proxy, $gopher_proxy
on Unix or Windows.
On a Mac use the Internet Config.
o When checking 'news:' links the given NNTP host doesn't need to be the
same as the host of the user browsing your pages!
@ -161,6 +161,19 @@ def printUsage (msg):
sys.exit(1)
def getLinkPat (arg):
    """Turn a command line link pattern argument into a pattern dict.

    A leading '!' negates the pattern.  The result holds the compiled
    regular expression, the negate flag, and a default strict flag of 0.
    """
    negate = 0
    pattern = arg
    if pattern.startswith('!'):
        negate = 1
        pattern = pattern[1:]
    return {"pattern": re.compile(pattern),
            "negate": negate,
            "strict": 0}
# Read command line arguments
try:
# Note: cut out the name of the script
@ -220,7 +233,7 @@ for opt,arg in options:
config["anchors"] = 1
elif opt=="-e" or opt=="--extern":
config["externlinks"].append((re.compile(arg), 0))
config["externlinks"].append(getLinkPat(arg))
elif opt=="-h" or opt=="--help":
printHelp()
@ -248,7 +261,7 @@ for opt,arg in options:
config['interactive'] = 1
elif opt=="-i" or opt=="--intern":
config["internlinks"].append(re.compile(arg))
config["internlinks"].append(getLinkPat(arg))
elif opt=="-l" or opt=="--denyallow":
config["denyallow"] = 1

View file

@ -1 +1 @@
# Dummy file to make this directory a package.
"Dummy file to make this directory a package."

View file

@ -14,13 +14,17 @@ url news:
warning No NNTP server specified, skipping this URL
valid
url nntp://news.rz.uni-sb.de/comp.lang.python
warning NNTP busy: 505 Connection rejected, you're making too many connects per minute
error
url nntp://news.rz.uni-sb.de/comp.lang.python/1-5
warning NNTP busy: 505 Connection rejected, you're making too many connects per minute
error
url nntp://news.rz.uni-sb.de/EFGJG4.7A@deshaw.com
warning NNTP busy: 505 Connection rejected, you're making too many connects per minute
error
url nntp://news.rz.uni-sb.de/
warning No newsgroup specified in NNTP URL
warning NNTP busy: 505 Connection rejected, you're making too many connects per minute
No newsgroup specified in NNTP URL
valid
url news:comp.lang.python/1-5
warning No NNTP server specified, skipping this URL

View file

@ -1,6 +1,6 @@
import os
import linkcheck
import os, linkcheck
config = linkcheck.Config.Configuration()
config.addLogger('test', linkcheck.test_support.TestLogger)
config['recursionlevel'] = 1
config['log'] = config.newLogger('test')
config["anchors"] = 1

View file

@ -1,6 +1,6 @@
import os
import linkcheck
import os, linkcheck
config = linkcheck.Config.Configuration()
config.addLogger('test', linkcheck.test_support.TestLogger)
config['recursionlevel'] = 1
config['log'] = config.newLogger('test')
config["anchors"] = 1

View file

@ -1,6 +1,6 @@
import os
import linkcheck
import os, linkcheck
config = linkcheck.Config.Configuration()
config.addLogger('test', linkcheck.test_support.TestLogger)
config['recursionlevel'] = 1
config['log'] = config.newLogger('test')
config["anchors"] = 1

View file

@ -1,6 +1,6 @@
import os
import linkcheck
import os, linkcheck
config = linkcheck.Config.Configuration()
config.addLogger('test', linkcheck.test_support.TestLogger)
config['recursionlevel'] = 1
config['log'] = config.newLogger('test')
config["anchors"] = 1

View file

@ -1,6 +1,6 @@
import os
import linkcheck
import os, linkcheck
config = linkcheck.Config.Configuration()
config.addLogger('test', linkcheck.test_support.TestLogger)
config['recursionlevel'] = 1
config['log'] = config.newLogger('test')
config["anchors"] = 1

View file

@ -1,6 +1,6 @@
import os
import linkcheck
import os, linkcheck
config = linkcheck.Config.Configuration()
config.addLogger('test', linkcheck.test_support.TestLogger)
config['recursionlevel'] = 1
config['log'] = config.newLogger('test')
config["anchors"] = 1

View file

@ -1,6 +1,6 @@
import os
import linkcheck
import os, linkcheck
config = linkcheck.Config.Configuration()
config.addLogger('test', linkcheck.test_support.TestLogger)
config['recursionlevel'] = 1
config['log'] = config.newLogger('test')
config["anchors"] = 1

View file

@ -1,6 +1,6 @@
import os
import linkcheck
import os, linkcheck
config = linkcheck.Config.Configuration()
config.addLogger('test', linkcheck.test_support.TestLogger)
config['recursionlevel'] = 1
config['log'] = config.newLogger('test')
config["anchors"] = 1

View file

@ -1,6 +1,6 @@
import os
import linkcheck
import os, linkcheck
config = linkcheck.Config.Configuration()
config.addLogger('test', linkcheck.test_support.TestLogger)
config['recursionlevel'] = 1
config['log'] = config.newLogger('test')
config["anchors"] = 1