diff --git a/Makefile b/Makefile index c53e85f6..58fda10f 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ VERSION=$(shell python setup.py --version) PACKAGE = linkchecker NAME = $(shell python setup.py --name) HOST=fsinfo.cs.uni-sb.de -LCOPTS=-ocolored -Ftext -Fhtml -Fgml -Fsql -Fcsv -R -t0 -v +LCOPTS=-ocolored -Ftext -Fhtml -Fgml -Fsql -Fcsv -Fxml -R -t0 -v DEBPACKAGE = $(PACKAGE)_$(VERSION)_i386.deb SOURCES = \ linkcheck/Config.py \ diff --git a/README b/README index 6f1b1843..548b3653 100644 --- a/README +++ b/README @@ -26,14 +26,16 @@ Installing, Requirements, Running Read the file INSTALL. -License --------- +License and Credits +------------------- LinkChecker is licensed under the GNU Public License. Credits go to Guido van Rossum for making Python. His hovercraft is full of eels! As this program is directly derived from my Java link checker, additional credits go to Robert Forsman (the author of JCheckLinks) and his robots.txt parse algorithm. +Nicolas Chauvat supplied a patch for +an XML output logger. I want to thank everybody who gave me feedback, bug reports and suggestions. 
diff --git a/debian/changelog b/debian/changelog index 20ff52eb..e74d3a0e 100644 --- a/debian/changelog +++ b/debian/changelog @@ -2,8 +2,10 @@ linkchecker (1.2.8) unstable; urgency=low * INSTALL: more documentation for the CGI scripts * Makefile: better cleaning (clean, cleandeb, distclean) + * XML output (idea and patch from Nicolas Chauvat + ) - -- Bastian Kleineidam Fri, 10 Nov 2000 11:54:25 +0100 + -- Bastian Kleineidam Wed, 15 Nov 2000 23:27:37 +0100 linkchecker (1.2.7) unstable; urgency=low diff --git a/debian/dirs b/debian/dirs index 3a9f7bd7..ee19d5d1 100644 --- a/debian/dirs +++ b/debian/dirs @@ -1,5 +1 @@ -usr/lib/python1.5/site-packages/DNS -usr/lib/python1.5/site-packages/linkcheck -usr/bin etc -usr/share/doc/linkchecker diff --git a/linkcheck/Config.py b/linkcheck/Config.py index 09085d01..1456664c 100644 --- a/linkcheck/Config.py +++ b/linkcheck/Config.py @@ -53,6 +53,7 @@ Loggers = { "sql": Logging.SQLLogger, "csv": Logging.CSVLogger, "blacklist": Logging.BlacklistLogger, + "xml": Logging.XMLLogger, } # for easy printing: a comma separated logger list LoggerKeys = reduce(lambda x, y: x+", "+y, Loggers.keys()) @@ -143,6 +144,9 @@ class Configuration(UserDict.UserDict): self.data['blacklist'] = { "filename": "~/.blacklist", } + self.data['xml'] = { + "filename": "linkchecker-out.xml", + } # default values self.data['log'] = self.newLogger('text') self.data["quiet"] = 0 @@ -470,4 +474,3 @@ class Configuration(UserDict.UserDict): except ConfigParser.Error: pass try: self.data["allowdeny"] = cfgparser.getboolean(section, "allowdeny") except ConfigParser.Error: pass - diff --git a/linkcheck/Logging.py b/linkcheck/Logging.py index f008c60c..2cd5df92 100644 --- a/linkcheck/Logging.py +++ b/linkcheck/Logging.py @@ -401,7 +401,8 @@ class GMLLogger(StandardLogger): """ def __init__(self, **args): apply(StandardLogger.__init__, (self,), args) - self.nodes = [] + self.nodes = {} + self.nodeid = 0 def init(self): self.starttime = time.time() @@ -413,40 +414,42 @@ 
class GMLLogger(StandardLogger): self.fd.write("graph [\n directed 1\n") self.fd.flush() - def newUrl(self, urlData): - self.nodes.append(urlData) - def endOfOutput(self, linknumber=-1): - writtenNodes = {} - # write nodes - nodeid = 1 - for node in self.nodes: - if node.url and not writtenNodes.has_key(node.url): - self.fd.write(" node [\n") - self.fd.write(" id %d\n" % nodeid) - self.fd.write(' label "%s"\n' % node.url) - if node.downloadtime: - self.fd.write(" dltime %d\n" % node.downloadtime) - if node.checktime: - self.fd.write(" checktime %d\n" % node.checktime) - self.fd.write(" extern ") - if node.extern: self.fd.write("1") - else: self.fd.write("0") - self.fd.write("\n ]\n") - writtenNodes[node.url] = nodeid - nodeid = nodeid + 1 - # write edges - for node in self.nodes: - if node.url and node.parentName: + def newUrl(self, urlData): + """write one node and all possible edges""" + node = urlData + if node.url and not self.nodes.has_key(node.url): + node.id = self.nodeid + self.nodes[node.url] = node + self.nodeid = self.nodeid + 1 + self.fd.write(" node [\n") + self.fd.write(" id %d\n" % node.id) + self.fd.write(' label "%s"\n' % node.url) + if node.downloadtime: + self.fd.write(" dltime %d\n" % node.downloadtime) + if node.checktime: + self.fd.write(" checktime %d\n" % node.checktime) + self.fd.write(" extern "+(node.extern and "1" or "0")) + self.fd.write("\n ]\n") + self.writeEdges() + + + def writeEdges(self): + """write all edges we can find in the graph in a brute-force + manner. Better would be a mapping of parent urls. 
+ """ + for node in self.nodes.values(): + if self.nodes.has_key(node.parentName): self.fd.write(" edge [\n") self.fd.write(' label "%s"\n' % node.urlName) - self.fd.write(" source %d\n"%writtenNodes[node.parentName]) - self.fd.write(" target %d\n" % writtenNodes[node.url]) - self.fd.write(" valid ") - if node.valid: self.fd.write("1") - else: self.fd.write("0") + self.fd.write(" source %d\n" % self.nodes[node.parentName].id) + self.fd.write(" target %d\n" % node.id) + self.fd.write(" valid "+(node.valid and "1" or "0")) self.fd.write("\n ]\n") - # end of output + self.fd.flush() + + + def endOfOutput(self, linknumber=-1): self.fd.write("]\n") self.stoptime = time.time() duration = self.stoptime - self.starttime @@ -464,6 +467,86 @@ class GMLLogger(StandardLogger): self.fd = None +class XMLLogger(StandardLogger): + """XML output mirroring the GML structure. Easy to parse with any XML + tool.""" + def __init__(self, **args): + apply(StandardLogger.__init__, (self,), args) + self.nodes = {} + self.nodeid = 0 + + def init(self): + self.starttime = time.time() + self.fd.write("\n") + self.fd.write("\n\n") + self.fd.write("\n\n") + self.fd.flush() + + def newUrl(self, urlData): + """write one node and all possible edges""" + node = urlData + if node.url and not self.nodes.has_key(node.url): + node.id = self.nodeid + self.nodes[node.url] = node + self.nodeid = self.nodeid + 1 + self.fd.write(" \n") + self.fd.write(" \n" % node.url) + self.fd.write(" \n") + if node.downloadtime: + self.fd.write(" %d\n" \ + % node.downloadtime) + if node.checktime: + self.fd.write(" %d\n" \ + % node.checktime) + self.fd.write(" %d\n" % node.extern) + self.fd.write(" \n") + self.fd.write(" \n") + self.writeEdges() + + def writeEdges(self): + """write all edges we can find in the graph in a brute-force + manner. Better would be a mapping of parent urls. 
+ """ + for node in self.nodes.values(): + if self.nodes.has_key(node.parentName): + self.fd.write(" \n") + self.fd.write(" \n" % node.urlName) + self.fd.write(" \n") + self.fd.write(" %d" % (node.valid and 1 or 0)) + self.fd.write(" \n") + self.fd.write(" \n") + self.fd.flush() + + def endOfOutput(self, linknumber=-1): + self.fd.write("\n\n") + self.stoptime = time.time() + duration = self.stoptime - self.starttime + name = _("seconds") + self.fd.write("") + self.fd.flush() + self.fd = None + + + class SQLLogger(StandardLogger): """ SQL output for PostgreSQL, not tested""" def __init__(self, **args):