From 461b37ac335c71df32e4e7c50441868aa8a8bca3 Mon Sep 17 00:00:00 2001 From: calvin Date: Fri, 28 Apr 2000 11:17:58 +0000 Subject: [PATCH] CSV output git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@76 e7d03fd6-7b0d-0410-9947-9c21f3af8025 --- CSV.py | 437 +++++++++++++++++++++++++++++++++++++ INSTALL | 11 +- README | 3 +- debian/changelog | 5 +- linkcheck/Config.py | 30 ++- linkcheck/Logging.py | 30 +++ linkcheck/MailtoUrlData.py | 36 +-- setup.py | 37 +++- test/mail.html | 14 ++ test/test2.html | 5 - 10 files changed, 558 insertions(+), 50 deletions(-) create mode 100644 CSV.py create mode 100644 test/mail.html diff --git a/CSV.py b/CSV.py new file mode 100644 index 00000000..264f74b9 --- /dev/null +++ b/CSV.py @@ -0,0 +1,437 @@ +# +# CSV 0.17 8 June 1999 Copyright ©Laurence Tratt 1998 - 1999 +# e-mail: tratt@dcs.kcl.ac.uk +# home-page: http://eh.org/~laurie/comp/python/csv/index.html +# +# +# +# CSV.py is copyright ©1998 - 1999 by Laurence Tratt +# +# All rights reserved +# +# Permission to use, copy, modify, and distribute this software and its +# documentation for any purpose and without fee is hereby granted, provided that +# the above copyright notice appear in all copies and that both that copyright +# notice and this permission notice appear in supporting documentation. +# +# THE AUTHOR - LAURENCE TRATT - DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS +# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN +# NO EVENT SHALL THE AUTHOR FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +# ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN +# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTUOUS ACTION, ARISING OUT OF OR +# IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+#
+
+
+import re, string, types, UserList
+
+
+###################################################################################################
+#
+# CSV class
+#
+
+
+class CSV(UserList.UserList):
+
+    """ Manage a CSV (comma separated values) file
+
+        The data is held in a list.
+
+        Methods:
+          __init__()
+          load()    load from file
+          save()    save to file
+          input()   input from string
+          output()  save to string
+          append()  appends one entry
+          __str__() printable representation
+    """
+
+
+
+    def __init__(self, separator = ','):
+
+        """ Initialise CSV class instance.
+
+            Arguments:
+              separator : The field delimiter. Defaults to ','
+        """
+
+        self.separator = separator
+
+        self.data = []
+        self.fields__title__have = self.fields__title = None
+
+
+
+    def load(self, file__data__name, fields__title__have, convert_numbers = 0, separator = None, comments = None):
+
+        """ Load up a CSV file
+
+            Arguments:
+              file__data__name    : The name of the CSV file
+              fields__title__have : 0         : file has no title fields
+                                    otherwise : file has title fields
+              convert_numbers     : 0         : store everything as strings
+                                    otherwise : store fields that can be converted
+                                                to ints or floats to that Python
+                                                type (defaults to 0)
+              separator           : The field delimiter (optional)
+              comments            : A list of strings and regular expressions to remove comments
+        """
+
+        file__data = open(file__data__name, 'r')
+        try:
+            self.input(file__data.read(-1), fields__title__have, convert_numbers, separator or self.separator, comments or ["#"])
+        finally:
+            file__data.close()
+
+
+
+    def save(self, file__data__name, separator = None):
+
+        """ Save data to CSV file.
+
+            Arguments:
+              file__data__name : The name of the CSV file to save to
+              separator        : The field delimiter (optional)
+        """
+
+        file__data = open(file__data__name, 'w')
+        try:
+            file__data.write(self.output(separator or self.separator))
+        finally:
+            file__data.close()
+
+
+
+    def input(self, data, fields__title__have, convert_numbers = 0, separator = None, comments = None):
+
+        """ Take wodge of CSV data & convert it into internal format.
+
+            Arguments:
+              data                : A string containing the CSV data
+              fields__title__have : 0         : file has no title fields
+                                    otherwise : file has title fields
+              convert_numbers     : 0         : store everything as strings
+                                    otherwise : store fields that can be
+                                                converted to ints or
+                                                floats to that Python type
+                                                (defaults to 0)
+              separator           : The field delimiter (Optional)
+              comments            : A list of strings and regular expressions to remove comments
+                                    (defaults to ["#"]); anything else raises Exception
+        """
+
+        def line__process(line, convert_numbers, separator):
+
+            fields = []
+            line__pos = 0
+
+            while line__pos < len(line):
+
+                # Skip any space at the beginning of the field (if there should be leading space,
+                # there should be a " character in the CSV file)
+
+                while line__pos < len(line) and line[line__pos] == " ":
+                    line__pos = line__pos + 1
+
+                field = ""
+                quotes__level = 0
+                while line__pos < len(line):
+
+                    # Skip space at the end of a field (if there is trailing space, it should be
+                    # encompassed by speech marks)
+
+                    if quotes__level == 0 and line[line__pos] == " ":
+                        line__pos__temp = line__pos
+                        while line__pos__temp < len(line) and line[line__pos__temp] == " ":
+                            line__pos__temp = line__pos__temp + 1
+                        if line__pos__temp >= len(line):
+                            break
+                        elif line[line__pos__temp : line__pos__temp + len(separator)] == separator:
+                            line__pos = line__pos__temp
+                    if quotes__level == 0 and line[line__pos : line__pos + len(separator)] == separator:
+                        break
+                    elif line[line__pos] == "\"":
+                        if quotes__level == 0:
+                            quotes__level = 1
+                        else:
+                            quotes__level = 0
+                    else:
+                        field = field + line[line__pos]
+                    line__pos = line__pos + 1
+                line__pos = line__pos + len(separator)
+                if convert_numbers:
+                    for char in field:
+                        if char not in "0123456789.-":
+                            fields.append(field)
+                            break
+                    else:
+                        try:
+                            if "." not in field:
+                                fields.append(int(field))
+                            else:
+                                fields.append(float(field))
+                        except (ValueError, OverflowError):
+                            fields.append(field)
+                else:
+                    fields.append(field)
+            # A line ending in the separator has one more (empty) field after it
+            if line[-len(separator) : ] == separator:
+                fields.append("")
+
+            return fields
+
+
+        separator = separator or self.separator
+        comments = comments or ["#"]
+
+        self.fields__title__have = fields__title__have
+
+        # Remove comments from the input file
+
+        comments__strings = []
+        for comment in comments:
+            if type(comment) == types.StringType:
+                comments__strings.append(comment)
+            elif hasattr(comment, "sub"):
+                data = comment.sub("", data)
+            else:
+                raise Exception("Invalid comment type " + `comment`)
+
+        lines = map(string.strip, string.split(data, "\n"))
+
+        # Remove all comments that are of type string
+
+        lines__pos = 0
+        while lines__pos < len(lines):
+            line = lines[lines__pos]
+            line__pos = 0
+            while line__pos < len(line) and line[line__pos] == " ":
+                line__pos = line__pos + 1
+            found_comment = 0
+            for comment in comments__strings:
+                if line__pos + len(comment) <= len(line) and line[line__pos : line__pos + len(comment)] == comment:
+                    found_comment = 1
+                    break
+            if found_comment:
+                del lines[lines__pos]
+            else:
+                lines__pos = lines__pos + 1
+
+        # Process the input data
+
+        if fields__title__have and lines:
+            self.fields__title = line__process(lines[0], convert_numbers, separator)
+            pos__start = 1
+        else:
+            self.fields__title = []
+            pos__start = 0
+        self.data = []
+        for line in lines[pos__start : ]:
+            if line != "":
+                self.data.append(Entry(line__process(line, convert_numbers, separator), self.fields__title))
+
+
+
+    def output(self, separator = None):
+
+        """ Convert internal data into CSV string.
+
+            Arguments:
+              separator : The field delimiter (optional)
+
+            Returns:
+              String containing CSV data
+        """
+
+        separator = separator or self.separator
+
+
+        def line__make(entry, separator = separator):
+
+            str = ""
+            done__any = 0
+            for field in entry:
+                if done__any:
+                    str = str + separator
+                else:
+                    done__any = 1
+                if type(field) != types.StringType:
+                    field = `field`
+                if len(field) > 0 and (string.find(field, separator) != -1 or (field[0] == " " or field[-1] == " ")):
+                    str = str + "\"" + field + "\""
+                else:
+                    str = str + field
+
+            return str
+
+
+        if self.fields__title__have:
+            str = line__make(self.fields__title) + "\n\n"
+        else:
+            str = ""
+        str = str + string.join(map(line__make, self.data), "\n") + "\n"
+
+        return str
+
+
+
+    def append(self, entry):
+
+        """ Add an entry. """
+
+        if self.fields__title:
+            entry.fields__title = self.fields__title
+        self.data.append(entry)
+
+
+
+    def field__append(self, func, field__title = None):
+
+        """ Append a field with values specified by a function
+
+            Arguments:
+              func         : Function to be called func(entry) to get the value of the new field
+              field__title : Name of new field (if applicable)
+
+        """
+
+        for data__pos in range(len(self)):
+            entry = self.data[data__pos]
+            entry.append(func(entry))
+            self.data[data__pos] = entry
+
+        if self.fields__title__have:
+            self.fields__title.append(field__title)
+
+
+
+    def duplicates__eliminate(self):
+
+        """ Eliminate duplicates (this may result in a reordering of the entries) """
+
+        # To eliminate duplicates, we first get Python to sort the list for us; then all we have to
+        # do is to check to see whether consecutive elements are the same, and delete them
+        # This needs one sort plus one scan rather than comparing every pair of entries
+
+        # XXX Could be done more efficiently for multiple duplicates by deleting a slice of
+        # similar elements rather than deleting them individually
+
+        self.sort()
+        data__pos = 1
+        while data__pos < len(self.data):
+            if self.data[data__pos] == self.data[data__pos - 1]:
+                del self.data[data__pos]
+            else:
+                data__pos = data__pos + 1
+
+
+
+    def __str__(self):
+
+        """ Construct a printable representation of the internal data. """
+
+        columns__width = []
+
+        # Work out the maximum width of each column
+
+        columns__count = len(self.fields__title or [])
+        if self.data:
+            columns__count = len(self.data[0])
+        for column in range(columns__count):
+            if self.fields__title__have:
+                width = len(`self.fields__title[column]`)
+            else:
+                width = 0
+            for entry in self:
+                width__possible = len(`entry.data[column]`)
+                if width__possible > width:
+                    width = width__possible
+            columns__width.append(width)
+
+        if self.fields__title__have:
+            str = string.join(map(string.ljust, self.fields__title, columns__width), " ") + "\n\n"
+        else:
+            str = ""
+        for entry in self:
+            str = str + string.join(map(string.ljust, map(lambda a : (type(a) == types.StringType and [a] or [`a`])[0], entry.data), columns__width), " ") + "\n"
+
+        return str
+
+
+
+###################################################################################################
+#
+# CSV data entry class
+#
+#
+
+
+class Entry(UserList.UserList):
+
+    """ CSV data entry, UserList subclass.
+
+        Has the same properties as a list, but has a few dictionary
+        like properties for easy access of fields if they have titles.
+
+        Methods(Override):
+          __init__
+          __getitem__
+          __setitem__
+          __delitem__
+    """
+
+
+
+    def __init__(self, fields, fields__title = None):
+
+        """ Initialise with fields data and field title.
+
+            Arguments:
+              fields        : a list containing the data for each field
+                              of this entry
+              fields__title : a list with the titles of each field
+                              (an empty list means there are no titles)
+        """
+
+        self.data = fields
+        if fields__title is not None:
+            self.fields__title = fields__title
+        else:
+            self.fields__title = []
+
+
+
+    def __getitem__(self, x):
+        """ Return the field at integer index x, or the field whose title is x. """
+        if type(x) == types.IntType:
+            return self.data[x]
+        else:
+            return self.data[self.fields__title.index(x)]
+
+
+
+    def __setitem__(self, x, item):
+        """ Store item at integer index x, or in the field whose title is x. """
+        if type(x) == types.IntType:
+            self.data[x] = item
+        else:
+            self.data[self.fields__title.index(x)] = item
+
+
+
+    def __delitem__(self, x):
+        """ Delete the field at integer index x, or the field whose title is x. """
+        if type(x) == types.IntType:
+            del self.data[x]
+        else:
+            del self.data[self.fields__title.index(x)]
+
+
+
+    def __str__(self):
+        """ Return the repr of the underlying field list. """
+        return `self.data`
\ No newline at end of file
diff --git a/INSTALL b/INSTALL index 935f154e..b5568765 100644 --- a/INSTALL +++ b/INSTALL @@ -9,19 +9,22 @@ Optionally packages: Distutils >= 0.8.1 from http://www.python.org/sigs/distutils-sig/ OpenSSL from http://www.openssl.org + Install with Distutils: If you have the Distutils, run "python setup.py install". -How do you run this? Type the three words without the quotes in -a command shell and press Return. Still clueless? Go away. Install without Distutils: Adjust the sys.path.append argument in the file 'linkchecker' to point to the distribution directory. -Now you can run "python linkchecker" to run LinkChecker. +Now you can type "python linkchecker" (or on Unix: just "./linkchecker") to +run LinkChecker. +Running LinkChecker from any directory: +Unix users can put the "linkchecker" script somewhere in a directory in +their $path. For Windows users, I included a batch script 'linkchecker.bat'. You have to adjust the distribution directory in this script to point to the directory where the 'linkchecker' file is. Now you can copy 'linkchecker.bat' in -a directory in your PATH and run it from anywhere. +a directory in your PATH and run it.
diff --git a/README b/README index d7ee0adc..49fbe0f3 100644 --- a/README +++ b/README @@ -5,7 +5,8 @@ With LinkChecker you can check your HTML documents for broken links. Features: o recursive checking o multithreaded -o output can be colored or normal text, HTML, SQL or a GML sitemap graph +o output can be colored or normal text, HTML, SQL, CSV or a GML sitemap + graph o HTTP/1.1, HTTPS, FTP, mailto:, news:, Gopher, Telnet and local file links are supported Javascript links are currently ignored diff --git a/debian/changelog b/debian/changelog index 02b8cd40..177a4eda 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -linkchecker (1.2.3) unstable; urgency=low +linkchecker (1.3.0) unstable; urgency=low * Blacklist output support * typo fix for adjustWinPath @@ -8,8 +8,9 @@ linkchecker (1.2.3) unstable; urgency=low * linkchecker.bat installation support for windows * included test suite in distribution * Improved mailto: link parsing + * CSV output support - -- Bastian Kleineidam Thu, 27 Apr 2000 10:18:52 +0200 + -- Bastian Kleineidam Fri, 28 Apr 2000 12:59:13 +0200 linkchecker (1.2.2) unstable; urgency=low diff --git a/linkcheck/Config.py b/linkcheck/Config.py index a2b4af31..2ccab88f 100644 --- a/linkcheck/Config.py +++ b/linkcheck/Config.py @@ -8,9 +8,10 @@ This module stores import ConfigParser,sys,os,re,UserDict,string from os.path import expanduser,normpath,normcase,join,isfile +from types import StringType import Logging -Version = "1.2.3" +Version = "1.3.0" AppName = "LinkChecker" App = AppName+" "+Version UserAgent = AppName+"/"+Version @@ -32,20 +33,12 @@ Loggers = { "colored": Logging.ColoredLogger, "gml": Logging.GMLLogger, "sql": Logging.SQLLogger, + "csv": Logging.CSVLogger, "blacklist": Logging.BlacklistLogger, } # for easy printing: a comma separated logger list LoggerKeys = reduce(lambda x, y: x+", "+y, Loggers.keys()) -# File output names -FileOutput = { - "text": "linkchecker-out.txt", - "html": "linkchecker-out.html", - 
"colored": "linkchecker-out.asc", - "gml": "linkchecker-out.gml", - "sql": "linkchecker-out.sql" -} - # debug options DebugDelim = "==========================================================\n" DebugFlag = 0 @@ -86,6 +79,14 @@ class Configuration(UserDict.UserDict): self.data["robotstxt"] = 0 self.data["strict"] = 0 self.data["fileoutput"] = [] + self.data["fileoutputnames"] = { + "text": "linkchecker-out.txt", + "html": "linkchecker-out.html", + "colored": "linkchecker-out.asc", + "gml": "linkchecker-out.gml", + "sql": "linkchecker-out.sql", + "csv": "linkchecker-out.csv", + } self.data["quiet"] = 0 self.data["warningregex"] = None self.data["nntpserver"] = os.environ.get("NNTP_SERVER",None) @@ -338,13 +339,20 @@ class Configuration(UserDict.UserDict): except ConfigParser.Error: pass try: self.data["warnings"] = cfgparser.getboolean(section, "warnings") except ConfigParser.Error: pass + try: + filenames = eval(cfgparser.get(section, "fileoutputnames")) # NOTE(review): eval() executes the config text; only safe for trusted config files + for key in filenames.keys(): + if self.data["fileoutputnames"].has_key(key) and \ + type(filenames[key]) == StringType: + self.data["fileoutputnames"][key] = filenames[key] + except ConfigParser.Error: pass try: filelist = string.split(cfgparser.get(section, "fileoutput")) for arg in filelist: # no file output for the blacklist Logger if Loggers.has_key(arg) and arg != "blacklist": self.data["fileoutput"].append(Loggers[arg]( - open(FileOutput[arg], "w"))) + open(self.data["fileoutputnames"][arg], "w"))) except ConfigParser.Error: pass section="checking" diff --git a/linkcheck/Logging.py b/linkcheck/Logging.py index 8fb7d728..4e53c95c 100644 --- a/linkcheck/Logging.py +++ b/linkcheck/Logging.py @@ -425,3 +425,33 @@ class BlacklistLogger: if self.blacklist[url] is None: fd.write(url+"\n") + +class CSVLogger(StandardLogger): + """ CSV output. CSV consists of one line per entry. Entries are + separated by a semicolon. 
+ """ + def init(self): + self.fd.write("# created by "+Config.AppName+" at "+ + _strtime(time.time())+ + "\n# you get "+Config.AppName+" at "+Config.Url+ + "\n# write comments and bugs to "+Config.Email+"\n\n") + self.fd.flush() + + def newUrl(self, urlData): + self.fd.write(`urlData.urlName`+';'+ + `urlData.recursionLevel`+';'+ + `urlData.parentName`+';'+ + `urlData.baseRef`+';'+ + `urlData.errorString`+';'+ + `urlData.validString`+';'+ + `urlData.warningString`+';'+ + `urlData.infoString`+';'+ + `urlData.valid`+';'+ + `urlData.url`+';'+ + `urlData.line`+';'+ + `urlData.cached`+'\n') + self.fd.flush() + + def endOfOutput(self): + self.fd = None + diff --git a/linkcheck/MailtoUrlData.py b/linkcheck/MailtoUrlData.py index ae88bad8..071dcc96 100644 --- a/linkcheck/MailtoUrlData.py +++ b/linkcheck/MailtoUrlData.py @@ -3,29 +3,31 @@ from HostCheckingUrlData import HostCheckingUrlData from smtplib import SMTP from UrlData import LinkCheckerException -mailto_re = re.compile(r"^mailto:" - r"(['\-\w.]+@[\-\w.]+(\?.+)?|" - r"[\w\s]+<['\-\w.]+@[\-\w.]+(\?.+)?>)$") +# regular expression strings +tag_str = r"^mailto:" +adress_str = r"([a-zA-Z]['\-\w.]*)@([\w\-]+(\.[\w\-]+)*))" +complete_adress_str = "("+adress_str+"|[\w\-\s]*<"+adress_str+">)" +suffix_str = r"(\?.+)?" 
+mailto_str = tag_str+complete_adress_str+\ + "(\s*,"+complete_adress_str+")*"+suffix_str + +# compiled +mailto_re = re.compile(mailto_str) +adress_re = re.compile(adress_str) + class MailtoUrlData(HostCheckingUrlData): "Url link with mailto scheme" def buildUrl(self): HostCheckingUrlData.buildUrl(self) - if not mailto_re.match(self.urlName): + mo = mailto_re.match(self.urlName) + if not mo: raise LinkCheckerException, "Illegal mailto link syntax" - self.host = self.urlName[7:] - i = string.find(self.host, "<") - j = string.find(self.host, ">") - if i!=-1 and j!=-1 and i + + + +1 +2 +3 +4 +5 + +3 +6 + + diff --git a/test/test2.html b/test/test2.html index be9fba4c..ec6734f5 100644 --- a/test/test2.html +++ b/test/test2.html @@ -9,11 +9,6 @@ - - - - -