CSV output

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@76 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2000-04-28 11:17:58 +00:00
parent 7094fe3ec0
commit 461b37ac33
10 changed files with 558 additions and 50 deletions

437
CSV.py Normal file
View file

@ -0,0 +1,437 @@
#
# CSV 0.17 8 June 1999 Copyright ©Laurence Tratt 1998 - 1999
# e-mail: tratt@dcs.kcl.ac.uk
# home-page: http://eh.org/~laurie/comp/python/csv/index.html
#
#
#
# CSV.py is copyright ©1998 - 1999 by Laurence Tratt
#
# All rights reserved
#
# Permission to use, copy, modify, and distribute this software and its
# documentation for any purpose and without fee is hereby granted, provided that
# the above copyright notice appear in all copies and that both that copyright
# notice and this permission notice appear in supporting documentation.
#
# THE AUTHOR - LAURENCE TRATT - DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
# NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
# ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
# IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
import re, string, types, UserList
###################################################################################################
#
# CSV class
#
class CSV(UserList.UserList):
    """ Manage a CSV (comma separated values) file.

        Parsed rows are held in self.data as a list of Entry objects, so a
        CSV instance behaves like a list of entries.

        Methods:
            __init__()              initialise an empty CSV container
            load()                  load from file
            save()                  save to file
            input()                 parse CSV data held in a string
            output()                serialise the data to a CSV string
            append()                append one entry
            field__append()         append a computed field to every entry
            duplicates__eliminate() remove duplicate entries
            __str__()               printable representation
    """

    def __init__(self, separator = ','):
        """ Initialise CSV class instance.
            Arguments:
                separator : The field delimiter. Defaults to ','
        """
        self.separator = separator
        self.data = []
        # Both stay None until input()/load() says whether title fields exist.
        self.fields__title__have = self.fields__title = None

    def load(self, file__data__name, fields__title__have, convert_numbers = 0, separator = None, comments = None):
        """ Load up a CSV file.
            Arguments:
              file__data__name    : The name of the CSV file
              fields__title__have : 0         : file has no title fields
                                    otherwise : file has title fields
              convert_numbers     : 0         : store everything as strings
                                    otherwise : store fields that can be converted
                                                to ints or floats as that Python
                                                type (defaults to 0)
              separator           : The field delimiter (optional)
              comments            : A list of strings and regular expressions that
                                    introduce comments (defaults to ["#"])
        """
        file__data = open(file__data__name, 'r')
        # try/finally so the file handle is released even if parsing fails.
        try:
            self.input(file__data.read(), fields__title__have, convert_numbers,
                       separator or self.separator, comments or ["#"])
        finally:
            file__data.close()

    def save(self, file__data__name, separator = None):
        """ Save data to a CSV file.
            Arguments:
              file__data__name : The name of the CSV file to save to
              separator        : The field delimiter (optional)
        """
        file__data = open(file__data__name, 'w')
        try:
            file__data.write(self.output(separator or self.separator))
        finally:
            file__data.close()

    def input(self, data, fields__title__have, convert_numbers = 0, separator = None, comments = None):
        """ Take a wodge of CSV data & convert it into the internal format.
            Arguments:
              data                : A string containing the CSV data
              fields__title__have : 0         : data has no title fields
                                    otherwise : data has title fields
              convert_numbers     : 0         : store everything as strings
                                    otherwise : store fields that can be converted
                                                to ints or floats as that Python
                                                type (defaults to 0)
              separator           : The field delimiter (optional)
              comments            : A list of strings and regular expressions that
                                    introduce comments (defaults to ["#"])
        """
        def line__process(line, convert_numbers, separator):
            # Split one CSV line into a list of field values.
            fields = []
            line__pos = 0
            while line__pos < len(line):
                # Skip any space at the beginning of the field (significant
                # leading space must be protected by double quotes).
                while line__pos < len(line) and line[line__pos] == " ":
                    line__pos = line__pos + 1
                field = ""
                quotes__level = 0
                while line__pos < len(line):
                    # Skip space at the end of a field (significant trailing
                    # space must be enclosed in double quotes).
                    if quotes__level == 0 and line[line__pos] == " ":
                        line__pos__temp = line__pos
                        while line__pos__temp < len(line) and line[line__pos__temp] == " ":
                            line__pos__temp = line__pos__temp + 1
                        if line__pos__temp >= len(line):
                            break
                        elif line[line__pos__temp : line__pos__temp + len(separator)] == separator:
                            line__pos = line__pos__temp
                    if quotes__level == 0 and line[line__pos : line__pos + len(separator)] == separator:
                        break
                    elif line[line__pos] == "\"":
                        # A quote toggles "inside quotes" mode; the quote
                        # character itself is not stored.
                        if quotes__level == 0:
                            quotes__level = 1
                        else:
                            quotes__level = 0
                    else:
                        field = field + line[line__pos]
                    line__pos = line__pos + 1
                line__pos = line__pos + len(separator)
                if convert_numbers:
                    for char in field:
                        if char not in "0123456789.-":
                            fields.append(field)
                            break
                    else:
                        # Every character looks numeric; attempt a real
                        # conversion and fall back to the raw string.
                        try:
                            if "." not in field:
                                fields.append(int(field))
                            else:
                                fields.append(float(field))
                        except ValueError:
                            fields.append(field)
                else:
                    fields.append(field)
            # BUGFIX: a trailing separator denotes a final empty field.  The
            # old code compared one character against the (possibly multi
            # character) separator and appended a duplicate of the previous
            # field instead of an empty one.
            if line != "" and line[-len(separator):] == separator:
                fields.append("")
            return fields
        separator = separator or self.separator
        comments = comments or ["#"]
        self.fields__title__have = fields__title__have
        # First strip regular-expression style comments from the raw data;
        # remember plain string comment markers for the per-line pass below.
        comments__strings = []
        for comment in comments:
            if type(comment) == types.StringType:
                comments__strings.append(comment)
            elif hasattr(comment, "sub"):
                # Anything with a .sub method is treated as a compiled regular
                # expression.  This is more robust than the old
                # types.InstanceType test, which misses C-implemented
                # pattern objects.
                data = comment.sub("", data)
            else:
                # BUGFIX: the old message concatenated the offending object
                # directly, which raised a TypeError instead of this exception.
                raise Exception("Invalid comment type '" + repr(comment) + "'")
        lines = map(string.strip, string.split(data, "\n"))
        # Remove all lines that start with a string-type comment marker
        lines__pos = 0
        while lines__pos < len(lines):
            line = lines[lines__pos]
            line__pos = 0
            while line__pos < len(line) and line[line__pos] == " ":
                line__pos = line__pos + 1
            found_comment = 0
            for comment in comments__strings:
                # BUGFIX: use <= so that a line consisting of nothing but the
                # comment marker is removed as well (the old < kept it and it
                # then became a bogus data entry).
                if line__pos + len(comment) <= len(line) and line[line__pos : line__pos + len(comment)] == comment:
                    found_comment = 1
                    break
            if found_comment:
                del lines[lines__pos]
            else:
                lines__pos = lines__pos + 1
        # Process the input data
        if fields__title__have and lines:
            self.fields__title = line__process(lines[0], convert_numbers, separator)
            pos__start = 1
        else:
            self.fields__title = []
            pos__start = 0
        self.data = []
        for line in lines[pos__start : ]:
            if line != "":
                self.data.append(Entry(line__process(line, convert_numbers, separator), self.fields__title))

    def output(self, separator = None):
        """ Convert the internal data into a CSV string.
            Arguments:
              separator : The field delimiter (optional)
            Returns:
              String containing the CSV data
        """
        separator = separator or self.separator
        def line__make(entry, separator = separator):
            # Serialise one entry; fields containing the separator or
            # leading/trailing space are wrapped in double quotes.
            # NOTE(review): embedded double quotes are not escaped, so such
            # fields do not round-trip through input() - confirm acceptable.
            result = ""
            done__any = 0
            for field in entry:
                if done__any:
                    result = result + separator
                else:
                    done__any = 1
                if type(field) != types.StringType:
                    field = repr(field)
                if len(field) > 0 and (string.find(field, separator) != -1 or (field[0] == " " or field[-1] == " ")):
                    result = result + "\"" + field + "\""
                else:
                    result = result + field
            return result
        if self.fields__title__have:
            # Title line followed by a blank line (input() skips blank lines).
            result = line__make(self.fields__title) + "\n\n"
        else:
            result = ""
        result = result + string.join(map(line__make, self.data), "\n") + "\n"
        return result

    def append(self, entry):
        """ Add an entry, attaching our field titles to it (if any). """
        if self.fields__title:
            entry.fields__title = self.fields__title
        self.data.append(entry)

    def field__append(self, func, field__title = None):
        """ Append a field with values specified by a function.
            Arguments:
              func         : Called as func(entry) to get the value of the new field
              field__title : Name of the new field (if titles are in use)
        """
        # entry.append mutates the Entry in place; no reassignment needed.
        for entry in self.data:
            entry.append(func(entry))
        if self.fields__title__have:
            self.fields__title.append(field__title)

    def duplicates__eliminate(self):
        """ Eliminate duplicates (this may result in a reordering of the entries). """
        # Sort first so duplicates become adjacent, then delete them in one
        # linear sweep: O(<sort>) + O(n) rather than the naive O(n * n).
        # XXX Could be done more efficiently for multiple duplicates by
        # deleting a slice of equal elements rather than one at a time.
        # BUGFIX: the old version indexed self.data[0] unconditionally and
        # crashed on an empty CSV.
        if not self.data:
            return
        self.sort()
        data__pos = 1
        entry__last = self.data[0]
        while data__pos < len(self.data):
            if self.data[data__pos] == entry__last:
                del self.data[data__pos]
            else:
                entry__last = self.data[data__pos]
                data__pos = data__pos + 1

    def __str__(self):
        """ Construct a printable representation of the internal data. """
        def field__format(a):
            # Strings are shown as-is, everything else via repr().  The old
            # code achieved this with eval("`a`"), which was needlessly slow
            # and unsafe.
            if type(a) == types.StringType:
                return a
            return repr(a)
        # BUGFIX: the old version indexed self.data[0] unconditionally and
        # crashed on a CSV with no entries.
        if not self.data:
            if self.fields__title__have:
                return string.join(map(field__format, self.fields__title), " ") + "\n\n"
            return ""
        columns__width = []
        # Work out the maximum width of each column
        for column in range(len(self.data[0])):
            if self.fields__title__have:
                width = len(repr(self.fields__title[column]))
            else:
                width = 0
            for entry in self:
                width__possible = len(repr(entry.data[column]))
                if width__possible > width:
                    width = width__possible
            columns__width.append(width)
        if self.fields__title__have:
            result = string.join(map(string.ljust, self.fields__title, columns__width), " ") + "\n\n"
        else:
            result = ""
        for entry in self:
            result = result + string.join(map(string.ljust, map(field__format, entry.data), columns__width), " ") + "\n"
        return result
###################################################################################################
#
# CSV data entry class
#
#
class Entry(UserList.UserList):
    """ A single CSV data row, held as a UserList subclass.

        An Entry behaves exactly like a list of field values, but adds a
        little dictionary flavour: when the owning CSV file had title
        fields, a field may also be addressed by its title.

        Methods(Override):
            __init__
            __getitem__
            __setitem__
            __delitem__
    """

    def __init__(self, fields, fields__title = None):
        """ Store the field data and, optionally, the field titles.
            Arguments:
              fields        : a list holding this entry's value for each field
              fields__title : a list with the titles of each field
                              (an empty list means there are no titles)
        """
        self.data = fields
        if fields__title is None:
            self.fields__title = []
        else:
            self.fields__title = fields__title

    def __position(self, x):
        # Integer keys index the data directly; any other key is taken to be
        # a field title and translated into the matching index.
        if type(x) == types.IntType:
            return x
        return self.fields__title.index(x)

    def __getitem__(self, x):
        return self.data[self.__position(x)]

    def __setitem__(self, x, item):
        self.data[self.__position(x)] = item

    def __delitem__(self, x):
        del self.data[self.__position(x)]

    def __str__(self):
        return repr(self.data)

11
INSTALL
View file

@ -9,19 +9,22 @@ Optionally packages:
Distutils >= 0.8.1 from http://www.python.org/sigs/distutils-sig/
OpenSSL from http://www.openssl.org
Install with Distutils:
If you have the Distutils, run "python setup.py install".
How do you run this? Type the three words without the quotes in
a command shell and press Return. Still clueless? Go away.
Install without Distutils:
Adjust the sys.path.append argument in the file 'linkchecker' to point
to the distribution directory.
Now you can run "python linkchecker" to run LinkChecker.
Now you can type "python linkchecker" (or on Unix: just "./linkchecker") to
run LinkChecker.
Running LinkChecker from any directory:
Unix users can put the "linkchecker" script somewhere in a directory in
their $path.
For Windows users, I included a batch script 'linkchecker.bat'. You have to
adjust the distribution directory in this script to point to the directory
where the 'linkchecker' file is. Now you can copy 'linkchecker.bat' in
a directory in your PATH and run it from anywhere.
a directory in your PATH and run it.

3
README
View file

@ -5,7 +5,8 @@ With LinkChecker you can check your HTML documents for broken links.
Features:
o recursive checking
o multithreaded
o output can be colored or normal text, HTML, SQL or a GML sitemap graph
o output can be colored or normal text, HTML, SQL, CSV or a GML sitemap
graph
o HTTP/1.1, HTTPS, FTP, mailto:, news:, Gopher, Telnet and local file links
are supported
Javascript links are currently ignored

5
debian/changelog vendored
View file

@ -1,4 +1,4 @@
linkchecker (1.2.3) unstable; urgency=low
linkchecker (1.3.0) unstable; urgency=low
* Blacklist output support
* typo fix for adjustWinPath
@ -8,8 +8,9 @@ linkchecker (1.2.3) unstable; urgency=low
* linkchecker.bat installation support for windows
* included test suite in distribution
* Improved mailto: link parsing
* CSV output support
-- Bastian Kleineidam <calvin@users.sourceforge.net> Thu, 27 Apr 2000 10:18:52 +0200
-- Bastian Kleineidam <calvin@users.sourceforge.net> Fri, 28 Apr 2000 12:59:13 +0200
linkchecker (1.2.2) unstable; urgency=low

View file

@ -8,9 +8,10 @@ This module stores
import ConfigParser,sys,os,re,UserDict,string
from os.path import expanduser,normpath,normcase,join,isfile
from types import StringType
import Logging
Version = "1.2.3"
Version = "1.3.0"
AppName = "LinkChecker"
App = AppName+" "+Version
UserAgent = AppName+"/"+Version
@ -32,20 +33,12 @@ Loggers = {
"colored": Logging.ColoredLogger,
"gml": Logging.GMLLogger,
"sql": Logging.SQLLogger,
"csv": Logging.CSVLogger,
"blacklist": Logging.BlacklistLogger,
}
# for easy printing: a comma separated logger list
LoggerKeys = reduce(lambda x, y: x+", "+y, Loggers.keys())
# File output names
FileOutput = {
"text": "linkchecker-out.txt",
"html": "linkchecker-out.html",
"colored": "linkchecker-out.asc",
"gml": "linkchecker-out.gml",
"sql": "linkchecker-out.sql"
}
# debug options
DebugDelim = "==========================================================\n"
DebugFlag = 0
@ -86,6 +79,14 @@ class Configuration(UserDict.UserDict):
self.data["robotstxt"] = 0
self.data["strict"] = 0
self.data["fileoutput"] = []
self.data["fileoutputnames"] = {
"text": "linkchecker-out.txt",
"html": "linkchecker-out.html",
"colored": "linkchecker-out.asc",
"gml": "linkchecker-out.gml",
"sql": "linkchecker-out.sql",
"csv": "linkchecker-out.csv",
}
self.data["quiet"] = 0
self.data["warningregex"] = None
self.data["nntpserver"] = os.environ.get("NNTP_SERVER",None)
@ -338,13 +339,20 @@ class Configuration(UserDict.UserDict):
except ConfigParser.Error: pass
try: self.data["warnings"] = cfgparser.getboolean(section, "warnings")
except ConfigParser.Error: pass
try:
filenames = eval(cfgparser.get(section, "fileoutputnames"))
for key in filenames.keys():
if self.data["fileoutputnames"].has_key(key) and \
type(filenames[key]) == StringType:
self.data["fileoutputnames"] = filenames[key]
except ConfigParser.Error: pass
try:
filelist = string.split(cfgparser.get(section, "fileoutput"))
for arg in filelist:
# no file output for the blacklist Logger
if Loggers.has_key(arg) and arg != "blacklist":
self.data["fileoutput"].append(Loggers[arg](
open(FileOutput[arg], "w")))
open(self.data["fileoutputnames"][arg], "w")))
except ConfigParser.Error: pass
section="checking"

View file

@ -425,3 +425,33 @@ class BlacklistLogger:
if self.blacklist[url] is None:
fd.write(url+"\n")
class CSVLogger(StandardLogger):
""" CSV output. CSV consists of one line per entry. Entries are
separated by a semicolon.
"""
def init(self):
self.fd.write("# created by "+Config.AppName+" at "+
_strtime(time.time())+
"\n# you get "+Config.AppName+" at "+Config.Url+
"\n# write comments and bugs to "+Config.Email+"\n\n")
self.fd.flush()
def newUrl(self, urlData):
self.fd.write(`urlData.urlName`+';'+
`urlData.recursionLevel`+';'+
`urlData.parentName`+';'+
`urlData.baseRef`+';'+
`urlData.errorString`+';'+
`urlData.validString`+';'+
`urlData.warningString`+';'+
`urlData.infoString`+';'+
`urlData.valid`+';'+
`urlData.url`+';'+
`urlData.line`+';'+
`urlData.cached`+'\n')
self.fd.flush()
def endOfOutput(self):
self.fd = None

View file

@ -3,29 +3,31 @@ from HostCheckingUrlData import HostCheckingUrlData
from smtplib import SMTP
from UrlData import LinkCheckerException
mailto_re = re.compile(r"^mailto:"
r"(['\-\w.]+@[\-\w.]+(\?.+)?|"
r"[\w\s]+<['\-\w.]+@[\-\w.]+(\?.+)?>)$")
# regular expression strings
tag_str = r"^mailto:"
adress_str = r"([a-zA-Z]['\-\w.]*)@([\w\-]+(\.[\w\-]+)*))"
complete_adress_str = "("+adress_str+"|[\w\-\s]*<"+adress_str+">)"
suffix_str = r"(\?.+)?"
mailto_str = tag_str+complete_adress_str+\
"(\s*,"+complete_adress_str+")*"+suffix_str
# compiled
mailto_re = re.compile(mailto_str)
adress_re = re.compile(adress_str)
class MailtoUrlData(HostCheckingUrlData):
"Url link with mailto scheme"
def buildUrl(self):
HostCheckingUrlData.buildUrl(self)
if not mailto_re.match(self.urlName):
mo = mailto_re.match(self.urlName)
if not mo:
raise LinkCheckerException, "Illegal mailto link syntax"
self.host = self.urlName[7:]
i = string.find(self.host, "<")
j = string.find(self.host, ">")
if i!=-1 and j!=-1 and i<j:
self.host = self.host[i+1:j]
i = string.find(self.host, "@")
self.user = self.host[:i]
self.host = self.host[(i+1):]
i = string.find(self.host, "?")
if i!=-1:
self.host = self.host[:i]
self.host = string.lower(self.host)
# do not lower the user name
self.adresses = re.findall(adress_re, self.urlName)
Config.debug(str(self.adresses))
raise Exception, "Nix"
self.host = None
self.user = None
def checkConnection(self, config):
DNS.ParseResolvConf()

View file

@ -4,9 +4,15 @@ from distutils.dist import Distribution
from Template import Template
import sys
# Hack for linkchecker.bat
# Autodetect the existence of an SSL library (this is pretty shitty)
# Autodetect Windows platforms to include the linkchecker.bat script
class LCDistribution(Distribution):
def run_commands (self):
if self.has_ssl():
self.ext_modules = [('ssl', {'sources': ['ssl.c'],
'include_dirs': ['/usr/include/openssl'],
'library_dirs': ['/usr/lib'],
'libs': ['ssl']})]
if sys.platform=='win32':
inst = self.find_command_obj("install")
inst.ensure_ready()
@ -18,23 +24,34 @@ class LCDistribution(Distribution):
for cmd in self.commands:
self.run_command (cmd)
def has_ssl(self):
return 1
setup (name = "linkchecker",
version = "1.2.3",
version = "1.3.0",
description = "check links of HTML pages",
author = "Bastian Kleineidam",
author_email = "calvin@users.sourceforge.net",
url = "http://linkchecker.sourceforge.net/",
licence = "GPL",
long_description =
"""With LinkChecker you can check your HTML documents for broken links.
Features:
o recursive checking
o multithreaded
o output can be colored or normal text, HTML, SQL, CSV or a GML sitemap
graph
o HTTP/1.1, HTTPS, FTP, mailto:, news:, Gopher, Telnet and local file links
are supported.
Javascript links are currently ignored
o restrict link checking to your local domain
o HTTP proxy support
o give username/password for HTTP and FTP authorization
o robots.txt exclusion protocol support
"""
distclass = LCDistribution,
packages = ['','DNS','linkcheck'],
# uncomment ext_modules to enable HTTPS support
# you must have an SSL library and the Python header
# files installed
ext_modules = [('ssl', {'sources': ['ssl.c'],
'include_dirs': ['/usr/include/openssl'],
'library_dirs': ['/usr/lib'],
'libs': ['ssl']})],
scripts = ['linkchecker'],
)
)

14
test/mail.html Normal file
View file

@ -0,0 +1,14 @@
<!-- extra mail checking -->
<html><head></head>
<body>
<!-- legal -->
<a href=mailto:calvin@localhost?subject=Hallo!%%&to=Pfuscher>1</a>
<a href="mailto:Dude <calvin@studcs.uni-sb.de> , Killer <calvin@cs.uni-sb.de>?subject=bla">2</a>
<a href="mailto:Bastian Kleineidam <calvin@host1>?foo=bar">3</a>
<a href="mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>">4</a>
<a href="mailto:o'hara@doctor.fraggle-hause?subject=äöü">5</a>
<!-- illegal -->
<a href="mailto:Bastian Kleineidam <calvin@host1?foo=bar>">3</a>
<a href="mailto:">6</a>
</body>
</html>

View file

@ -9,11 +9,6 @@
<a href="test1.html">
<a href="test1.html#isnix">
<a href="test1.html#iswas">
<a href=mailto:calvin@localhost?subject=Hallo!%%&to=Pfuscher>
<a href="mailto:Bastian Kleineidam <calvin@host1?foo=bar>">
<a href="mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>">
<a href="mailto:o'hara@doctor.fraggle-hause?subject=äöü">
<a href="mailto:">
<a href="telnet:localhost">
<a href="telnet:">
<a href="ftp:/treasure.calvinsplayground.de/pub">