CSV output

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@76 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2000-04-28 11:17:58 +00:00
parent 7094fe3ec0
commit 461b37ac33
10 changed files with 558 additions and 50 deletions

437
CSV.py Normal file
View file

@ -0,0 +1,437 @@
#
# CSV 0.17 8 June 1999 Copyright ©Laurence Tratt 1998 - 1999
# e-mail: tratt@dcs.kcl.ac.uk
# home-page: http://eh.org/~laurie/comp/python/csv/index.html
#
#
#
# CSV.py is copyright ©1998 - 1999 by Laurence Tratt
#
# All rights reserved
#
# Permission to use, copy, modify, and distribute this software and its
# documentation for any purpose and without fee is hereby granted, provided that
# the above copyright notice appear in all copies and that both that copyright
# notice and this permission notice appear in supporting documentation.
#
# THE AUTHOR - LAURENCE TRATT - DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
# NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
# ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
# IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
import re, string, types, UserList
###################################################################################################
#
# CSV class
#
class CSV(UserList.UserList):
    """ Manage a CSV (comma separated values) file.

        Parsed rows are held in self.data as a list of Entry objects, so a
        CSV instance behaves like a list of entries.

        Methods:
            __init__()              initialise an empty CSV container
            load()                  load from file
            save()                  save to file
            input()                 parse CSV data held in a string
            output()                serialise the data to a CSV string
            append()                append one entry
            field__append()         append a computed field to every entry
            duplicates__eliminate() remove duplicate entries
            __str__()               printable representation
    """

    def __init__(self, separator = ','):
        """ Initialise CSV class instance.
            Arguments:
                separator : The field delimiter. Defaults to ','
        """
        self.separator = separator
        self.data = []
        # Both stay None until input()/load() says whether title fields exist.
        self.fields__title__have = self.fields__title = None

    def load(self, file__data__name, fields__title__have, convert_numbers = 0, separator = None, comments = None):
        """ Load up a CSV file.
            Arguments:
              file__data__name    : The name of the CSV file
              fields__title__have : 0         : file has no title fields
                                    otherwise : file has title fields
              convert_numbers     : 0         : store everything as strings
                                    otherwise : store fields that can be converted
                                                to ints or floats as that Python
                                                type (defaults to 0)
              separator           : The field delimiter (optional)
              comments            : A list of strings and regular expressions that
                                    introduce comments (defaults to ["#"])
        """
        file__data = open(file__data__name, 'r')
        # try/finally so the file handle is released even if parsing fails.
        try:
            self.input(file__data.read(), fields__title__have, convert_numbers,
                       separator or self.separator, comments or ["#"])
        finally:
            file__data.close()

    def save(self, file__data__name, separator = None):
        """ Save data to a CSV file.
            Arguments:
              file__data__name : The name of the CSV file to save to
              separator        : The field delimiter (optional)
        """
        file__data = open(file__data__name, 'w')
        try:
            file__data.write(self.output(separator or self.separator))
        finally:
            file__data.close()

    def input(self, data, fields__title__have, convert_numbers = 0, separator = None, comments = None):
        """ Take a wodge of CSV data & convert it into the internal format.
            Arguments:
              data                : A string containing the CSV data
              fields__title__have : 0         : data has no title fields
                                    otherwise : data has title fields
              convert_numbers     : 0         : store everything as strings
                                    otherwise : store fields that can be converted
                                                to ints or floats as that Python
                                                type (defaults to 0)
              separator           : The field delimiter (optional)
              comments            : A list of strings and regular expressions that
                                    introduce comments (defaults to ["#"])
        """
        def line__process(line, convert_numbers, separator):
            # Split one CSV line into a list of field values.
            fields = []
            line__pos = 0
            while line__pos < len(line):
                # Skip any space at the beginning of the field (significant
                # leading space must be protected by double quotes).
                while line__pos < len(line) and line[line__pos] == " ":
                    line__pos = line__pos + 1
                field = ""
                quotes__level = 0
                while line__pos < len(line):
                    # Skip space at the end of a field (significant trailing
                    # space must be enclosed in double quotes).
                    if quotes__level == 0 and line[line__pos] == " ":
                        line__pos__temp = line__pos
                        while line__pos__temp < len(line) and line[line__pos__temp] == " ":
                            line__pos__temp = line__pos__temp + 1
                        if line__pos__temp >= len(line):
                            break
                        elif line[line__pos__temp : line__pos__temp + len(separator)] == separator:
                            line__pos = line__pos__temp
                    if quotes__level == 0 and line[line__pos : line__pos + len(separator)] == separator:
                        break
                    elif line[line__pos] == "\"":
                        # A quote toggles "inside quotes" mode; the quote
                        # character itself is not stored.
                        if quotes__level == 0:
                            quotes__level = 1
                        else:
                            quotes__level = 0
                    else:
                        field = field + line[line__pos]
                    line__pos = line__pos + 1
                line__pos = line__pos + len(separator)
                if convert_numbers:
                    for char in field:
                        if char not in "0123456789.-":
                            fields.append(field)
                            break
                    else:
                        # Every character looks numeric; attempt a real
                        # conversion and fall back to the raw string.
                        try:
                            if "." not in field:
                                fields.append(int(field))
                            else:
                                fields.append(float(field))
                        except ValueError:
                            fields.append(field)
                else:
                    fields.append(field)
            # BUGFIX: a trailing separator denotes a final empty field.  The
            # old code compared one character against the (possibly multi
            # character) separator and appended a duplicate of the previous
            # field instead of an empty one.
            if line != "" and line[-len(separator):] == separator:
                fields.append("")
            return fields
        separator = separator or self.separator
        comments = comments or ["#"]
        self.fields__title__have = fields__title__have
        # First strip regular-expression style comments from the raw data;
        # remember plain string comment markers for the per-line pass below.
        comments__strings = []
        for comment in comments:
            if type(comment) == types.StringType:
                comments__strings.append(comment)
            elif hasattr(comment, "sub"):
                # Anything with a .sub method is treated as a compiled regular
                # expression.  This is more robust than the old
                # types.InstanceType test, which misses C-implemented
                # pattern objects.
                data = comment.sub("", data)
            else:
                # BUGFIX: the old message concatenated the offending object
                # directly, which raised a TypeError instead of this exception.
                raise Exception("Invalid comment type '" + repr(comment) + "'")
        lines = map(string.strip, string.split(data, "\n"))
        # Remove all lines that start with a string-type comment marker
        lines__pos = 0
        while lines__pos < len(lines):
            line = lines[lines__pos]
            line__pos = 0
            while line__pos < len(line) and line[line__pos] == " ":
                line__pos = line__pos + 1
            found_comment = 0
            for comment in comments__strings:
                # BUGFIX: use <= so that a line consisting of nothing but the
                # comment marker is removed as well (the old < kept it and it
                # then became a bogus data entry).
                if line__pos + len(comment) <= len(line) and line[line__pos : line__pos + len(comment)] == comment:
                    found_comment = 1
                    break
            if found_comment:
                del lines[lines__pos]
            else:
                lines__pos = lines__pos + 1
        # Process the input data
        if fields__title__have and lines:
            self.fields__title = line__process(lines[0], convert_numbers, separator)
            pos__start = 1
        else:
            self.fields__title = []
            pos__start = 0
        self.data = []
        for line in lines[pos__start : ]:
            if line != "":
                self.data.append(Entry(line__process(line, convert_numbers, separator), self.fields__title))

    def output(self, separator = None):
        """ Convert the internal data into a CSV string.
            Arguments:
              separator : The field delimiter (optional)
            Returns:
              String containing the CSV data
        """
        separator = separator or self.separator
        def line__make(entry, separator = separator):
            # Serialise one entry; fields containing the separator or
            # leading/trailing space are wrapped in double quotes.
            # NOTE(review): embedded double quotes are not escaped, so such
            # fields do not round-trip through input() - confirm acceptable.
            result = ""
            done__any = 0
            for field in entry:
                if done__any:
                    result = result + separator
                else:
                    done__any = 1
                if type(field) != types.StringType:
                    field = repr(field)
                if len(field) > 0 and (string.find(field, separator) != -1 or (field[0] == " " or field[-1] == " ")):
                    result = result + "\"" + field + "\""
                else:
                    result = result + field
            return result
        if self.fields__title__have:
            # Title line followed by a blank line (input() skips blank lines).
            result = line__make(self.fields__title) + "\n\n"
        else:
            result = ""
        result = result + string.join(map(line__make, self.data), "\n") + "\n"
        return result

    def append(self, entry):
        """ Add an entry, attaching our field titles to it (if any). """
        if self.fields__title:
            entry.fields__title = self.fields__title
        self.data.append(entry)

    def field__append(self, func, field__title = None):
        """ Append a field with values specified by a function.
            Arguments:
              func         : Called as func(entry) to get the value of the new field
              field__title : Name of the new field (if titles are in use)
        """
        # entry.append mutates the Entry in place; no reassignment needed.
        for entry in self.data:
            entry.append(func(entry))
        if self.fields__title__have:
            self.fields__title.append(field__title)

    def duplicates__eliminate(self):
        """ Eliminate duplicates (this may result in a reordering of the entries). """
        # Sort first so duplicates become adjacent, then delete them in one
        # linear sweep: O(<sort>) + O(n) rather than the naive O(n * n).
        # XXX Could be done more efficiently for multiple duplicates by
        # deleting a slice of equal elements rather than one at a time.
        # BUGFIX: the old version indexed self.data[0] unconditionally and
        # crashed on an empty CSV.
        if not self.data:
            return
        self.sort()
        data__pos = 1
        entry__last = self.data[0]
        while data__pos < len(self.data):
            if self.data[data__pos] == entry__last:
                del self.data[data__pos]
            else:
                entry__last = self.data[data__pos]
                data__pos = data__pos + 1

    def __str__(self):
        """ Construct a printable representation of the internal data. """
        def field__format(a):
            # Strings are shown as-is, everything else via repr().  The old
            # code achieved this with eval("`a`"), which was needlessly slow
            # and unsafe.
            if type(a) == types.StringType:
                return a
            return repr(a)
        # BUGFIX: the old version indexed self.data[0] unconditionally and
        # crashed on a CSV with no entries.
        if not self.data:
            if self.fields__title__have:
                return string.join(map(field__format, self.fields__title), " ") + "\n\n"
            return ""
        columns__width = []
        # Work out the maximum width of each column
        for column in range(len(self.data[0])):
            if self.fields__title__have:
                width = len(repr(self.fields__title[column]))
            else:
                width = 0
            for entry in self:
                width__possible = len(repr(entry.data[column]))
                if width__possible > width:
                    width = width__possible
            columns__width.append(width)
        if self.fields__title__have:
            result = string.join(map(string.ljust, self.fields__title, columns__width), " ") + "\n\n"
        else:
            result = ""
        for entry in self:
            result = result + string.join(map(string.ljust, map(field__format, entry.data), columns__width), " ") + "\n"
        return result
###################################################################################################
#
# CSV data entry class
#
#
class Entry(UserList.UserList):
    """ A single CSV data row, held as a UserList subclass.

        An Entry behaves exactly like a list of field values, but adds a
        little dictionary flavour: when the owning CSV file had title
        fields, a field may also be addressed by its title.

        Methods(Override):
            __init__
            __getitem__
            __setitem__
            __delitem__
    """

    def __init__(self, fields, fields__title = None):
        """ Store the field data and, optionally, the field titles.
            Arguments:
              fields        : a list holding this entry's value for each field
              fields__title : a list with the titles of each field
                              (an empty list means there are no titles)
        """
        self.data = fields
        if fields__title is None:
            self.fields__title = []
        else:
            self.fields__title = fields__title

    def __position(self, x):
        # Integer keys index the data directly; any other key is taken to be
        # a field title and translated into the matching index.
        if type(x) == types.IntType:
            return x
        return self.fields__title.index(x)

    def __getitem__(self, x):
        return self.data[self.__position(x)]

    def __setitem__(self, x, item):
        self.data[self.__position(x)] = item

    def __delitem__(self, x):
        del self.data[self.__position(x)]

    def __str__(self):
        return repr(self.data)

11
INSTALL
View file

@ -9,19 +9,22 @@ Optionally packages:
Distutils >= 0.8.1 from http://www.python.org/sigs/distutils-sig/
OpenSSL from http://www.openssl.org
Install with Distutils:
If you have the Distutils, run "python setup.py install".
How do you run this? Type the three words without the quotes in
a command shell and press Return. Still clueless? Go away.
Install without Distutils:
Adjust the sys.path.append argument in the file 'linkchecker' to point
to the distribution directory.
Now you can run "python linkchecker" to run LinkChecker.
Now you can type "python linkchecker" (or on Unix: just "./linkchecker") to
run LinkChecker.
Running LinkChecker from any directory:
Unix users can put the "linkchecker" script somewhere in a directory in
their $path.
For Windows users, I included a batch script 'linkchecker.bat'. You have to
adjust the distribution directory in this script to point to the directory
where the 'linkchecker' file is. Now you can copy 'linkchecker.bat' in
a directory in your PATH and run it from anywhere.
a directory in your PATH and run it.

3
README
View file

@ -5,7 +5,8 @@ With LinkChecker you can check your HTML documents for broken links.
Features:
o recursive checking
o multithreaded
o output can be colored or normal text, HTML, SQL or a GML sitemap graph
o output can be colored or normal text, HTML, SQL, CSV or a GML sitemap
graph
o HTTP/1.1, HTTPS, FTP, mailto:, news:, Gopher, Telnet and local file links
are supported
Javascript links are currently ignored

5
debian/changelog vendored
View file

@ -1,4 +1,4 @@
linkchecker (1.2.3) unstable; urgency=low
linkchecker (1.3.0) unstable; urgency=low
* Blacklist output support
* typo fix for adjustWinPath
@ -8,8 +8,9 @@ linkchecker (1.2.3) unstable; urgency=low
* linkchecker.bat installation support for windows
* included test suite in distribution
* Improved mailto: link parsing
* CSV output support
-- Bastian Kleineidam <calvin@users.sourceforge.net> Thu, 27 Apr 2000 10:18:52 +0200
-- Bastian Kleineidam <calvin@users.sourceforge.net> Fri, 28 Apr 2000 12:59:13 +0200
linkchecker (1.2.2) unstable; urgency=low

View file

@ -8,9 +8,10 @@ This module stores
import ConfigParser,sys,os,re,UserDict,string
from os.path import expanduser,normpath,normcase,join,isfile
from types import StringType
import Logging
Version = "1.2.3"
Version = "1.3.0"
AppName = "LinkChecker"
App = AppName+" "+Version
UserAgent = AppName+"/"+Version
@ -32,20 +33,12 @@ Loggers = {
"colored": Logging.ColoredLogger,
"gml": Logging.GMLLogger,
"sql": Logging.SQLLogger,
"csv": Logging.CSVLogger,
"blacklist": Logging.BlacklistLogger,
}
# for easy printing: a comma separated logger list
LoggerKeys = reduce(lambda x, y: x+", "+y, Loggers.keys())
# File output names
FileOutput = {
"text": "linkchecker-out.txt",
"html": "linkchecker-out.html",
"colored": "linkchecker-out.asc",
"gml": "linkchecker-out.gml",
"sql": "linkchecker-out.sql"
}
# debug options
DebugDelim = "==========================================================\n"
DebugFlag = 0
@ -86,6 +79,14 @@ class Configuration(UserDict.UserDict):
self.data["robotstxt"] = 0
self.data["strict"] = 0
self.data["fileoutput"] = []
self.data["fileoutputnames"] = {
"text": "linkchecker-out.txt",
"html": "linkchecker-out.html",
"colored": "linkchecker-out.asc",
"gml": "linkchecker-out.gml",
"sql": "linkchecker-out.sql",
"csv": "linkchecker-out.csv",
}
self.data["quiet"] = 0
self.data["warningregex"] = None
self.data["nntpserver"] = os.environ.get("NNTP_SERVER",None)
@ -338,13 +339,20 @@ class Configuration(UserDict.UserDict):
except ConfigParser.Error: pass
try: self.data["warnings"] = cfgparser.getboolean(section, "warnings")
except ConfigParser.Error: pass
try:
filenames = eval(cfgparser.get(section, "fileoutputnames"))
for key in filenames.keys():
if self.data["fileoutputnames"].has_key(key) and \
type(filenames[key]) == StringType:
self.data["fileoutputnames"] = filenames[key]
except ConfigParser.Error: pass
try:
filelist = string.split(cfgparser.get(section, "fileoutput"))
for arg in filelist:
# no file output for the blacklist Logger
if Loggers.has_key(arg) and arg != "blacklist":
self.data["fileoutput"].append(Loggers[arg](
open(FileOutput[arg], "w")))
open(self.data["fileoutputnames"][arg], "w")))
except ConfigParser.Error: pass
section="checking"

View file

@ -425,3 +425,33 @@ class BlacklistLogger:
if self.blacklist[url] is None:
fd.write(url+"\n")
class CSVLogger(StandardLogger):
""" CSV output. CSV consists of one line per entry. Entries are
separated by a semicolon.
"""
def init(self):
self.fd.write("# created by "+Config.AppName+" at "+
_strtime(time.time())+
"\n# you get "+Config.AppName+" at "+Config.Url+
"\n# write comments and bugs to "+Config.Email+"\n\n")
self.fd.flush()
def newUrl(self, urlData):
self.fd.write(`urlData.urlName`+';'+
`urlData.recursionLevel`+';'+
`urlData.parentName`+';'+
`urlData.baseRef`+';'+
`urlData.errorString`+';'+
`urlData.validString`+';'+
`urlData.warningString`+';'+
`urlData.infoString`+';'+
`urlData.valid`+';'+
`urlData.url`+';'+
`urlData.line`+';'+
`urlData.cached`+'\n')
self.fd.flush()
def endOfOutput(self):
self.fd = None

View file

@ -3,29 +3,31 @@ from HostCheckingUrlData import HostCheckingUrlData
from smtplib import SMTP
from UrlData import LinkCheckerException
mailto_re = re.compile(r"^mailto:"
r"(['\-\w.]+@[\-\w.]+(\?.+)?|"
r"[\w\s]+<['\-\w.]+@[\-\w.]+(\?.+)?>)$")
# regular expression strings
tag_str = r"^mailto:"
adress_str = r"([a-zA-Z]['\-\w.]*)@([\w\-]+(\.[\w\-]+)*))"
complete_adress_str = "("+adress_str+"|[\w\-\s]*<"+adress_str+">)"
suffix_str = r"(\?.+)?"
mailto_str = tag_str+complete_adress_str+\
"(\s*,"+complete_adress_str+")*"+suffix_str
# compiled
mailto_re = re.compile(mailto_str)
adress_re = re.compile(adress_str)
class MailtoUrlData(HostCheckingUrlData):
"Url link with mailto scheme"
def buildUrl(self):
HostCheckingUrlData.buildUrl(self)
if not mailto_re.match(self.urlName):
mo = mailto_re.match(self.urlName)
if not mo:
raise LinkCheckerException, "Illegal mailto link syntax"
self.host = self.urlName[7:]
i = string.find(self.host, "<")
j = string.find(self.host, ">")
if i!=-1 and j!=-1 and i<j:
self.host = self.host[i+1:j]
i = string.find(self.host, "@")
self.user = self.host[:i]
self.host = self.host[(i+1):]
i = string.find(self.host, "?")
if i!=-1:
self.host = self.host[:i]
self.host = string.lower(self.host)
# do not lower the user name
self.adresses = re.findall(adress_re, self.urlName)
Config.debug(str(self.adresses))
raise Exception, "Nix"
self.host = None
self.user = None
def checkConnection(self, config):
DNS.ParseResolvConf()

View file

@ -4,9 +4,15 @@ from distutils.dist import Distribution
from Template import Template
import sys
# Hack for linkchecker.bat
# Autodetect the existence of an SSL library (this is pretty shitty)
# Autodetect Windows platforms to include the linkchecker.bat script
class LCDistribution(Distribution):
def run_commands (self):
if self.has_ssl():
self.ext_modules = [('ssl', {'sources': ['ssl.c'],
'include_dirs': ['/usr/include/openssl'],
'library_dirs': ['/usr/lib'],
'libs': ['ssl']})]
if sys.platform=='win32':
inst = self.find_command_obj("install")
inst.ensure_ready()
@ -18,23 +24,34 @@ class LCDistribution(Distribution):
for cmd in self.commands:
self.run_command (cmd)
def has_ssl(self):
return 1
setup (name = "linkchecker",
version = "1.2.3",
version = "1.3.0",
description = "check links of HTML pages",
author = "Bastian Kleineidam",
author_email = "calvin@users.sourceforge.net",
url = "http://linkchecker.sourceforge.net/",
licence = "GPL",
long_description =
"""With LinkChecker you can check your HTML documents for broken links.
Features:
o recursive checking
o multithreaded
o output can be colored or normal text, HTML, SQL, CSV or a GML sitemap
graph
o HTTP/1.1, HTTPS, FTP, mailto:, news:, Gopher, Telnet and local file links
are supported.
Javascript links are currently ignored
o restrict link checking to your local domain
o HTTP proxy support
o give username/password for HTTP and FTP authorization
o robots.txt exclusion protocol support
"""
distclass = LCDistribution,
packages = ['','DNS','linkcheck'],
# uncomment ext_modules to enable HTTPS support
# you must have an SSL library and the Python header
# files installed
ext_modules = [('ssl', {'sources': ['ssl.c'],
'include_dirs': ['/usr/include/openssl'],
'library_dirs': ['/usr/lib'],
'libs': ['ssl']})],
scripts = ['linkchecker'],
)
)

14
test/mail.html Normal file
View file

@ -0,0 +1,14 @@
<!-- extra mail checking -->
<html><head></head>
<body>
<!-- legal -->
<a href=mailto:calvin@localhost?subject=Hallo!%%&to=Pfuscher>1</a>
<a href="mailto:Dude <calvin@studcs.uni-sb.de> , Killer <calvin@cs.uni-sb.de>?subject=bla">2</a>
<a href="mailto:Bastian Kleineidam <calvin@host1>?foo=bar">3</a>
<a href="mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>">4</a>
<a href="mailto:o'hara@doctor.fraggle-hause?subject=äöü">5</a>
<!-- illegal -->
<a href="mailto:Bastian Kleineidam <calvin@host1?foo=bar>">3</a>
<a href="mailto:">6</a>
</body>
</html>

View file

@ -9,11 +9,6 @@
<a href="test1.html">
<a href="test1.html#isnix">
<a href="test1.html#iswas">
<a href=mailto:calvin@localhost?subject=Hallo!%%&to=Pfuscher>
<a href="mailto:Bastian Kleineidam <calvin@host1?foo=bar>">
<a href="mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>">
<a href="mailto:o'hara@doctor.fraggle-hause?subject=äöü">
<a href="mailto:">
<a href="telnet:localhost">
<a href="telnet:">
<a href="ftp:/treasure.calvinsplayground.de/pub">