linkchecker/CSV.py

#
# CSV 0.17  8 June 1999    Copyright ©Laurence Tratt 1998 - 1999
# e-mail: tratt@dcs.kcl.ac.uk
# home-page: http://eh.org/~laurie/comp/python/csv/index.html
#
#
#
# CSV.py is copyright ©1998 - 1999 by Laurence Tratt
#
# All rights reserved
#
# Permission to use, copy, modify, and distribute this software and its
# documentation for any purpose and without fee is hereby granted, provided that
# the above copyright notice appear in all copies and that both that copyright
# notice and this permission notice appear in supporting documentation.
#
# THE AUTHOR - LAURENCE TRATT - DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
# NO EVENT SHALL THE AUTHOR FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
# ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTUOUS ACTION, ARISING OUT OF OR
# IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#


import re, string, types, UserList


###################################################################################################
#
# CSV class
#


class CSV(UserList.UserList):

	""" Manage a CSV (comma separated values) file

        The data is held in a list.

        Methods:
          __init__()
          load()    load from file
          save()    save to file
          input()   input from string
          output()  save to string
          append()  appends one entry
          __str__() printable represenation
	"""


	def __init__(self, separator = ','):

		""" Initialise CVS class instance.

            Arguments:
              separator        : The field delimiter. Defaults to ','
        """

		self.separator = separator

		self.data = []
		self.fields__title__have = self.fields__title = None


	def load(self, file__data__name, fields__title__have, convert_numbers = 0, separator = None, comments = None):

		""" Load up a CSV file

            Arguments:
              file__data__name    : The name of the CSV file
              fields__title__have : 0         : file has no title fields
                                    otherwise : file has title fields
              convert_numbers     : 0         : store everything as string's
                                    otherwise : store fields that can be converted
                                                to ints or floats to that Python
                                                type defaults to 0
              separator           : The field delimiter (optional)
              comments            : A list of strings and regular expressions to remove comments
		"""

		file__data = open(file__data__name, 'r')
		self.input(file__data.read(-1), fields__title__have, convert_numbers, separator or self.separator, comments or ["#"])
		file__data.close()


	def save(self, file__data__name, separator = None):

		""" Save data to CSV file.

            Arguments:
              file__data__name : The name of the CSV file to save to
              separator        : The field delimiter (optional)
		"""

		file__data = open(file__data__name, 'w')
		file__data.write(self.output(separator or self.separator))
		file__data.close()


	def input(self, data, fields__title__have, convert_numbers = 0, separator = None, comments = None):

		""" Take wodge of CSV data & convert it into internal format.

            Arguments:
              data                : A string containing the CSV data
              fields__title__have : 0         : file has no title fields
                                    otherwise : file has title fields
              convert_numbers     : 0         : store everything as string's
                                    otherwise : store fields that can be
                                                converted to ints or
                                                floats to that Python type
                                                defaults to 0
              separator           : The field delimiter (Optional)
              comments            : A list of strings and regular expressions to remove comments
                                      (defaults to ["#"])
		"""

		def line__process(line, convert_numbers, separator):

			fields = []
			line__pos = 0

			while line__pos < len(line):

				# Skip any space at the beginning of the field (if there should be leading space,
				#   there should be a " character in the CSV file)

				while line__pos < len(line) and line[line__pos] == " ":
					line__pos = line__pos + 1

				field = ""
				quotes__level = 0
				while line__pos < len(line):

					# Skip space at the end of a field (if there is trailing space, it should be
					#   encompassed by speech marks)

					if quotes__level == 0 and line[line__pos] == " ":
						line__pos__temp = line__pos
						while line__pos__temp < len(line) and line[line__pos__temp] == " ":
							line__pos__temp = line__pos__temp + 1
						if line__pos__temp >= len(line):
							break
						elif line[line__pos__temp : line__pos__temp + len(separator)] == separator:
							line__pos = line__pos__temp
					if quotes__level == 0 and line[line__pos : line__pos + len(separator)] == separator:
						break
					elif line[line__pos] == "\"":
						if quotes__level == 0:
							quotes__level = 1
						else:
							quotes__level = 0
					else:
						field = field + line[line__pos]
					line__pos = line__pos + 1
				line__pos = line__pos + len(separator)
				if convert_numbers:
					for char in field:
						if char not in "0123456789.-":
							fields.append(field)
							break
					else:
						try:
							if "." not in field:
								fields.append(int(field))
							else:
								fields.append(float(field))
						except:
							fields.append(field)
				else:
					fields.append(field)
			if line[-len(separator)] == separator:
				fields.append(field)

			return fields


		separator = separator or self.separator
		comments = comments or ["#"]

		self.fields__title__have = fields__title__have

		# Remove comments from the input file

		comments__strings = []
		for comment in comments:
			if type(comment) == types.InstanceType:
				data = comment.sub("", data)
			elif type(comment) == types.StringType:
				comments__strings.append(comment)
			else:
				raise Exception("Invalid comment type '" + comment + "'")

		lines = map(string.strip, string.split(data, "\n"))

		# Remove all comments that are of type string

		lines__pos = 0
		while lines__pos < len(lines):
			line = lines[lines__pos]
			line__pos = 0
			while line__pos < len(line) and line[line__pos] == " ":
				line__pos = line__pos + 1
			found_comment = 0
			for comment in comments__strings:
				if line__pos + len(comment) < len(line) and line[line__pos : line__pos + len(comment)] == comment:
					found_comment = 1
					break
			if found_comment:
				del lines[lines__pos]
			else:
				lines__pos = lines__pos + 1

		# Process the input data

		if fields__title__have:
			self.fields__title = line__process(lines[0], convert_numbers, separator)
			pos__start = 1
		else:
			self.fields__title = []
			pos__start = 0
		self.data = []
		for line in lines[pos__start : ]:
			if line != "":
				self.data.append(Entry(line__process(line, convert_numbers, separator), self.fields__title))


	def output(self, separator = None):

		""" Convert internal data into CSV string.

            Arguments:
              separator        : The field delimiter (optional)

            Returns:
              String containing CSV data
		"""

		separator = separator or self.separator


		def line__make(entry, separator = separator):

			str = ""
			done__any = 0
			for field in entry:
				if done__any:
					str = str + separator
				else:
					done__any = 1
				if type(field) != types.StringType:
					field = `field`
				if len(field) > 0 and (string.find(field, separator) != -1 or (field[0] == " " or field[-1] == " ")):
					str = str + "\"" + field + "\""
				else:
					str = str + field

			return str


		if self.fields__title__have:
			str = line__make(self.fields__title) + "\n\n"
		else:
			str = ""
		str = str + string.join(map(line__make, self.data), "\n") + "\n"

		return str


	def append(self, entry):

		""" Add an entry. """

		if self.fields__title:
			entry.fields__title = self.fields__title
		self.data.append(entry)


	def field__append(self, func, field__title = None):

		""" Append a field with values specified by a function

            Arguments:
              func         : Function to be called func(entry) to get the value of the new field
              field__title : Name of new field (if applicable)

        """

		for data__pos in range(len(self)):
			entry = self.data[data__pos]
			entry.append(func(entry))
			self.data[data__pos] = entry

		if self.fields__title__have:
			self.fields__title.append(field__title)


	def duplicates__eliminate(self):

		""" Eliminate duplicates (this may result in a reordering of the entries) """

		# To eliminate duplicates, we first get Python to sort the list for us; then all we have to
		#   do is to check to see whether consecutive elements are the same, and delete them
		# This give us O(<sort>) * O(n) rather than the more obvious O(n * n) speed algorithm

		# XXX Could be done more efficiently for multiplicate duplicates by deleting a slice of
		#       similar elements rather than deleting them individually

		self.sort()
		data__pos = 1
		entry__last = self.data[0]
		while data__pos < len(self.data):
			if self.data[data__pos] == entry__last:
				del self.data[data__pos]
			else:
				entry__last = self.data[data__pos]
				data__pos = data__pos + 1


	def __str__(self):

		""" Construct a printable representation of the internal data. """

		columns__width = []

		# Work out the maximum width of each column

		for column in range(len(self.data[0])):
			if self.fields__title__have:
				width = len(`self.fields__title[column]`)
			else:
				width = 0
			for entry in self:
				width__possible = len(`entry.data[column]`)
				if width__possible > width:
					width = width__possible
			columns__width.append(width)

		if self.fields__title__have:
			str = string.join(map(string.ljust, self.fields__title, columns__width), "  ") + "\n\n"
		else:
			str = ""
		for entry in self:
			str = str + string.join(map(string.ljust, map(lambda a : (type(a) == types.StringType and [a] or [eval("`a`")])[0], entry.data), columns__width), "  ") + "\n"

		return str


###################################################################################################
#
# CSV data entry class
#
#


class Entry(UserList.UserList):

	""" CSV data entry, UserList subclass.

        Has the same properties as a list, but has a few dictionary
        like properties for easy access of fields if they have titles.

        Methods(Override):
          __init__
          __getitem__
          __setitem__
          __delitem__
	"""


	def __init__(self, fields, fields__title = None):

		""" Initialise with fields data and field title.

            Arguments:
              fields        : a list containing the data for each field
                              of this entry
              fields__title : a list with the titles of each field
                              (an empty list means there are no titles)
		"""

		self.data = fields
		if fields__title != None:
			self.fields__title = fields__title
		else:
			self.fields__title = []


	def __getitem__(self, x):

		if type(x) == types.IntType:
			return self.data[x]
		else:
			return self.data[self.fields__title.index(x)]


	def __setitem__(self, x, item):

		if type(x) == types.IntType:
			self.data[x] = item
		else:
			self.data[self.fields__title.index(x)] = item


	def __delitem__(self, x):

		if type(x) == types.IntType:
			del self.data[x]
		else:
			del self.data[self.fields__title.index(x)]


	def __str__(self):

		return `self.data`