diff --git a/linkcheck/AnsiColor.py b/linkcheck/AnsiColor.py deleted file mode 100644 index 743e1066..00000000 --- a/linkcheck/AnsiColor.py +++ /dev/null @@ -1,83 +0,0 @@ -# -*- coding: iso-8859-1 -*- -"""ANSI Color definitions and functions""" -# Copyright (C) 2000-2004 Bastian Kleineidam -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - -import os -import sys - -# Escape for ANSI colors -AnsiEsc = "\x1b[%sm" - -# type numbers -AnsiType = { - 'bold': '1', - 'light': '2', - 'blink': '5', - 'invert': '7', -} - -# color numbers (the capitalized colors are bright) -AnsiColor = { - 'default': '0', - 'black': '30', - 'red': '31', - 'green': '32', - 'yellow': '33', - 'blue': '34', - 'purple': '35', - 'cyan': '36', - 'white': '37', - 'Black': '40', - 'Red': '41', - 'Green': '42', - 'Yellow': '43', - 'Blue': '44', - 'Purple': '45', - 'Cyan': '46', - 'White': '47', -} - - -def esc_ansicolor (color): - """convert a named color definition to an escaped ANSI color""" - ctype = '' - if ";" in color: - ctype, color = color.split(";", 1) - if not AnsiType.has_key(ctype): - print >>sys.stderr, "invalid ANSI color type", repr(ctype) - print >>sys.stderr, "valid values are", AnsiType.keys() - ctype = '' - else: - ctype = AnsiType[ctype]+";" - if not AnsiColor.has_key(color): - print >>sys.stderr, "invalid ANSI color name", repr(color) - print >>sys.stderr, "valid values are", AnsiColor.keys() - cnum = '0' - else: - cnum = AnsiColor[color] - return AnsiEsc % (ctype+cnum) - -AnsiReset = esc_ansicolor("default") - - -def colorize (text, color=None): - "return text colorized if TERM is set" - if (color is not None) and os.environ.get('TERM'): - color = esc_ansicolor(color) - return '%s%s%s' % (color, text, AnsiReset) - else: - return text diff --git a/linkcheck/Config.py b/linkcheck/Config.py index 25f1f15a..22173ecf 100644 --- a/linkcheck/Config.py +++ b/linkcheck/Config.py @@ -24,10 +24,11 @@ import Cookie import sets import urllib import _linkchecker_configdata +import bk +import bk.containers import linkcheck import linkcheck.i18n -import linkcheck.log - +import linkcheck.Threader try: import threading except ImportError: @@ -69,7 +70,7 @@ def _check_morsel (m, host, path): return None # check expiry date (if its stored) if m["expires"]: - debug(BRING_IT_ON, "Cookie expires", m["expires"]) + bk.log.debug(linkcheck.LOG_CHECK, "Cookie expires", m["expires"]) # XXX return m.output(header='').strip() @@ -90,7 +91,6 @@ class Configuration (dict): # reduceThreads(). Ok, this is a hack but ItWorksForMe(tm). self.reduceCount = 0 - def reset (self): """Reset to default values""" self['linknumber'] = 0 @@ -174,55 +174,47 @@ class Configuration (dict): self.setThreads(10) self.urlSeen = sets.Set() self.urlSeenLock = threading.Lock() - self.urlCache = linkcheck.containers.LRU(MAX_URL_CACHE) + self.urlCache = bk.containers.LRU(MAX_URL_CACHE) self.urlCacheLock = threading.Lock() - self.robotsTxtCache = linkcheck.containers.LRU(MAX_ROBOTS_TXT_CACHE) + self.robotsTxtCache = bk.containers.LRU(MAX_ROBOTS_TXT_CACHE) self.robotsTxtCacheLock = threading.Lock() self.urls = [] self.urlCounter = 0 self.urlsLock = threading.Lock() # basic data lock (eg for cookies, link numbers etc.) self.dataLock = threading.Lock() - self.cookies = linkcheck.containers.LRU(MAX_COOKIES_CACHE) - + self.cookies = bk.containers.LRU(MAX_COOKIES_CACHE) def setThreads (self, num): - debug(HURT_ME_PLENTY, "set threading with %d threads"%num) + bk.log.debug(linkcheck.LOG_CHECK, "set threading with %d threads"%num) self.threader.threads_max = num if num>0: sys.setcheckinterval(50) else: sys.setcheckinterval(100) - def newLogger (self, logtype, dict={}): args = {} args.update(self[logtype]) args.update(dict) - from linkcheck.log import Loggers - return Loggers[logtype](**args) - + return linkcheck.Loggers[logtype](**args) def addLogger(self, logtype, loggerClass, logargs={}): "add a new logger type" - from linkcheck.log import Loggers - Loggers[logtype] = loggerClass + linkcheck.Loggers[logtype] = loggerClass self[logtype] = logargs - def log_init (self): if not self["quiet"]: self["log"].init() for log in self["fileoutput"]: log.init() - def log_endOfOutput (self): if not self["quiet"]: self["log"].endOfOutput(linknumber=self['linknumber']) for log in self["fileoutput"]: log.endOfOutput(linknumber=self['linknumber']) - def incrementLinknumber (self): try: self.dataLock.acquire() @@ -230,19 +222,15 @@ class Configuration (dict): finally: self.dataLock.release() - def hasMoreUrls (self): return self.urls - def finished (self): return self.threader.finished() and not self.urls - def finish (self): self.threader.finish() - def appendUrl (self, urlData): self.urlsLock.acquire() try: @@ -260,7 +248,6 @@ class Configuration (dict): finally: self.urlsLock.release() - def filterUrlQueue (self): """remove already cached urls from queue""" # note: url lock must be acquired @@ -270,7 +257,6 @@ class Configuration (dict): print >>sys.stderr, \ i18n._("removed %d cached urls from incoming queue")%removed - def getUrl (self): """get first url in queue and return it""" self.urlsLock.acquire() @@ -281,11 +267,9 @@ class Configuration (dict): finally: self.urlsLock.release() - def checkUrl (self, url): self.threader.start_thread(url.check, ()) - def urlSeen_has_key (self, key): self.urlSeenLock.acquire() try: @@ -293,7 +277,6 @@ class Configuration (dict): finally: self.urlSeenLock.release() - def urlSeen_set (self, key): self.urlSeenLock.acquire() try: @@ -301,7 +284,6 @@ class Configuration (dict): finally: self.urlSeenLock.release() - def urlCache_has_key (self, key): self.urlCacheLock.acquire() try: @@ -309,7 +291,6 @@ class Configuration (dict): finally: self.urlCacheLock.release() - def urlCache_get (self, key): self.urlCacheLock.acquire() try: @@ -317,16 +298,14 @@ class Configuration (dict): finally: self.urlCacheLock.release() - def urlCache_set (self, key, val): self.urlCacheLock.acquire() try: - debug(NIGHTMARE, "caching", repr(key)) + bk.log.debug(linkcheck.LOG_CHECK, "caching", repr(key)) self.urlCache[key] = val finally: self.urlCacheLock.release() - def robotsTxtCache_has_key (self, key): self.robotsTxtCacheLock.acquire() try: @@ -334,7 +313,6 @@ class Configuration (dict): finally: self.robotsTxtCacheLock.release() - def robotsTxtCache_get (self, key): self.robotsTxtCacheLock.acquire() try: @@ -342,7 +320,6 @@ class Configuration (dict): finally: self.robotsTxtCacheLock.release() - def robotsTxtCache_set (self, key, val): self.robotsTxtCacheLock.acquire() try: @@ -350,7 +327,6 @@ class Configuration (dict): finally: self.robotsTxtCacheLock.release() - def log_newUrl (self, url): self.logLock.acquire() try: @@ -361,25 +337,23 @@ class Configuration (dict): finally: self.logLock.release() - def storeCookies (self, headers, host): self.dataLock.acquire() try: output = [] for h in headers.getallmatchingheaders("Set-Cookie"): output.append(h) - debug(BRING_IT_ON, "Store Cookie", h) + bk.log.debug(linkcheck.LOG_CHECK, "Store Cookie", h) c = self.cookies.setdefault(host, Cookie.SimpleCookie()) c.load(h) return output finally: self.dataLock.release() - def getCookies (self, host, path): self.dataLock.acquire() try: - debug(BRING_IT_ON, "Get Cookie", host, path) + bk.log.debug(linkcheck.LOG_CHECK, "Get Cookie", host, path) if not self.cookies.has_key(host): return [] cookievals = [] @@ -391,7 +365,6 @@ class Configuration (dict): finally: self.dataLock.release() - def read (self, files = []): cfiles = files[:] if not cfiles: @@ -402,86 +375,107 @@ class Configuration (dict): cfiles.append(norm("~/.linkcheckerrc")) self.readConfig(cfiles) - def readConfig (self, files): """this big function reads all the configuration parameters used in the linkchecker module.""" - debug(BRING_IT_ON, "reading configuration from", files) - from linkcheck.log import Loggers + bk.log.debug(linkcheck.LOG_CHECK, "reading configuration from", files) try: cfgparser = ConfigParser.ConfigParser() cfgparser.read(files) except ConfigParser.Error, msg: - debug(BRING_IT_ON, msg) + bk.log.debug(linkcheck.LOG_CHECK, msg) return section="output" - for key in Loggers.keys(): + for key in linkcheck.Loggers.keys(): if cfgparser.has_section(key): for opt in cfgparser.options(key): try: self[key][opt] = cfgparser.get(key, opt) - except ConfigParser.Error, msg: debug(NIGHTMARE, msg) + except ConfigParser.Error, msg: + bk.log.debug(linkcheck.LOG_CHECK, msg) try: self[key]['fields'] = [f.strip() for f in cfgparser.get(key, 'fields').split(',')] - except ConfigParser.Error, msg: debug(NIGHTMARE, msg) + except ConfigParser.Error, msg: + bk.log.debug(linkcheck.LOG_CHECK, msg) try: log = cfgparser.get(section, "log") - if Loggers.has_key(log): + if linkcheck.Loggers.has_key(log): self['log'] = self.newLogger(log) else: warn(i18n._("invalid log option '%s'") % log) - except ConfigParser.Error, msg: debug(NIGHTMARE, msg) + except ConfigParser.Error, msg: + bk.log.debug(linkcheck.LOG_CHECK, msg) try: if cfgparser.getboolean(section, "verbose"): self["verbose"] = True self["warnings"] = True - except ConfigParser.Error, msg: debug(NIGHTMARE, msg) - try: self["quiet"] = cfgparser.getboolean(section, "quiet") - except ConfigParser.Error, msg: debug(NIGHTMARE, msg) - try: self["status"] = cfgparser.getboolean(section, "status") - except ConfigParser.Error, msg: debug(NIGHTMARE, msg) - try: self["warnings"] = cfgparser.getboolean(section, "warnings") - except ConfigParser.Error, msg: debug(NIGHTMARE, msg) + except ConfigParser.Error, msg: + bk.log.debug(linkcheck.LOG_CHECK, msg) + try: + self["quiet"] = cfgparser.getboolean(section, "quiet") + except ConfigParser.Error, msg: + bk.log.debug(linkcheck.LOG_CHECK, msg) + try: + self["status"] = cfgparser.getboolean(section, "status") + except ConfigParser.Error, msg: + bk.log.debug(linkcheck.LOG_CHECK, msg) + try: + self["warnings"] = cfgparser.getboolean(section, "warnings") + except ConfigParser.Error, msg: + bk.log.debug(linkcheck.LOG_CHECK, msg) try: filelist = cfgparser.get(section, "fileoutput").split(",") for arg in filelist: arg = arg.strip() # no file output for the blacklist and none Logger - if Loggers.has_key(arg) and arg not in ["blacklist", "none"]: + if linkcheck.Loggers.has_key(arg) and arg not in ["blacklist", "none"]: self['fileoutput'].append( self.newLogger(arg, {'fileoutput':1})) - except ConfigParser.Error, msg: debug(NIGHTMARE, msg) + except ConfigParser.Error, msg: + bk.log.debug(linkcheck.LOG_CHECK, msg) section="checking" try: num = cfgparser.getint(section, "threads") self.setThreads(num) - except ConfigParser.Error: debug(NIGHTMARE, msg) - try: self["anchors"] = cfgparser.getboolean(section, "anchors") - except ConfigParser.Error, msg: debug(NIGHTMARE, msg) + except ConfigParser.Error: + bk.log.debug(linkcheck.LOG_CHECK, msg) + try: + self["anchors"] = cfgparser.getboolean(section, "anchors") + except ConfigParser.Error, msg: + bk.log.debug(linkcheck.LOG_CHECK, msg) try: num = cfgparser.getint(section, "recursionlevel") self["recursionlevel"] = num - except ConfigParser.Error, msg: debug(NIGHTMARE, msg) - try: self["strict"] = cfgparser.getboolean(section, "strict") - except ConfigParser.Error, msg: debug(NIGHTMARE, msg) + except ConfigParser.Error, msg: + bk.log.debug(linkcheck.LOG_CHECK, msg) + try: + self["strict"] = cfgparser.getboolean(section, "strict") + except ConfigParser.Error, msg: + bk.log.debug(linkcheck.LOG_CHECK, msg) try: wr = cfgparser.get(section, "warningregex") if wr: self["warningregex"] = re.compile(wr) - except ConfigParser.Error, msg: debug(NIGHTMARE, msg) - try: self["warnsizebytes"] = int(cfgparser.get(section, "warnsizebytes")) - except ConfigParser.Error, msg: debug(NIGHTMARE, msg) + except ConfigParser.Error, msg: + bk.log.debug(linkcheck.LOG_CHECK, msg) + try: + self["warnsizebytes"] = int(cfgparser.get(section, "warnsizebytes")) + except ConfigParser.Error, msg: + bk.log.debug(linkcheck.LOG_CHECK, msg) try: self["nntpserver"] = cfgparser.get(section, "nntpserver") - except ConfigParser.Error, msg: debug(NIGHTMARE, msg) + except ConfigParser.Error, msg: + bk.log.debug(linkcheck.LOG_CHECK, msg) try: self["interactive"] = cfgparser.getboolean(section, "interactive") - except ConfigParser.Error, msg: debug(NIGHTMARE, msg) + except ConfigParser.Error, msg: + bk.log.debug(linkcheck.LOG_CHECK, msg) try: self["anchorcaching"] = cfgparser.getboolean(section, "anchorcaching") - except ConfigParser.Error, msg: debug(NIGHTMARE, msg) + except ConfigParser.Error, msg: + bk.log.debug(linkcheck.LOG_CHECK, msg) section = "authentication" try: @@ -494,7 +488,8 @@ class Configuration (dict): 'user': auth[1], 'password': auth[2]}) i += 1 - except ConfigParser.Error, msg: debug(NIGHTMARE, msg) + except ConfigParser.Error, msg: + bk.log.debug(linkcheck.LOG_CHECK, msg) section = "filtering" try: @@ -506,8 +501,13 @@ class Configuration (dict): break self["externlinks"].append(linkcheck.getLinkPat(ctuple[0], strict=int(ctuple[1]))) i += 1 - except ConfigParser.Error, msg: debug(NIGHTMARE, msg) - try: self["internlinks"].append(linkcheck.getLinkPat(cfgparser.get(section, "internlinks"))) - except ConfigParser.Error, msg: debug(NIGHTMARE, msg) - try: self["denyallow"] = cfgparser.getboolean(section, "denyallow") - except ConfigParser.Error, msg: debug(NIGHTMARE, msg) + except ConfigParser.Error, msg: + bk.log.debug(linkcheck.LOG_CHECK, msg) + try: + self["internlinks"].append(linkcheck.getLinkPat(cfgparser.get(section, "internlinks"))) + except ConfigParser.Error, msg: + bk.log.debug(linkcheck.LOG_CHECK, msg) + try: + self["denyallow"] = cfgparser.getboolean(section, "denyallow") + except ConfigParser.Error, msg: + bk.log.debug(linkcheck.LOG_CHECK, msg) diff --git a/linkcheck/DNS/__init__.py b/linkcheck/DNS/__init__.py index 288ee869..883c3b0e 100644 --- a/linkcheck/DNS/__init__.py +++ b/linkcheck/DNS/__init__.py @@ -11,57 +11,11 @@ __version__ = '2.3.0' -import Type, Opcode, Status, Class -from Base import DnsRequest, DNSError -from Lib import DnsResult -Error=DNSError -from lazy import * -Request = DnsRequest -Result = DnsResult +import Base +import Lib -import linkcheck.DNS.Base -linkcheck.DNS.Base.DiscoverNameServers() +Error = Base.DNSError +Request = Base.DnsRequest +Result = Lib.DnsResult -# -# $Log$ -# Revision 1.8 2004/07/07 18:01:59 calvin -# new module layout -# -# Revision 1.7 2003/07/04 14:23:22 calvin -# add coding line -# -# Revision 1.6 2003/01/05 17:52:53 calvin -# fix -# -# Revision 1.5 2003/01/05 17:39:19 calvin -# pychecker fixes -# -# Revision 1.4 2002/11/26 23:27:43 calvin -# update to Python >= 2.2.1 -# -# Revision 1.8 2002/05/06 06:17:49 anthonybaxter -# found that the old README file called itself release 2.2. So make -# this one 2.3... -# -# Revision 1.7 2002/05/06 06:16:15 anthonybaxter -# make some sort of reasonable version string. releasewards ho! -# -# Revision 1.6 2002/03/19 13:05:02 anthonybaxter -# converted to class based exceptions (there goes the python1.4 compatibility :) -# -# removed a quite gross use of 'eval()'. -# -# Revision 1.5 2002/03/19 12:41:33 anthonybaxter -# tabnannied and reindented everything. 4 space indent, no tabs. -# yay. -# -# Revision 1.4 2001/11/26 17:57:51 stroeder -# Added __version__ -# -# Revision 1.3 2001/08/09 09:08:55 anthonybaxter -# added identifying header to top of each file -# -# Revision 1.2 2001/07/19 06:57:07 anthony -# cvs keywords added -# -# +Base.DiscoverNameServers() diff --git a/linkcheck/StringUtil.py b/linkcheck/StringUtil.py index 84a2015e..f096efb4 100644 --- a/linkcheck/StringUtil.py +++ b/linkcheck/StringUtil.py @@ -86,20 +86,6 @@ def getLastWordBoundary (s, width): return width-1 -def applyTable (table, s): - "apply a table of replacement pairs to str" - for mapping in table: - s = s.replace(mapping[0], mapping[1]) - return s - - -def sqlify (s): - "Escape special SQL chars and strings" - if not s: - return "NULL" - return "'%s'"%applyTable(SQLTable, s) - - def htmlify (s): "Escape special HTML chars and strings" return applyTable(HtmlTable, s) diff --git a/linkcheck/XmlUtils.py b/linkcheck/XmlUtils.py deleted file mode 100644 index f4700562..00000000 --- a/linkcheck/XmlUtils.py +++ /dev/null @@ -1,49 +0,0 @@ -# -*- coding: iso-8859-1 -*- -"""XML utility functions""" -# Copyright (C) 2003-2004 Bastian Kleineidam -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - -__version__ = "$Revision$"[11:-2] -__date__ = "$Date$"[7:-2] - -import xml.sax.saxutils - -xmlattr_entities = { - "&": "&", - "<": "<", - ">": ">", - "\"": """, -} - - -def xmlquote (s): - """quote characters for XML""" - return xml.sax.saxutils.escape(s) - - -def xmlquoteattr (s): - """quote XML attribute, ready for inclusion with double quotes""" - return xml.sax.saxutils.escape(s, xmlattr_entities) - - -def xmlunquote (s): - """unquote characters from XML""" - return xml.sax.saxutils.unescape(s) - - -def xmlunquoteattr (s): - """unquote attributes from XML""" - return xml.sax.saxutils.unescape(s, xmlattr_entities) diff --git a/linkcheck/__init__.py b/linkcheck/__init__.py index 0cebd021..ac7903b9 100644 --- a/linkcheck/__init__.py +++ b/linkcheck/__init__.py @@ -16,9 +16,10 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -import re import sys -import urlparse +import re +import time +import linkcheck.i18n # logger areas @@ -32,9 +33,36 @@ class LinkCheckerError (Exception): pass +def strtime (t): + """return ISO 8601 formatted time""" + return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t)) + \ + strtimezone() + + +def strduration (duration): + """return string formatted time duration""" + name = linkcheck.i18n._("seconds") + if duration > 60: + duration = duration / 60 + name = linkcheck.i18n._("minutes") + if duration > 60: + duration = duration / 60 + name = linkcheck.i18n._("hours") + return " %.3f %s"%(duration, name) + + +def strtimezone (): + """return timezone info, %z on some platforms, but not supported on all""" + if time.daylight: + zone = time.altzone + else: + zone = time.timezone + return "%+04d" % int(-zone/3600) + + def getLinkPat (arg, strict=False): """get a link pattern matcher for intern/extern links""" - linkcheck.log.debug(LOG_CHECK, "Link pattern %r", arg) + bk.log.debug(LOG_CHECK, "Link pattern %r", arg) if arg[0:1] == '!': pattern = arg[1:] negate = True @@ -48,48 +76,37 @@ def getLinkPat (arg, strict=False): } -# file extensions we can parse recursively -extensions = { - "html": re.compile(r'(?i)\.s?html?$'), - "opera": re.compile(r'^(?i)opera.adr$'), # opera bookmark file - "css": re.compile(r'(?i)\.css$'), # CSS stylesheet -# "text": re.compile(r'(?i)\.(txt|xml|tsv|csv|sgml?|py|java|cc?|cpp|h)$'), -} - - -import linkcheck.FileUrlData -import linkcheck.IgnoredUrlData -import linkcheck.FtpUrlData -import linkcheck.GopherUrlData -import linkcheck.HttpUrlData -import linkcheck.HttpsUrlData -import linkcheck.MailtoUrlData -import linkcheck.TelnetUrlData -import linkcheck.NntpUrlData - -def set_intern_url (url, klass, config): - """Precondition: config['strict'] is true (ie strict checking) and - recursion level is zero (ie url given on the command line)""" - if klass == linkcheck.FileUrlData.FileUrlData: - linkcheck.log.debug(LOG_CHECK, "Add intern pattern ^file:") - config['internlinks'].append(getLinkPat("^file:")) - elif klass in [linkcheck.HttpUrlData.HttpUrlData, - linkcheck.HttpsUrlData.HttpsUrlData, - linkcheck.FtpUrlData.FtpUrlData]: - domain = urlparse.urlsplit(url)[1] - if domain: - domain = "://%s"%re.escape(domain) - debug(BRING_IT_ON, "Add intern domain", domain) - # add scheme colon to link pattern - config['internlinks'].append(getLinkPat(domain)) - - -import linkcheck.logger - def printStatus (config, curtime, start_time): tocheck = len(config.urls) links = config['linknumber'] active = config.threader.active_threads() - duration = linkcheck.logger.strduration(curtime - start_time) + duration = strduration(curtime - start_time) print >>sys.stderr, linkcheck.i18n._("%5d urls queued, %4d links checked, %2d active threads, runtime %s")%\ (tocheck, links, active, duration) + + +import linkcheck.logger.StandardLogger +import linkcheck.logger.HtmlLogger +import linkcheck.logger.ColoredLogger +import linkcheck.logger.GMLLogger +import linkcheck.logger.SQLLogger +import linkcheck.logger.CSVLogger +import linkcheck.logger.BlacklistLogger +import linkcheck.logger.XMLLogger +import linkcheck.logger.NoneLogger + + +# default logger classes +Loggers = { + "text": linkcheck.logger.StandardLogger.StandardLogger, + "html": linkcheck.logger.HtmlLogger.HtmlLogger, + "colored": linkcheck.logger.ColoredLogger.ColoredLogger, + "gml": linkcheck.logger.GMLLogger.GMLLogger, + "sql": linkcheck.logger.SQLLogger.SQLLogger, + "csv": linkcheck.logger.CSVLogger.CSVLogger, + "blacklist": linkcheck.logger.BlacklistLogger.BlacklistLogger, + "xml": linkcheck.logger.XMLLogger.XMLLogger, + "none": linkcheck.logger.NoneLogger.NoneLogger, +} +# for easy printing: a comma separated logger list +LoggerKeys = ", ".join(Loggers.keys()) diff --git a/linkcheck/checker/FileUrlData.py b/linkcheck/checker/FileUrlData.py index 28cbd77f..7299e369 100644 --- a/linkcheck/checker/FileUrlData.py +++ b/linkcheck/checker/FileUrlData.py @@ -19,10 +19,10 @@ import re import os import urlparse -import linkcheck.UrlData +import linkcheck.checker # OSError is thrown on Windows when a file is not found -linkcheck.UrlData.ExcList.append(OSError) +linkcheck.checker.ExcList.append(OSError) # if file extension was fruitless, look at the content contents = { diff --git a/linkcheck/checker/UrlData.py b/linkcheck/checker/UrlData.py index eabe5cca..f902900b 100644 --- a/linkcheck/checker/UrlData.py +++ b/linkcheck/checker/UrlData.py @@ -78,18 +78,6 @@ def get_absolute_url (urlName, baseRef, parentName): return "" -# we catch these exceptions, all other exceptions are internal -# or system errors -ExcList = [ - IOError, - ValueError, # from httplib.py - linkcheck.LinkCheckerError, - linkcheck.DNS.Error, - socket.timeout, - socket.error, - select.error, -] - if hasattr(socket, "sslerror"): ExcList.append(socket.sslerror) @@ -226,15 +214,13 @@ class UrlData (object): def check (self): try: self._check() - except KeyboardInterrupt: - raise except (socket.error, select.error): # on Unix, ctrl-c can raise # error: (4, 'Interrupted system call') etype, value = sys.exc_info()[:2] if etype!=4: raise - except linkcheck.test_support.Error: + except (KeyboardInterrupt, linkcheck.test_support.Error): raise except: internal_error() diff --git a/linkcheck/checker/__init__.py b/linkcheck/checker/__init__.py index 5de372a4..68ced2e3 100644 --- a/linkcheck/checker/__init__.py +++ b/linkcheck/checker/__init__.py @@ -17,6 +17,25 @@ # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. import time +import socket +import select +import re +import urlparse +import linkcheck +import linkcheck.DNS + + +# we catch these exceptions, all other exceptions are internal +# or system errors +ExcList = [ + IOError, + ValueError, # from httplib.py + linkcheck.LinkCheckerError, + linkcheck.DNS.Error, + socket.timeout, + socket.error, + select.error, +] # main check function @@ -66,6 +85,32 @@ import linkcheck.checker.MailtoUrlData import linkcheck.checker.TelnetUrlData import linkcheck.checker.NntpUrlData +# file extensions we can parse recursively +extensions = { + "html": re.compile(r'(?i)\.s?html?$'), + "opera": re.compile(r'^(?i)opera.adr$'), # opera bookmark file + "css": re.compile(r'(?i)\.css$'), # CSS stylesheet +# "text": re.compile(r'(?i)\.(txt|xml|tsv|csv|sgml?|py|java|cc?|cpp|h)$'), +} + + +def set_intern_url (url, klass, config): + """Precondition: config['strict'] is true (ie strict checking) and + recursion level is zero (ie url given on the command line)""" + if klass == linkcheck.checker.FileUrlData.FileUrlData: + linkcheck.log.debug(linkcheck.LOG_CHECK, "Add intern pattern ^file:") + config['internlinks'].append(getLinkPat("^file:")) + elif klass in [linkcheck.checker.HttpUrlData.HttpUrlData, + linkcheck.checker.HttpsUrlData.HttpsUrlData, + linkcheck.checker.FtpUrlData.FtpUrlData]: + domain = urlparse.urlsplit(url)[1] + if domain: + domain = "://%s"%re.escape(domain) + linkcheck.log.debug(linkcheck.LOG_CHECK, "Add intern domain", domain) + # add scheme colon to link pattern + config['internlinks'].append(getLinkPat(domain)) + + def getUrlDataFrom (urlName, recursionLevel, config, parentName=None, baseRef=None, line=0, column=0, name=None, cmdline=None): diff --git a/linkcheck/containers.py b/linkcheck/containers.py deleted file mode 100644 index 1562409b..00000000 --- a/linkcheck/containers.py +++ /dev/null @@ -1,200 +0,0 @@ -# -*- coding: iso-8859-1 -*- -"""special container classes""" -# Copyright (C) 2004 Bastian Kleineidam -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - -__version__ = "$Revision$"[11:-2] -__date__ = "$Date$"[7:-2] - - -class SetList (list): - """a list that eliminates all duplicates - """ - - def append (self, x): - """append only if not already there""" - if x not in self: - super(SetList, self).append(x) - - def extend (self, x): - """extend while eliminating duplicates by appending item for item""" - for i in x: - self.append(i) - - def insert (self, i, x): - """insert only if not already there""" - if x not in self: - super(SetList, self).insert(i, x) - - def __setitem__ (self, key, value): - """set new value, and eliminate old duplicates (if any)""" - oldvalues = [] - for i in range(len(self)): - if self[i]==value: - oldvalues.append(i) - super(SetList, self).__setitem__(key, value) - # remove old duplicates (from last to first) - oldvalues.reverse() - for i in oldvalues: - if i!=key: - del self[key] - - -class ListDict (dict): - """a dictionary whose iterators reflect the order in which elements - were added - """ - - def __init__ (self): - """initialize sorted key list""" - # sorted list of keys - self._keys = [] - - def __setitem__ (self, key, value): - """add key,value to dict, append key to sorted list""" - if not self.has_key(key): - self._keys.append(key) - super(ListDict, self).__setitem__(key, value) - - def __delitem__ (self, key): - """remove key from dict""" - self._keys.remove(key) - super(ListDict, self).__delitem__(key) - - def values (self): - """return sorted list of values""" - return [self[k] for k in self._keys] - - def items (self): - """return sorted list of items""" - return [(k, self[k]) for k in self._keys] - - def keys (self): - """return sorted list of keys""" - return self._keys[:] - - def itervalues (self): - """return iterator over sorted values""" - return iter(self.values()) - - def iteritems (self): - """return iterator over sorted items""" - return iter(self.items()) - - def iterkeys (self): - """return iterator over sorted keys""" - return iter(self.keys()) - - def clear (self): - """remove all dict entires""" - self._keys = [] - super(ListDict, self).clear() - - -class LRU (object): - """ - Implementation of a length-limited O(1) LRU queue. - Built for and used by PyPE: - http://pype.sourceforge.net - Copyright 2003 Josiah Carlson. (Licensed under the GPL) - """ - class Node (object): - def __init__ (self, prev, me): - self.prev = prev - self.me = me - self.next = None - - def __init__ (self, count, pairs=[]): - self.count = max(count, 1) - self.d = {} - self.first = None - self.last = None - for key, value in pairs: - self[key] = value - - def __contains__ (self, obj): - return obj in self.d - - def has_key (self, obj): - return self.d.has_key(obj) - - def __getitem__ (self, obj): - a = self.d[obj].me - self[a[0]] = a[1] - return a[1] - - def __setitem__ (self, obj, val): - if obj in self.d: - del self[obj] - nobj = self.Node(self.last, (obj, val)) - if self.first is None: - self.first = nobj - if self.last: - self.last.next = nobj - self.last = nobj - self.d[obj] = nobj - if len(self.d) > self.count: - if self.first == self.last: - self.first = None - self.last = None - return - a = self.first - a.next.prev = None - self.first = a.next - a.next = None - del self.d[a.me[0]] - del a - - def __delitem__ (self, obj): - nobj = self.d[obj] - if nobj.prev: - nobj.prev.next = nobj.next - else: - self.first = nobj.next - if nobj.next: - nobj.next.prev = nobj.prev - else: - self.last = nobj.prev - del self.d[obj] - - def __iter__ (self): - cur = self.first - while cur != None: - cur2 = cur.next - yield cur.me[1] - cur = cur2 - - def iteritems (self): - cur = self.first - while cur != None: - cur2 = cur.next - yield cur.me - cur = cur2 - - def iterkeys (self): - return iter(self.d) - - def itervalues (self): - for i,j in self.iteritems(): - yield j - - def keys (self): - return self.d.keys() - - def setdefault (self, key, failobj=None): - if not self.has_key(key): - self[key] = failobj - return self[key] diff --git a/linkcheck/i18n.py b/linkcheck/i18n.py deleted file mode 100644 index f5e324d8..00000000 --- a/linkcheck/i18n.py +++ /dev/null @@ -1,34 +0,0 @@ -# -*- coding: iso-8859-1 -*- -"""internationalization support""" -# Copyright (C) 2000-2004 Bastian Kleineidam -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - -# i18n suppport -import os -from _linkchecker_configdata import install_data - -def init_gettext (): - global _ - try: - import gettext - domain = 'linkcheck' - localedir = os.path.join(install_data, 'share', 'locale') - _ = gettext.translation(domain, localedir).gettext - except (IOError, ImportError): - # default gettext function - _ = lambda s: s - -init_gettext() diff --git a/linkcheck/logger/BlacklistLogger.py b/linkcheck/logger/BlacklistLogger.py index b2c05310..21f3d668 100644 --- a/linkcheck/logger/BlacklistLogger.py +++ b/linkcheck/logger/BlacklistLogger.py @@ -17,10 +17,10 @@ import sys import os -import linkcheck +import linkcheck.logger.Logger -class BlacklistLogger (linkcheck.logger.Logger): +class BlacklistLogger (linkcheck.logger.Logger.Logger): """Updates a blacklist of wrong links. If a link on the blacklist is working (again), it is removed from the list. So after n days we have only links on the list which failed for n days. diff --git a/linkcheck/logger/CSVLogger.py b/linkcheck/logger/CSVLogger.py index 1679ac07..4608ea7b 100644 --- a/linkcheck/logger/CSVLogger.py +++ b/linkcheck/logger/CSVLogger.py @@ -17,7 +17,9 @@ import time import csv -import linkcheck +import linkcheck.i18n +import linkcheck.logger.StandardLogger +import linkcheck.logger.Logger class CSVLogger (linkcheck.logger.StandardLogger.StandardLogger): @@ -30,7 +32,7 @@ class CSVLogger (linkcheck.logger.StandardLogger.StandardLogger): self.lineterminator = "\n" def init (self): - linkcheck.logger.Logger.init(self) + linkcheck.logger.Logger.Logger.init(self) if self.fd is None: return self.starttime = time.time() diff --git a/linkcheck/logger/ColoredLogger.py b/linkcheck/logger/ColoredLogger.py index 288ae5d7..51e7808f 100644 --- a/linkcheck/logger/ColoredLogger.py +++ b/linkcheck/logger/ColoredLogger.py @@ -15,10 +15,12 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -import linkcheck +import linkcheck.i18n +import linkcheck.AnsiColor +import linkcheck.logger.StandardLogger -class ColoredLogger (linkcheck.logger.StandardLogger): +class ColoredLogger (linkcheck.logger.StandardLogger.StandardLogger): """ANSI colorized output""" def __init__ (self, **args): diff --git a/linkcheck/logger/NoneLogger.py b/linkcheck/logger/NoneLogger.py index 74d4b47c..f066aa4b 100644 --- a/linkcheck/logger/NoneLogger.py +++ b/linkcheck/logger/NoneLogger.py @@ -15,7 +15,7 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -import linkcheck +import linkcheck.logger.Logger class NoneLogger (linkcheck.logger.Logger.Logger): diff --git a/linkcheck/logger/SQLLogger.py b/linkcheck/logger/SQLLogger.py index 3e5d0fb3..b8aced22 100644 --- a/linkcheck/logger/SQLLogger.py +++ b/linkcheck/logger/SQLLogger.py @@ -17,6 +17,23 @@ import time import linkcheck +import linkcheck.i18n +import linkcheck.logger.StandardLogger +import linkcheck.logger.Logger + + +def applyTable (table, s): + "apply a table of replacement pairs to str" + for mapping in table: + s = s.replace(mapping[0], mapping[1]) + return s + + +def sqlify (s): + "Escape special SQL chars and strings" + if not s: + return "NULL" + return "'%s'"%applyTable(SQLTable, s) class SQLLogger (linkcheck.logger.StandardLogger.StandardLogger): @@ -33,7 +50,7 @@ class SQLLogger (linkcheck.logger.StandardLogger.StandardLogger): self.starttime = time.time() if self.has_field("intro"): self.fd.write("-- "+(linkcheck.i18n._("created by %s at %s\n") % (linkcheck.Config.AppName, - linkcheck.logger.strtime(self.starttime)))) + linkcheck.strtime(self.starttime)))) self.fd.write("-- "+(linkcheck.i18n._("Get the newest version at %s\n") % linkcheck.Config.Url)) self.fd.write("-- "+(linkcheck.i18n._("Write comments and bugs to %s\n\n") % \ linkcheck.Config.Email)) @@ -47,19 +64,19 @@ class SQLLogger (linkcheck.logger.StandardLogger.StandardLogger): " values " "(%s,%d,%s,%s,%s,%s,%s,%s,%d,%s,%d,%d,%s,%d,%d,%d,%d)%s\n" % \ (self.dbname, - linkcheck.StringUtil.sqlify(urlData.urlName), + sqlify(urlData.urlName), urlData.recursionLevel, - linkcheck.StringUtil.sqlify(linkcheck.url.url_quote(urlData.parentName or "")), - linkcheck.StringUtil.sqlify(urlData.baseRef), - linkcheck.StringUtil.sqlify(urlData.errorString), - linkcheck.StringUtil.sqlify(urlData.validString), - linkcheck.StringUtil.sqlify(urlData.warningString), - linkcheck.StringUtil.sqlify(urlData.infoString), + sqlify(linkcheck.url.url_quote(urlData.parentName or "")), + sqlify(urlData.baseRef), + sqlify(urlData.errorString), + sqlify(urlData.validString), + sqlify(urlData.warningString), + sqlify(urlData.infoString), urlData.valid, - linkcheck.StringUtil.sqlify(linkcheck.url.url_quote(urlData.url)), + sqlify(linkcheck.url.url_quote(urlData.url)), urlData.line, urlData.column, - linkcheck.StringUtil.sqlify(urlData.name), + sqlify(urlData.name), urlData.checktime, urlData.dltime, urlData.dlsize, diff --git a/linkcheck/logger/XMLLogger.py b/linkcheck/logger/XMLLogger.py index 3949a814..289d20a4 100644 --- a/linkcheck/logger/XMLLogger.py +++ b/linkcheck/logger/XMLLogger.py @@ -16,9 +16,38 @@ # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. import time +import xml.sax.saxutils import linkcheck +xmlattr_entities = { + "&": "&", + "<": "<", + ">": ">", + "\"": """, +} + + +def xmlquote (s): + """quote characters for XML""" + return xml.sax.saxutils.escape(s) + + +def xmlquoteattr (s): + """quote XML attribute, ready for inclusion with double quotes""" + return xml.sax.saxutils.escape(s, xmlattr_entities) + + +def xmlunquote (s): + """unquote characters from XML""" + return xml.sax.saxutils.unescape(s) + + +def xmlunquoteattr (s): + """unquote attributes from XML""" + return xml.sax.saxutils.unescape(s, xmlattr_entities) + + class XMLLogger (linkcheck.logger.StandardLogger.StandardLogger): """XML output mirroring the GML structure. Easy to parse with any XML tool.""" @@ -56,7 +85,7 @@ class XMLLogger (linkcheck.logger.StandardLogger.StandardLogger): self.fd.write(">\n") if self.has_field("realurl"): self.fd.write(" \n" %\ - linkcheck.XmlUtils.xmlquote(linkcheck.url.url_quote(node.url))) + xmlquote(linkcheck.url.url_quote(node.url))) self.fd.write(" \n") if node.dltime>=0 and self.has_field("dltime"): self.fd.write(" %f\n" % node.dltime) @@ -85,7 +114,7 @@ class XMLLogger (linkcheck.logger.StandardLogger.StandardLogger): self.fd.write(">\n") if self.has_field("url"): self.fd.write(" \n" % \ - linkcheck.XmlUtils.linkcheck.xmlquote(node.urlName)) + xmlquote(node.urlName)) self.fd.write(" \n") if self.has_field("result"): self.fd.write(" %d\n" % \ diff --git a/linkcheck/logger/__init__.py b/linkcheck/logger/__init__.py index 0fa3e900..b38ca970 100644 --- a/linkcheck/logger/__init__.py +++ b/linkcheck/logger/__init__.py @@ -16,60 +16,3 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -import time -import linkcheck -import linkcheck.i18n - - -def strtime (t): - """return ISO 8601 formatted time""" - return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t)) + \ - strtimezone() - - -def strduration (duration): - """return string formatted time duration""" - name = linkcheck.i18n._("seconds") - if duration > 60: - duration = duration / 60 - name = linkcheck.i18n._("minutes") - if duration > 60: - duration = duration / 60 - name = linkcheck.i18n._("hours") - return " %.3f %s"%(duration, name) - - -def strtimezone (): - """return timezone info, %z on some platforms, but not supported on all""" - if time.daylight: - zone = time.altzone - else: - zone = time.timezone - return "%+04d" % int(-zone/3600) - - -import linkcheck.logger.StandardLogger -import linkcheck.logger.HtmlLogger -import linkcheck.logger.ColoredLogger -import linkcheck.logger.GMLLogger -import linkcheck.logger.SQLLogger -import linkcheck.logger.CSVLogger -import linkcheck.logger.BlacklistLogger -import linkcheck.logger.XMLLogger -import linkcheck.logger.NoneLogger - - -# default logger classes -Loggers = { - "text": linkcheck.logger.StandardLogger.StandardLogger, - "html": linkcheck.logger.HtmlLogger.HtmlLogger, - "colored": linkcheck.logger.ColoredLogger.ColoredLogger, - "gml": linkcheck.logger.GMLLogger.GMLLogger, - "sql": linkcheck.logger.SQLLogger.SQLLogger, - "csv": linkcheck.logger.CSVLogger.CSVLogger, - "blacklist": linkcheck.logger.BlacklistLogger.BlacklistLogger, - "xml": linkcheck.logger.XMLLogger.XMLLogger, - "none": linkcheck.logger.NoneLogger.NoneLogger, -} -# for easy printing: a comma separated logger list -LoggerKeys = ", ".join(Loggers.keys())