diff --git a/Makefile b/Makefile index c19c810b..643151b9 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,7 @@ VERSION=$(shell ./setup.py -V) HOST=treasure.calvinsplayground.de -PROXY=-P$(HOST):5050 +PROXY= +#PROXY=-P$(HOST):5050 #HOST=fsinfo.cs.uni-sb.de #PROXY=-Pwww-proxy.uni-sb.de:3128 PACKAGE = linkchecker diff --git a/linkcheck/Config.py b/linkcheck/Config.py index 4c03a445..81794253 100644 --- a/linkcheck/Config.py +++ b/linkcheck/Config.py @@ -58,9 +58,9 @@ class Configuration(UserDict.UserDict): self.urlCache = {} self.robotsTxtCache = {} try: - from threading import * + import threading self.enableThreading(5) - except: + except ImportError: type, value = sys.exc_info()[:2] self.disableThreading() @@ -243,7 +243,7 @@ class Configuration(UserDict.UserDict): try: cfgparser = ConfigParser.ConfigParser() cfgparser.read(files) - except: + except ConfigParser.Error: return section="output" @@ -253,16 +253,16 @@ class Configuration(UserDict.UserDict): self.data["log"] = Loggers[log]() else: self.warn("invalid log option "+log) - except: pass + except ConfigParser.Error: pass try: if cfgparser.getboolean(section, "verbose"): self.data["verbose"] = 1 self.data["warnings"] = 1 - except: pass + except ConfigParser.Error: pass try: self.data["quiet"] = cfgparser.getboolean(section, "quiet") - except: pass + except ConfigParser.Error: pass try: self.data["warnings"] = cfgparser.getboolean(section, "warnings") - except: pass + except ConfigParser.Error: pass section="checking" try: @@ -271,29 +271,29 @@ class Configuration(UserDict.UserDict): self.disableThreads() else: self.enableThreads(num) - except: pass + except ConfigParser.Error: pass try: self.data["anchors"] = cfgparser.getboolean(section, "anchors") - except: pass + except ConfigParser.Error: pass try: self.data["proxy"] = cfgparser.get(section, "proxy") self.data["proxyport"] = cfgparser.getint(section, "proxyport") - except: pass + except ConfigParser.Error: pass try: num = cfgparser.getint(section, 
"recursionlevel") if num<0: self.error("illegal recursionlevel number: "+`num`) self.data["recursionlevel"] = num - except: pass + except ConfigParser.Error: pass try: self.data["robotstxt"] = cfgparser.getboolean(section, "robotstxt") - except: pass + except ConfigParser.Error: pass try: self.data["strict"] = cfgparser.getboolean(section, "strict") - except: pass + except ConfigParser.Error: pass try: filelist = string.split(cfgparser.get(section, "fileoutput")) for arg in filelist: if Loggers.has_key(arg): self.data["fileoutput"].append(Loggers[arg](open("pylice-out."+arg, "w"))) - except: pass + except ConfigParser.Error: pass section = "authentication" try: @@ -304,7 +304,7 @@ class Configuration(UserDict.UserDict): tuple[0] = re.compile(tuple[0]) self.data["authentication"].append(tuple) i = i + 1 - except: pass + except ConfigParser.Error: pass self.data["authentication"].append((re.compile(".*"), "anonymous", "guest@")) section = "filtering" @@ -315,9 +315,9 @@ class Configuration(UserDict.UserDict): if len(tuple)!=2: break self.data["externlinks"].append((re.compile(tuple[0]), int(tuple[1]))) - except: pass + except ConfigParser.Error: pass try: self.data["internlinks"].append(re.compile(cfgparser.get(section, "internlinks"))) - except: pass + except ConfigParser.Error: pass try: self.data["allowdeny"] = cfgparser.getboolean(section, "allowdeny") - except: pass + except ConfigParser.Error: pass diff --git a/linkcheck/FileUrlData.py b/linkcheck/FileUrlData.py index e608eabb..517ec9bc 100644 --- a/linkcheck/FileUrlData.py +++ b/linkcheck/FileUrlData.py @@ -9,12 +9,12 @@ class FileUrlData(UrlData): urlName, recursionLevel, parentName = None, - baseRef = None, line=0, _time=0): + baseRef = None, line=0): UrlData.__init__(self, urlName, - recursionLevel, - parentName, - baseRef, line, _time) + recursionLevel, + parentName=parentName, + baseRef=baseRef, line=line) if not parentName and not baseRef and \ not re.compile("^file:").search(self.urlName): winre = 
re.compile("^[a-zA-Z]:") diff --git a/linkcheck/HostCheckingUrlData.py b/linkcheck/HostCheckingUrlData.py index f8448206..1411ef42 100644 --- a/linkcheck/HostCheckingUrlData.py +++ b/linkcheck/HostCheckingUrlData.py @@ -8,9 +8,9 @@ class HostCheckingUrlData(UrlData): urlName, recursionLevel, parentName = None, - baseRef = None, line=0, _time=0): - UrlData.__init__(self, urlName, recursionLevel, parentName, baseRef, - line, _time) + baseRef = None, line=0): + UrlData.__init__(self, urlName, recursionLevel, + parentName=parentName, baseRef=baseRef, line=line) self.host = None self.url = urlName diff --git a/linkcheck/MailtoUrlData.py b/linkcheck/MailtoUrlData.py index 527a8ef9..b9cc3b85 100644 --- a/linkcheck/MailtoUrlData.py +++ b/linkcheck/MailtoUrlData.py @@ -1,14 +1,17 @@ import re,socket,string,DNS,sys from HostCheckingUrlData import HostCheckingUrlData from smtplib import SMTP +from UrlData import LinkCheckException +mailto_re = re.compile("^mailto:" + "([\-\w.]+@[\-\w.?=]+|[\w\s]+<[\-\w.]+@[\-\w.?=]+>)$") class MailtoUrlData(HostCheckingUrlData): "Url link with mailto scheme" def buildUrl(self): HostCheckingUrlData.buildUrl(self) - if not re.compile("^mailto:([\-\w.]+@[\-\w.?=]+|[\w\s]+<[\-\w.]+@[\-\w.?=]+>)").match(self.urlName): - raise Exception, "Illegal mailto link syntax" + if not mailto_re.match(self.urlName): + raise LinkCheckException, "Illegal mailto link syntax" self.host = self.urlName[7:] i = string.find(self.host, "<") j = string.find(self.host, ">") @@ -44,7 +47,8 @@ class MailtoUrlData(HostCheckingUrlData): if smtpconnect: break if not smtpconnect: - self.setWarning("None of the mail hosts for "+self.host+" accepts an SMTP connection") + self.setWarning("None of the mail hosts for "+self.host+ + " accepts an SMTP connection, "+value) mxrecord = mxrecords[0][1] else: mxrecord = mxrecord[1] diff --git a/linkcheck/OutputReader.py b/linkcheck/OutputReader.py index 1e31f179..04479b7d 100644 --- a/linkcheck/OutputReader.py +++ 
b/linkcheck/OutputReader.py @@ -3,17 +3,19 @@ import UrlData class ParseException(Exception): pass - -class OutputReader: - ws = re.compile("\s+") - regex_realUrl = re.compile("^Real URL.+") - regex_result = re.compile("^Result.+") - regex_base = re.compile("^Base.+") - regex_info = re.compile("^Info.+") - regex_warning = re.compile("^Warning.+") - regex_parentUrl = re.compile("^Parent URL.+") - regex_valid = re.compile("^Valid.*") + +ws = re.compile("\s+") +regex_realUrl = re.compile("^Real URL.+") +regex_result = re.compile("^Result.+") +regex_base = re.compile("^Base.+") +regex_info = re.compile("^Info.+") +regex_warning = re.compile("^Warning.+") +regex_parentUrl = re.compile("^Parent URL.+") +regex_valid = re.compile("^Valid.*") + + +class OutputReader: def resetState(self): self.urlName = None @@ -32,7 +34,7 @@ class OutputReader: self.resetState() while line: - if OutputReader.ws.match(line): + if ws.match(line): if self.state>=2: #append url urldata = UrlData.GetUrlDataFrom(self.urlName, 0, @@ -52,19 +54,19 @@ class OutputReader: raise ParseException, "No Real URL and Result keyword found" self.resetState() - elif OutputReader.regex_realUrl.match(line): + elif regex_realUrl.match(line): self.state = self.state+1 self.urlName = string.strip(line[8:]) - elif OutputReader.regex_result.match(line): + elif regex_result.match(line): self.state = self.state+1 self.result = string.strip(line[6:]) - elif OutputReader.regex_info.match(line): + elif regex_info.match(line): self.info = string.strip(line[4:]) - elif OutputReader.regex_base.match(line): + elif regex_base.match(line): self.baseRef = string.strip(line[4:]) - elif OutputReader.regex_warning.match(line): + elif regex_warning.match(line): self.warning = string.strip(line[7:]) - elif OutputReader.regex_parentUrl.match(line): + elif regex_parentUrl.match(line): self.parentName = string.strip(line[10:]) if ',' in self.parentName: self.parentName,self.linenumber = string.split(self.parentName,",",1) diff --git 
a/linkcheck/TelnetUrlData.py b/linkcheck/TelnetUrlData.py index bbdf6074..7715f8d4 100644 --- a/linkcheck/TelnetUrlData.py +++ b/linkcheck/TelnetUrlData.py @@ -1,13 +1,16 @@ -import telnetlib,re +import telnetlib,re,string from HostCheckingUrlData import HostCheckingUrlData +from UrlData import LinkCheckException + +telnet_re = re.compile("^telnet:[\w.\-]+$") class TelnetUrlData(HostCheckingUrlData): "Url link with telnet scheme" def buildUrl(self): HostCheckingUrlData.buildUrl(self) - if not re.compile("^telnet:[\w.\-]+").match(self.urlName): - raise Exception, "Illegal telnet link syntax" + if not telnet_re.match(self.urlName): + raise LinkCheckException, "Illegal telnet link syntax" self.host = string.lower(self.urlName[7:]) diff --git a/linkcheck/Threader.py b/linkcheck/Threader.py index 260a22aa..774450b9 100644 --- a/linkcheck/Threader.py +++ b/linkcheck/Threader.py @@ -24,8 +24,7 @@ class Threader: def finish(self): self.reduceThreads() - for t in self.threads: - pass # dont know how to stop a thread + # dont know how to stop a thread def startThread(self, callable, args): "Generate a new thread" diff --git a/linkcheck/UrlData.py b/linkcheck/UrlData.py index a1af381f..9192b543 100644 --- a/linkcheck/UrlData.py +++ b/linkcheck/UrlData.py @@ -7,9 +7,13 @@ LinkTags = [("a", "href"), ("body", "background"), ("frame", "src"), ("link", "href"), - ("meta", "url"), # + # + ("meta", "url"), ("area", "href")] +class LinkCheckException(Exception): + pass + class UrlData: "Representing a URL with additional information like validity etc" @@ -100,9 +104,8 @@ class UrlData: try: self.buildUrl() self.extern = self._getExtern(config) - except: - type, value = sys.exc_info()[:2] - self.setError(str(value)) + except LinkCheckException, msg: + self.setError(str(msg)) self.logMe(config) return @@ -143,10 +146,11 @@ class UrlData: def closeConnection(self): # brute force closing - try: self.urlConnection.close() - except: pass - # release variable for garbage collection - self.urlConnection = None + if 
self.urlConnection is not None: + try: self.urlConnection.close() + except: pass + # release variable for garbage collection + self.urlConnection = None def putInCache(self, config): cacheKey = self.getCacheKey() @@ -276,7 +280,7 @@ from TelnetUrlData import TelnetUrlData def GetUrlDataFrom(urlName, recursionLevel, parentName = None, - baseRef = None, line = 0, _time = 0): + baseRef = None, line = 0): # search for the absolute url name="" if urlName and ":" in urlName: @@ -287,21 +291,21 @@ def GetUrlDataFrom(urlName, name = string.lower(parentName) # test scheme if re.compile("^http:").search(name): - return HttpUrlData(urlName, recursionLevel, parentName, baseRef, line, _time) + return HttpUrlData(urlName, recursionLevel, parentName, baseRef, line) if re.compile("^ftp:").search(name): - return FtpUrlData(urlName, recursionLevel, parentName, baseRef, line, _time) + return FtpUrlData(urlName, recursionLevel, parentName, baseRef, line) if re.compile("^file:").search(name): - return FileUrlData(urlName, recursionLevel, parentName, baseRef, line, _time) + return FileUrlData(urlName, recursionLevel, parentName, baseRef, line) if re.compile("^telnet:").search(name): - return TelnetUrlData(urlName, recursionLevel, parentName, baseRef, line, _time) + return TelnetUrlData(urlName, recursionLevel, parentName, baseRef, line) if re.compile("^mailto:").search(name): - return MailtoUrlData(urlName, recursionLevel, parentName, baseRef, line, _time) + return MailtoUrlData(urlName, recursionLevel, parentName, baseRef, line) if re.compile("^gopher:").search(name): - return GopherUrlData(urlName, recursionLevel, parentName, baseRef, line, _time) + return GopherUrlData(urlName, recursionLevel, parentName, baseRef, line) if re.compile("^javascript:").search(name): - return JavascriptUrlData(urlName, recursionLevel, parentName, baseRef, line, _time) + return JavascriptUrlData(urlName, recursionLevel, parentName, baseRef, line) if re.compile("^https:").search(name): - return 
HttpsUrlData(urlName, recursionLevel, parentName, baseRef, line, _time) + return HttpsUrlData(urlName, recursionLevel, parentName, baseRef, line) # assume local file - return FileUrlData(urlName, recursionLevel, parentName, baseRef, line, _time) + return FileUrlData(urlName, recursionLevel, parentName, baseRef, line)