diff --git a/Makefile b/Makefile
index c19c810b..643151b9 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,7 @@
VERSION=$(shell ./setup.py -V)
HOST=treasure.calvinsplayground.de
-PROXY=-P$(HOST):5050
+PROXY=
+#PROXY=-P$(HOST):5050
#HOST=fsinfo.cs.uni-sb.de
#PROXY=-Pwww-proxy.uni-sb.de:3128
PACKAGE = linkchecker
diff --git a/linkcheck/Config.py b/linkcheck/Config.py
index 4c03a445..81794253 100644
--- a/linkcheck/Config.py
+++ b/linkcheck/Config.py
@@ -58,9 +58,9 @@ class Configuration(UserDict.UserDict):
self.urlCache = {}
self.robotsTxtCache = {}
try:
- from threading import *
+ import threading
self.enableThreading(5)
- except:
+ except ImportError:
type, value = sys.exc_info()[:2]
self.disableThreading()
@@ -243,7 +243,7 @@ class Configuration(UserDict.UserDict):
try:
cfgparser = ConfigParser.ConfigParser()
cfgparser.read(files)
- except:
+ except ConfigParser.Error:
return
section="output"
@@ -253,16 +253,16 @@ class Configuration(UserDict.UserDict):
self.data["log"] = Loggers[log]()
else:
self.warn("invalid log option "+log)
- except: pass
+ except ConfigParser.Error: pass
try:
if cfgparser.getboolean(section, "verbose"):
self.data["verbose"] = 1
self.data["warnings"] = 1
- except: pass
+ except ConfigParser.Error: pass
try: self.data["quiet"] = cfgparser.getboolean(section, "quiet")
- except: pass
+ except ConfigParser.Error: pass
try: self.data["warnings"] = cfgparser.getboolean(section, "warnings")
- except: pass
+ except ConfigParser.Error: pass
section="checking"
try:
@@ -271,29 +271,29 @@ class Configuration(UserDict.UserDict):
self.disableThreads()
else:
self.enableThreads(num)
- except: pass
+ except ConfigParser.Error: pass
try: self.data["anchors"] = cfgparser.getboolean(section, "anchors")
- except: pass
+ except ConfigParser.Error: pass
try:
self.data["proxy"] = cfgparser.get(section, "proxy")
self.data["proxyport"] = cfgparser.getint(section, "proxyport")
- except: pass
+ except ConfigParser.Error: pass
try:
num = cfgparser.getint(section, "recursionlevel")
if num<0:
self.error("illegal recursionlevel number: "+`num`)
self.data["recursionlevel"] = num
- except: pass
+ except ConfigParser.Error: pass
try: self.data["robotstxt"] = cfgparser.getboolean(section, "robotstxt")
- except: pass
+ except ConfigParser.Error: pass
try: self.data["strict"] = cfgparser.getboolean(section, "strict")
- except: pass
+ except ConfigParser.Error: pass
try:
filelist = string.split(cfgparser.get(section, "fileoutput"))
for arg in filelist:
if Loggers.has_key(arg):
self.data["fileoutput"].append(Loggers[arg](open("pylice-out."+arg, "w")))
- except: pass
+ except ConfigParser.Error: pass
section = "authentication"
try:
@@ -304,7 +304,7 @@ class Configuration(UserDict.UserDict):
tuple[0] = re.compile(tuple[0])
self.data["authentication"].append(tuple)
i = i + 1
- except: pass
+ except ConfigParser.Error: pass
self.data["authentication"].append((re.compile(".*"), "anonymous", "guest@"))
section = "filtering"
@@ -315,9 +315,9 @@ class Configuration(UserDict.UserDict):
if len(tuple)!=2: break
self.data["externlinks"].append((re.compile(tuple[0]),
int(tuple[1])))
- except: pass
+ except ConfigParser.Error: pass
try: self.data["internlinks"].append(re.compile(cfgparser.get(section, "internlinks")))
- except: pass
+ except ConfigParser.Error: pass
try: self.data["allowdeny"] = cfgparser.getboolean(section, "allowdeny")
- except: pass
+ except ConfigParser.Error: pass
diff --git a/linkcheck/FileUrlData.py b/linkcheck/FileUrlData.py
index e608eabb..517ec9bc 100644
--- a/linkcheck/FileUrlData.py
+++ b/linkcheck/FileUrlData.py
@@ -9,12 +9,12 @@ class FileUrlData(UrlData):
urlName,
recursionLevel,
parentName = None,
- baseRef = None, line=0, _time=0):
+ baseRef = None, line=0):
UrlData.__init__(self,
urlName,
- recursionLevel,
- parentName,
- baseRef, line, _time)
+ recursionLevel,
+ parentName=parentName,
+ baseRef=baseRef, line=line)
if not parentName and not baseRef and \
not re.compile("^file:").search(self.urlName):
winre = re.compile("^[a-zA-Z]:")
diff --git a/linkcheck/HostCheckingUrlData.py b/linkcheck/HostCheckingUrlData.py
index f8448206..1411ef42 100644
--- a/linkcheck/HostCheckingUrlData.py
+++ b/linkcheck/HostCheckingUrlData.py
@@ -8,9 +8,9 @@ class HostCheckingUrlData(UrlData):
urlName,
recursionLevel,
parentName = None,
- baseRef = None, line=0, _time=0):
- UrlData.__init__(self, urlName, recursionLevel, parentName, baseRef,
- line, _time)
+ baseRef = None, line=0):
+ UrlData.__init__(self, urlName, recursionLevel,
+ parentName=parentName, baseRef=baseRef, line=line)
self.host = None
self.url = urlName
diff --git a/linkcheck/MailtoUrlData.py b/linkcheck/MailtoUrlData.py
index 527a8ef9..b9cc3b85 100644
--- a/linkcheck/MailtoUrlData.py
+++ b/linkcheck/MailtoUrlData.py
@@ -1,14 +1,17 @@
import re,socket,string,DNS,sys
from HostCheckingUrlData import HostCheckingUrlData
from smtplib import SMTP
+from UrlData import LinkCheckException
+mailto_re = re.compile("^mailto:"
+ "([\-\w.]+@[\-\w.?=]+|[\w\s]+<[\-\w.]+@[\-\w.?=]+>)$")
class MailtoUrlData(HostCheckingUrlData):
"Url link with mailto scheme"
def buildUrl(self):
HostCheckingUrlData.buildUrl(self)
- if not re.compile("^mailto:([\-\w.]+@[\-\w.?=]+|[\w\s]+<[\-\w.]+@[\-\w.?=]+>)").match(self.urlName):
- raise Exception, "Illegal mailto link syntax"
+ if not mailto_re.match(self.urlName):
+ raise LinkCheckException, "Illegal mailto link syntax"
self.host = self.urlName[7:]
i = string.find(self.host, "<")
j = string.find(self.host, ">")
@@ -44,7 +47,8 @@ class MailtoUrlData(HostCheckingUrlData):
if smtpconnect: break
if not smtpconnect:
- self.setWarning("None of the mail hosts for "+self.host+" accepts an SMTP connection")
+ self.setWarning("None of the mail hosts for "+self.host+
+ " accepts an SMTP connection, "+value)
mxrecord = mxrecords[0][1]
else:
mxrecord = mxrecord[1]
diff --git a/linkcheck/OutputReader.py b/linkcheck/OutputReader.py
index 1e31f179..04479b7d 100644
--- a/linkcheck/OutputReader.py
+++ b/linkcheck/OutputReader.py
@@ -3,17 +3,19 @@ import UrlData
class ParseException(Exception):
pass
-
-class OutputReader:
- ws = re.compile("\s+")
- regex_realUrl = re.compile("^Real URL.+")
- regex_result = re.compile("^Result.+")
- regex_base = re.compile("^Base.+")
- regex_info = re.compile("^Info.+")
- regex_warning = re.compile("^Warning.+")
- regex_parentUrl = re.compile("^Parent URL.+")
- regex_valid = re.compile("^Valid.*")
+
+ws = re.compile("\s+")
+regex_realUrl = re.compile("^Real URL.+")
+regex_result = re.compile("^Result.+")
+regex_base = re.compile("^Base.+")
+regex_info = re.compile("^Info.+")
+regex_warning = re.compile("^Warning.+")
+regex_parentUrl = re.compile("^Parent URL.+")
+regex_valid = re.compile("^Valid.*")
+
+
+class OutputReader:
def resetState(self):
self.urlName = None
@@ -32,7 +34,7 @@ class OutputReader:
self.resetState()
while line:
- if OutputReader.ws.match(line):
+ if ws.match(line):
if self.state>=2:
#append url
urldata = UrlData.GetUrlDataFrom(self.urlName, 0,
@@ -52,19 +54,19 @@ class OutputReader:
raise ParseException, "No Real URL and Result keyword found"
self.resetState()
- elif OutputReader.regex_realUrl.match(line):
+ elif regex_realUrl.match(line):
self.state = self.state+1
self.urlName = string.strip(line[8:])
- elif OutputReader.regex_result.match(line):
+ elif regex_result.match(line):
self.state = self.state+1
self.result = string.strip(line[6:])
- elif OutputReader.regex_info.match(line):
+ elif regex_info.match(line):
self.info = string.strip(line[4:])
- elif OutputReader.regex_base.match(line):
+ elif regex_base.match(line):
self.baseRef = string.strip(line[4:])
- elif OutputReader.regex_warning.match(line):
+ elif regex_warning.match(line):
self.warning = string.strip(line[7:])
- elif OutputReader.regex_parentUrl.match(line):
+ elif regex_parentUrl.match(line):
self.parentName = string.strip(line[10:])
if ',' in self.parentName:
self.parentName,self.linenumber = string.split(self.parentName,",",1)
diff --git a/linkcheck/TelnetUrlData.py b/linkcheck/TelnetUrlData.py
index bbdf6074..7715f8d4 100644
--- a/linkcheck/TelnetUrlData.py
+++ b/linkcheck/TelnetUrlData.py
@@ -1,13 +1,16 @@
import telnetlib,re
from HostCheckingUrlData import HostCheckingUrlData
+from UrlData import LinkCheckException
+
+telnet_re = re.compile("^telnet:[\w.\-]+$")
class TelnetUrlData(HostCheckingUrlData):
"Url link with telnet scheme"
def buildUrl(self):
HostCheckingUrlData.buildUrl(self)
- if not re.compile("^telnet:[\w.\-]+").match(self.urlName):
- raise Exception, "Illegal telnet link syntax"
+ if not telnet_re.match(self.urlName):
+ raise LinkCheckException, "Illegal telnet link syntax"
self.host = string.lower(self.urlName[7:])
diff --git a/linkcheck/Threader.py b/linkcheck/Threader.py
index 260a22aa..774450b9 100644
--- a/linkcheck/Threader.py
+++ b/linkcheck/Threader.py
@@ -24,8 +24,7 @@ class Threader:
def finish(self):
self.reduceThreads()
- for t in self.threads:
- pass # dont know how to stop a thread
+ # don't know how to stop a thread
def startThread(self, callable, args):
"Generate a new thread"
diff --git a/linkcheck/UrlData.py b/linkcheck/UrlData.py
index a1af381f..9192b543 100644
--- a/linkcheck/UrlData.py
+++ b/linkcheck/UrlData.py
@@ -7,9 +7,13 @@ LinkTags = [("a", "href"),
("body", "background"),
("frame", "src"),
("link", "href"),
- ("meta", "url"), #
+ #
+ ("meta", "url"),
("area", "href")]
+class LinkCheckException(Exception):
+ pass
+
class UrlData:
"Representing a URL with additional information like validity etc"
@@ -100,9 +104,8 @@ class UrlData:
try:
self.buildUrl()
self.extern = self._getExtern(config)
- except:
- type, value = sys.exc_info()[:2]
- self.setError(str(value))
+ except LinkCheckException, msg:
+ self.setError(str(msg)) # record the message text, not the exception object
self.logMe(config)
return
@@ -143,10 +146,11 @@ class UrlData:
def closeConnection(self):
# brute force closing
- try: self.urlConnection.close()
- except: pass
- # release variable for garbage collection
- self.urlConnection = None
+ if self.urlConnection is not None:
+ try: self.urlConnection.close()
+ except: pass
+ # release variable for garbage collection
+ self.urlConnection = None
def putInCache(self, config):
cacheKey = self.getCacheKey()
@@ -276,7 +280,7 @@ from TelnetUrlData import TelnetUrlData
def GetUrlDataFrom(urlName,
recursionLevel,
parentName = None,
- baseRef = None, line = 0, _time = 0):
+ baseRef = None, line = 0):
# search for the absolute url
name=""
if urlName and ":" in urlName:
@@ -287,21 +291,21 @@ def GetUrlDataFrom(urlName,
name = string.lower(parentName)
# test scheme
if re.compile("^http:").search(name):
- return HttpUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
+ return HttpUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^ftp:").search(name):
- return FtpUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
+ return FtpUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^file:").search(name):
- return FileUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
+ return FileUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^telnet:").search(name):
- return TelnetUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
+ return TelnetUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^mailto:").search(name):
- return MailtoUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
+ return MailtoUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^gopher:").search(name):
- return GopherUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
+ return GopherUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^javascript:").search(name):
- return JavascriptUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
+ return JavascriptUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^https:").search(name):
- return HttpsUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
+ return HttpsUrlData(urlName, recursionLevel, parentName, baseRef, line)
# assume local file
- return FileUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
+ return FileUrlData(urlName, recursionLevel, parentName, baseRef, line)