Fix minor glitches: catch specific exceptions (ConfigParser.Error, ImportError, LinkCheckException) instead of bare except, drop the unused _time parameter from UrlData constructors, and precompile module-level regexes

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@33 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2000-03-21 11:38:22 +00:00
parent d5360ad526
commit 005d606421
9 changed files with 81 additions and 68 deletions

View file

@ -1,6 +1,7 @@
VERSION=$(shell ./setup.py -V)
HOST=treasure.calvinsplayground.de
PROXY=-P$(HOST):5050
PROXY=
#PROXY=-P$(HOST):5050
#HOST=fsinfo.cs.uni-sb.de
#PROXY=-Pwww-proxy.uni-sb.de:3128
PACKAGE = linkchecker

View file

@ -58,9 +58,9 @@ class Configuration(UserDict.UserDict):
self.urlCache = {}
self.robotsTxtCache = {}
try:
from threading import *
import threading
self.enableThreading(5)
except:
except ImportError:
type, value = sys.exc_info()[:2]
self.disableThreading()
@ -243,7 +243,7 @@ class Configuration(UserDict.UserDict):
try:
cfgparser = ConfigParser.ConfigParser()
cfgparser.read(files)
except:
except ConfigParser.Error:
return
section="output"
@ -253,16 +253,16 @@ class Configuration(UserDict.UserDict):
self.data["log"] = Loggers[log]()
else:
self.warn("invalid log option "+log)
except: pass
except ConfigParser.Error: pass
try:
if cfgparser.getboolean(section, "verbose"):
self.data["verbose"] = 1
self.data["warnings"] = 1
except: pass
except ConfigParser.Error: pass
try: self.data["quiet"] = cfgparser.getboolean(section, "quiet")
except: pass
except ConfigParser.Error: pass
try: self.data["warnings"] = cfgparser.getboolean(section, "warnings")
except: pass
except ConfigParser.Error: pass
section="checking"
try:
@ -271,29 +271,29 @@ class Configuration(UserDict.UserDict):
self.disableThreads()
else:
self.enableThreads(num)
except: pass
except ConfigParser.Error: pass
try: self.data["anchors"] = cfgparser.getboolean(section, "anchors")
except: pass
except ConfigParser.Error: pass
try:
self.data["proxy"] = cfgparser.get(section, "proxy")
self.data["proxyport"] = cfgparser.getint(section, "proxyport")
except: pass
except ConfigParser.Error: pass
try:
num = cfgparser.getint(section, "recursionlevel")
if num<0:
self.error("illegal recursionlevel number: "+`num`)
self.data["recursionlevel"] = num
except: pass
except ConfigParser.Error: pass
try: self.data["robotstxt"] = cfgparser.getboolean(section, "robotstxt")
except: pass
except ConfigParser.Error: pass
try: self.data["strict"] = cfgparser.getboolean(section, "strict")
except: pass
except ConfigParser.Error: pass
try:
filelist = string.split(cfgparser.get(section, "fileoutput"))
for arg in filelist:
if Loggers.has_key(arg):
self.data["fileoutput"].append(Loggers[arg](open("pylice-out."+arg, "w")))
except: pass
except ConfigParser.Error: pass
section = "authentication"
try:
@ -304,7 +304,7 @@ class Configuration(UserDict.UserDict):
tuple[0] = re.compile(tuple[0])
self.data["authentication"].append(tuple)
i = i + 1
except: pass
except ConfigParser.Error: pass
self.data["authentication"].append((re.compile(".*"), "anonymous", "guest@"))
section = "filtering"
@ -315,9 +315,9 @@ class Configuration(UserDict.UserDict):
if len(tuple)!=2: break
self.data["externlinks"].append((re.compile(tuple[0]),
int(tuple[1])))
except: pass
except ConfigParser.Error: pass
try: self.data["internlinks"].append(re.compile(cfgparser.get(section, "internlinks")))
except: pass
except ConfigParser.Error: pass
try: self.data["allowdeny"] = cfgparser.getboolean(section, "allowdeny")
except: pass
except ConfigParser.Error: pass

View file

@ -9,12 +9,12 @@ class FileUrlData(UrlData):
urlName,
recursionLevel,
parentName = None,
baseRef = None, line=0, _time=0):
baseRef = None, line=0):
UrlData.__init__(self,
urlName,
recursionLevel,
parentName,
baseRef, line, _time)
recursionLevel,
parentName=parentName,
baseRef=baseRef, line=line)
if not parentName and not baseRef and \
not re.compile("^file:").search(self.urlName):
winre = re.compile("^[a-zA-Z]:")

View file

@ -8,9 +8,9 @@ class HostCheckingUrlData(UrlData):
urlName,
recursionLevel,
parentName = None,
baseRef = None, line=0, _time=0):
UrlData.__init__(self, urlName, recursionLevel, parentName, baseRef,
line, _time)
baseRef = None, line=0):
UrlData.__init__(self, urlName, recursionLevel,
parentName=parentName, baseRef=baseRef, line=line)
self.host = None
self.url = urlName

View file

@ -1,14 +1,17 @@
import re,socket,string,DNS,sys
from HostCheckingUrlData import HostCheckingUrlData
from smtplib import SMTP
from UrlData import LinkCheckException
mailto_re = re.compile("^mailto:"
"([\-\w.]+@[\-\w.?=]+|[\w\s]+<[\-\w.]+@[\-\w.?=]+>)$")
class MailtoUrlData(HostCheckingUrlData):
"Url link with mailto scheme"
def buildUrl(self):
HostCheckingUrlData.buildUrl(self)
if not re.compile("^mailto:([\-\w.]+@[\-\w.?=]+|[\w\s]+<[\-\w.]+@[\-\w.?=]+>)").match(self.urlName):
raise Exception, "Illegal mailto link syntax"
if not mailto_re.match(self.urlName):
raise LinkCheckException, "Illegal mailto link syntax"
self.host = self.urlName[7:]
i = string.find(self.host, "<")
j = string.find(self.host, ">")
@ -44,7 +47,8 @@ class MailtoUrlData(HostCheckingUrlData):
if smtpconnect: break
if not smtpconnect:
self.setWarning("None of the mail hosts for "+self.host+" accepts an SMTP connection")
self.setWarning("None of the mail hosts for "+self.host+
" accepts an SMTP connection, "+value)
mxrecord = mxrecords[0][1]
else:
mxrecord = mxrecord[1]

View file

@ -3,17 +3,19 @@ import UrlData
class ParseException(Exception):
pass
class OutputReader:
ws = re.compile("\s+")
regex_realUrl = re.compile("^Real URL.+")
regex_result = re.compile("^Result.+")
regex_base = re.compile("^Base.+")
regex_info = re.compile("^Info.+")
regex_warning = re.compile("^Warning.+")
regex_parentUrl = re.compile("^Parent URL.+")
regex_valid = re.compile("^Valid.*")
ws = re.compile("\s+")
regex_realUrl = re.compile("^Real URL.+")
regex_result = re.compile("^Result.+")
regex_base = re.compile("^Base.+")
regex_info = re.compile("^Info.+")
regex_warning = re.compile("^Warning.+")
regex_parentUrl = re.compile("^Parent URL.+")
regex_valid = re.compile("^Valid.*")
class OutputReader:
def resetState(self):
self.urlName = None
@ -32,7 +34,7 @@ class OutputReader:
self.resetState()
while line:
if OutputReader.ws.match(line):
if ws.match(line):
if self.state>=2:
#append url
urldata = UrlData.GetUrlDataFrom(self.urlName, 0,
@ -52,19 +54,19 @@ class OutputReader:
raise ParseException, "No Real URL and Result keyword found"
self.resetState()
elif OutputReader.regex_realUrl.match(line):
elif regex_realUrl.match(line):
self.state = self.state+1
self.urlName = string.strip(line[8:])
elif OutputReader.regex_result.match(line):
elif regex_result.match(line):
self.state = self.state+1
self.result = string.strip(line[6:])
elif OutputReader.regex_info.match(line):
elif regex_info.match(line):
self.info = string.strip(line[4:])
elif OutputReader.regex_base.match(line):
elif regex_base.match(line):
self.baseRef = string.strip(line[4:])
elif OutputReader.regex_warning.match(line):
elif regex_warning.match(line):
self.warning = string.strip(line[7:])
elif OutputReader.regex_parentUrl.match(line):
elif regex_parentUrl.match(line):
self.parentName = string.strip(line[10:])
if ',' in self.parentName:
self.parentName,self.linenumber = string.split(self.parentName,",",1)

View file

@ -1,13 +1,16 @@
import telnetlib,re
from HostCheckingUrlData import HostCheckingUrlData
from UrlData import LinkCheckException
telnet_re = re.compile("^telnet:[\w.\-]+$")
class TelnetUrlData(HostCheckingUrlData):
"Url link with telnet scheme"
def buildUrl(self):
HostCheckingUrlData.buildUrl(self)
if not re.compile("^telnet:[\w.\-]+").match(self.urlName):
raise Exception, "Illegal telnet link syntax"
if not telnet_re.match(self.urlName):
raise LinkCheckException, "Illegal telnet link syntax"
self.host = string.lower(self.urlName[7:])

View file

@ -24,8 +24,7 @@ class Threader:
def finish(self):
self.reduceThreads()
for t in self.threads:
pass # dont know how to stop a thread
pass # dont know how to stop a thread
def startThread(self, callable, args):
"Generate a new thread"

View file

@ -7,9 +7,13 @@ LinkTags = [("a", "href"),
("body", "background"),
("frame", "src"),
("link", "href"),
("meta", "url"), # <meta http-equiv="refresh" content="5; url=...">
# <meta http-equiv="refresh" content="5; url=...">
("meta", "url"),
("area", "href")]
class LinkCheckException(Exception):
pass
class UrlData:
"Representing a URL with additional information like validity etc"
@ -100,9 +104,8 @@ class UrlData:
try:
self.buildUrl()
self.extern = self._getExtern(config)
except:
type, value = sys.exc_info()[:2]
self.setError(str(value))
except LinkCheckException, msg:
self.setError(msg)
self.logMe(config)
return
@ -143,10 +146,11 @@ class UrlData:
def closeConnection(self):
# brute force closing
try: self.urlConnection.close()
except: pass
# release variable for garbage collection
self.urlConnection = None
if self.urlConnection is not None:
try: self.urlConnection.close()
except: pass
# release variable for garbage collection
self.urlConnection = None
def putInCache(self, config):
cacheKey = self.getCacheKey()
@ -276,7 +280,7 @@ from TelnetUrlData import TelnetUrlData
def GetUrlDataFrom(urlName,
recursionLevel,
parentName = None,
baseRef = None, line = 0, _time = 0):
baseRef = None, line = 0):
# search for the absolute url
name=""
if urlName and ":" in urlName:
@ -287,21 +291,21 @@ def GetUrlDataFrom(urlName,
name = string.lower(parentName)
# test scheme
if re.compile("^http:").search(name):
return HttpUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
return HttpUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^ftp:").search(name):
return FtpUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
return FtpUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^file:").search(name):
return FileUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
return FileUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^telnet:").search(name):
return TelnetUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
return TelnetUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^mailto:").search(name):
return MailtoUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
return MailtoUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^gopher:").search(name):
return GopherUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
return GopherUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^javascript:").search(name):
return JavascriptUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
return JavascriptUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^https:").search(name):
return HttpsUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
return HttpsUrlData(urlName, recursionLevel, parentName, baseRef, line)
# assume local file
return FileUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
return FileUrlData(urlName, recursionLevel, parentName, baseRef, line)