mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-26 18:50:32 +00:00
minor glitches
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@33 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
d5360ad526
commit
005d606421
9 changed files with 81 additions and 68 deletions
3
Makefile
3
Makefile
|
|
@ -1,6 +1,7 @@
|
|||
VERSION=$(shell ./setup.py -V)
|
||||
HOST=treasure.calvinsplayground.de
|
||||
PROXY=-P$(HOST):5050
|
||||
PROXY=
|
||||
#PROXY=-P$(HOST):5050
|
||||
#HOST=fsinfo.cs.uni-sb.de
|
||||
#PROXY=-Pwww-proxy.uni-sb.de:3128
|
||||
PACKAGE = linkchecker
|
||||
|
|
|
|||
|
|
@ -58,9 +58,9 @@ class Configuration(UserDict.UserDict):
|
|||
self.urlCache = {}
|
||||
self.robotsTxtCache = {}
|
||||
try:
|
||||
from threading import *
|
||||
import threading
|
||||
self.enableThreading(5)
|
||||
except:
|
||||
except ImportError:
|
||||
type, value = sys.exc_info()[:2]
|
||||
self.disableThreading()
|
||||
|
||||
|
|
@ -243,7 +243,7 @@ class Configuration(UserDict.UserDict):
|
|||
try:
|
||||
cfgparser = ConfigParser.ConfigParser()
|
||||
cfgparser.read(files)
|
||||
except:
|
||||
except ConfigParser.Error:
|
||||
return
|
||||
|
||||
section="output"
|
||||
|
|
@ -253,16 +253,16 @@ class Configuration(UserDict.UserDict):
|
|||
self.data["log"] = Loggers[log]()
|
||||
else:
|
||||
self.warn("invalid log option "+log)
|
||||
except: pass
|
||||
except ConfigParser.Error: pass
|
||||
try:
|
||||
if cfgparser.getboolean(section, "verbose"):
|
||||
self.data["verbose"] = 1
|
||||
self.data["warnings"] = 1
|
||||
except: pass
|
||||
except ConfigParser.Error: pass
|
||||
try: self.data["quiet"] = cfgparser.getboolean(section, "quiet")
|
||||
except: pass
|
||||
except ConfigParser.Error: pass
|
||||
try: self.data["warnings"] = cfgparser.getboolean(section, "warnings")
|
||||
except: pass
|
||||
except ConfigParser.Error: pass
|
||||
|
||||
section="checking"
|
||||
try:
|
||||
|
|
@ -271,29 +271,29 @@ class Configuration(UserDict.UserDict):
|
|||
self.disableThreads()
|
||||
else:
|
||||
self.enableThreads(num)
|
||||
except: pass
|
||||
except ConfigParser.Error: pass
|
||||
try: self.data["anchors"] = cfgparser.getboolean(section, "anchors")
|
||||
except: pass
|
||||
except ConfigParser.Error: pass
|
||||
try:
|
||||
self.data["proxy"] = cfgparser.get(section, "proxy")
|
||||
self.data["proxyport"] = cfgparser.getint(section, "proxyport")
|
||||
except: pass
|
||||
except ConfigParser.Error: pass
|
||||
try:
|
||||
num = cfgparser.getint(section, "recursionlevel")
|
||||
if num<0:
|
||||
self.error("illegal recursionlevel number: "+`num`)
|
||||
self.data["recursionlevel"] = num
|
||||
except: pass
|
||||
except ConfigParser.Error: pass
|
||||
try: self.data["robotstxt"] = cfgparser.getboolean(section, "robotstxt")
|
||||
except: pass
|
||||
except ConfigParser.Error: pass
|
||||
try: self.data["strict"] = cfgparser.getboolean(section, "strict")
|
||||
except: pass
|
||||
except ConfigParser.Error: pass
|
||||
try:
|
||||
filelist = string.split(cfgparser.get(section, "fileoutput"))
|
||||
for arg in filelist:
|
||||
if Loggers.has_key(arg):
|
||||
self.data["fileoutput"].append(Loggers[arg](open("pylice-out."+arg, "w")))
|
||||
except: pass
|
||||
except ConfigParser.Error: pass
|
||||
|
||||
section = "authentication"
|
||||
try:
|
||||
|
|
@ -304,7 +304,7 @@ class Configuration(UserDict.UserDict):
|
|||
tuple[0] = re.compile(tuple[0])
|
||||
self.data["authentication"].append(tuple)
|
||||
i = i + 1
|
||||
except: pass
|
||||
except ConfigParser.Error: pass
|
||||
self.data["authentication"].append((re.compile(".*"), "anonymous", "guest@"))
|
||||
|
||||
section = "filtering"
|
||||
|
|
@ -315,9 +315,9 @@ class Configuration(UserDict.UserDict):
|
|||
if len(tuple)!=2: break
|
||||
self.data["externlinks"].append((re.compile(tuple[0]),
|
||||
int(tuple[1])))
|
||||
except: pass
|
||||
except ConfigParser.Error: pass
|
||||
try: self.data["internlinks"].append(re.compile(cfgparser.get(section, "internlinks")))
|
||||
except: pass
|
||||
except ConfigParser.Error: pass
|
||||
try: self.data["allowdeny"] = cfgparser.getboolean(section, "allowdeny")
|
||||
except: pass
|
||||
except ConfigParser.Error: pass
|
||||
|
||||
|
|
|
|||
|
|
@ -9,12 +9,12 @@ class FileUrlData(UrlData):
|
|||
urlName,
|
||||
recursionLevel,
|
||||
parentName = None,
|
||||
baseRef = None, line=0, _time=0):
|
||||
baseRef = None, line=0):
|
||||
UrlData.__init__(self,
|
||||
urlName,
|
||||
recursionLevel,
|
||||
parentName,
|
||||
baseRef, line, _time)
|
||||
recursionLevel,
|
||||
parentName=parentName,
|
||||
baseRef=baseRef, line=line)
|
||||
if not parentName and not baseRef and \
|
||||
not re.compile("^file:").search(self.urlName):
|
||||
winre = re.compile("^[a-zA-Z]:")
|
||||
|
|
|
|||
|
|
@ -8,9 +8,9 @@ class HostCheckingUrlData(UrlData):
|
|||
urlName,
|
||||
recursionLevel,
|
||||
parentName = None,
|
||||
baseRef = None, line=0, _time=0):
|
||||
UrlData.__init__(self, urlName, recursionLevel, parentName, baseRef,
|
||||
line, _time)
|
||||
baseRef = None, line=0):
|
||||
UrlData.__init__(self, urlName, recursionLevel,
|
||||
parentName=parentName, baseRef=baseRef, line=line)
|
||||
self.host = None
|
||||
self.url = urlName
|
||||
|
||||
|
|
|
|||
|
|
@ -1,14 +1,17 @@
|
|||
import re,socket,string,DNS,sys
|
||||
from HostCheckingUrlData import HostCheckingUrlData
|
||||
from smtplib import SMTP
|
||||
from UrlData import LinkCheckException
|
||||
|
||||
mailto_re = re.compile("^mailto:"
|
||||
"([\-\w.]+@[\-\w.?=]+|[\w\s]+<[\-\w.]+@[\-\w.?=]+>)$")
|
||||
class MailtoUrlData(HostCheckingUrlData):
|
||||
"Url link with mailto scheme"
|
||||
|
||||
def buildUrl(self):
|
||||
HostCheckingUrlData.buildUrl(self)
|
||||
if not re.compile("^mailto:([\-\w.]+@[\-\w.?=]+|[\w\s]+<[\-\w.]+@[\-\w.?=]+>)").match(self.urlName):
|
||||
raise Exception, "Illegal mailto link syntax"
|
||||
if not mailto_re.match(self.urlName):
|
||||
raise LinkCheckException, "Illegal mailto link syntax"
|
||||
self.host = self.urlName[7:]
|
||||
i = string.find(self.host, "<")
|
||||
j = string.find(self.host, ">")
|
||||
|
|
@ -44,7 +47,8 @@ class MailtoUrlData(HostCheckingUrlData):
|
|||
if smtpconnect: break
|
||||
|
||||
if not smtpconnect:
|
||||
self.setWarning("None of the mail hosts for "+self.host+" accepts an SMTP connection")
|
||||
self.setWarning("None of the mail hosts for "+self.host+
|
||||
" accepts an SMTP connection, "+value)
|
||||
mxrecord = mxrecords[0][1]
|
||||
else:
|
||||
mxrecord = mxrecord[1]
|
||||
|
|
|
|||
|
|
@ -3,17 +3,19 @@ import UrlData
|
|||
|
||||
class ParseException(Exception):
|
||||
pass
|
||||
|
||||
class OutputReader:
|
||||
|
||||
ws = re.compile("\s+")
|
||||
regex_realUrl = re.compile("^Real URL.+")
|
||||
regex_result = re.compile("^Result.+")
|
||||
regex_base = re.compile("^Base.+")
|
||||
regex_info = re.compile("^Info.+")
|
||||
regex_warning = re.compile("^Warning.+")
|
||||
regex_parentUrl = re.compile("^Parent URL.+")
|
||||
regex_valid = re.compile("^Valid.*")
|
||||
|
||||
ws = re.compile("\s+")
|
||||
regex_realUrl = re.compile("^Real URL.+")
|
||||
regex_result = re.compile("^Result.+")
|
||||
regex_base = re.compile("^Base.+")
|
||||
regex_info = re.compile("^Info.+")
|
||||
regex_warning = re.compile("^Warning.+")
|
||||
regex_parentUrl = re.compile("^Parent URL.+")
|
||||
regex_valid = re.compile("^Valid.*")
|
||||
|
||||
|
||||
class OutputReader:
|
||||
|
||||
def resetState(self):
|
||||
self.urlName = None
|
||||
|
|
@ -32,7 +34,7 @@ class OutputReader:
|
|||
self.resetState()
|
||||
|
||||
while line:
|
||||
if OutputReader.ws.match(line):
|
||||
if ws.match(line):
|
||||
if self.state>=2:
|
||||
#append url
|
||||
urldata = UrlData.GetUrlDataFrom(self.urlName, 0,
|
||||
|
|
@ -52,19 +54,19 @@ class OutputReader:
|
|||
raise ParseException, "No Real URL and Result keyword found"
|
||||
self.resetState()
|
||||
|
||||
elif OutputReader.regex_realUrl.match(line):
|
||||
elif regex_realUrl.match(line):
|
||||
self.state = self.state+1
|
||||
self.urlName = string.strip(line[8:])
|
||||
elif OutputReader.regex_result.match(line):
|
||||
elif regex_result.match(line):
|
||||
self.state = self.state+1
|
||||
self.result = string.strip(line[6:])
|
||||
elif OutputReader.regex_info.match(line):
|
||||
elif regex_info.match(line):
|
||||
self.info = string.strip(line[4:])
|
||||
elif OutputReader.regex_base.match(line):
|
||||
elif regex_base.match(line):
|
||||
self.baseRef = string.strip(line[4:])
|
||||
elif OutputReader.regex_warning.match(line):
|
||||
elif regex_warning.match(line):
|
||||
self.warning = string.strip(line[7:])
|
||||
elif OutputReader.regex_parentUrl.match(line):
|
||||
elif regex_parentUrl.match(line):
|
||||
self.parentName = string.strip(line[10:])
|
||||
if ',' in self.parentName:
|
||||
self.parentName,self.linenumber = string.split(self.parentName,",",1)
|
||||
|
|
|
|||
|
|
@ -1,13 +1,16 @@
|
|||
import telnetlib,re
|
||||
from HostCheckingUrlData import HostCheckingUrlData
|
||||
from UrlData import LinkCheckException
|
||||
|
||||
telnet_re = re.compile("^telnet:[\w.\-]+$")
|
||||
|
||||
class TelnetUrlData(HostCheckingUrlData):
|
||||
"Url link with telnet scheme"
|
||||
|
||||
def buildUrl(self):
|
||||
HostCheckingUrlData.buildUrl(self)
|
||||
if not re.compile("^telnet:[\w.\-]+").match(self.urlName):
|
||||
raise Exception, "Illegal telnet link syntax"
|
||||
if not telnet_re.match(self.urlName):
|
||||
raise LinkCheckException, "Illegal telnet link syntax"
|
||||
self.host = string.lower(self.urlName[7:])
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -24,8 +24,7 @@ class Threader:
|
|||
|
||||
def finish(self):
|
||||
self.reduceThreads()
|
||||
for t in self.threads:
|
||||
pass # dont know how to stop a thread
|
||||
# dont know how to stop a thread
|
||||
|
||||
def startThread(self, callable, args):
|
||||
"Generate a new thread"
|
||||
|
|
|
|||
|
|
@ -7,9 +7,13 @@ LinkTags = [("a", "href"),
|
|||
("body", "background"),
|
||||
("frame", "src"),
|
||||
("link", "href"),
|
||||
("meta", "url"), # <meta http-equiv="refresh" content="5; url=...">
|
||||
# <meta http-equiv="refresh" content="5; url=...">
|
||||
("meta", "url"),
|
||||
("area", "href")]
|
||||
|
||||
class LinkCheckException(Exception):
|
||||
pass
|
||||
|
||||
class UrlData:
|
||||
"Representing a URL with additional information like validity etc"
|
||||
|
||||
|
|
@ -100,9 +104,8 @@ class UrlData:
|
|||
try:
|
||||
self.buildUrl()
|
||||
self.extern = self._getExtern(config)
|
||||
except:
|
||||
type, value = sys.exc_info()[:2]
|
||||
self.setError(str(value))
|
||||
except LinkCheckerException, msg:
|
||||
self.setError(msg)
|
||||
self.logMe(config)
|
||||
return
|
||||
|
||||
|
|
@ -143,10 +146,11 @@ class UrlData:
|
|||
|
||||
def closeConnection(self):
|
||||
# brute force closing
|
||||
try: self.urlConnection.close()
|
||||
except: pass
|
||||
# release variable for garbage collection
|
||||
self.urlConnection = None
|
||||
if self.urlConnection is not None:
|
||||
try: self.urlConnection.close()
|
||||
except: pass
|
||||
# release variable for garbage collection
|
||||
self.urlConnection = None
|
||||
|
||||
def putInCache(self, config):
|
||||
cacheKey = self.getCacheKey()
|
||||
|
|
@ -276,7 +280,7 @@ from TelnetUrlData import TelnetUrlData
|
|||
def GetUrlDataFrom(urlName,
|
||||
recursionLevel,
|
||||
parentName = None,
|
||||
baseRef = None, line = 0, _time = 0):
|
||||
baseRef = None, line = 0):
|
||||
# search for the absolute url
|
||||
name=""
|
||||
if urlName and ":" in urlName:
|
||||
|
|
@ -287,21 +291,21 @@ def GetUrlDataFrom(urlName,
|
|||
name = string.lower(parentName)
|
||||
# test scheme
|
||||
if re.compile("^http:").search(name):
|
||||
return HttpUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
|
||||
return HttpUrlData(urlName, recursionLevel, parentName, baseRef, line)
|
||||
if re.compile("^ftp:").search(name):
|
||||
return FtpUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
|
||||
return FtpUrlData(urlName, recursionLevel, parentName, baseRef, line)
|
||||
if re.compile("^file:").search(name):
|
||||
return FileUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
|
||||
return FileUrlData(urlName, recursionLevel, parentName, baseRef, line)
|
||||
if re.compile("^telnet:").search(name):
|
||||
return TelnetUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
|
||||
return TelnetUrlData(urlName, recursionLevel, parentName, baseRef, line)
|
||||
if re.compile("^mailto:").search(name):
|
||||
return MailtoUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
|
||||
return MailtoUrlData(urlName, recursionLevel, parentName, baseRef, line)
|
||||
if re.compile("^gopher:").search(name):
|
||||
return GopherUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
|
||||
return GopherUrlData(urlName, recursionLevel, parentName, baseRef, line)
|
||||
if re.compile("^javascript:").search(name):
|
||||
return JavascriptUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
|
||||
return JavascriptUrlData(urlName, recursionLevel, parentName, baseRef, line)
|
||||
if re.compile("^https:").search(name):
|
||||
return HttpsUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
|
||||
return HttpsUrlData(urlName, recursionLevel, parentName, baseRef, line)
|
||||
# assume local file
|
||||
return FileUrlData(urlName, recursionLevel, parentName, baseRef, line, _time)
|
||||
return FileUrlData(urlName, recursionLevel, parentName, baseRef, line)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue