nntp: support

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@136 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2000-07-14 10:57:25 +00:00
parent dd4544ed81
commit 2ded7acf7f
5 changed files with 76 additions and 67 deletions

4
debian/changelog vendored
View file

@ -5,8 +5,10 @@ linkchecker (1.2.4) unstable; urgency=low
* fix for HTTP HEAD requests from bad/dumb servers
* CGI script fixes
* LinkChecker Online HTML pages added
* news: link support
* fixed parsing of extern? config file options
-- Bastian Kleineidam <calvin@users.sourceforge.net> Tue, 11 Jul 2000 18:15:07 +0200
-- Bastian Kleineidam <calvin@users.sourceforge.net> Fri, 14 Jul 2000 05:23:08 +0200
linkchecker (1.2.3) unstable; urgency=low

View file

@ -188,7 +188,6 @@ class Configuration(UserDict.UserDict):
self.logLock = None
self.urls = []
self.threader = None
self.connectNntp = self.connectNntp_NoThreads
self.dataLock = None
def enableThreading(self, num):
@ -217,7 +216,6 @@ class Configuration(UserDict.UserDict):
self.logLock = Lock()
self.urls = Queue.Queue(0)
self.threader = Threader.Threader(num)
self.connectNntp = self.connectNntp_Threads
self.dataLock = Lock()
def hasMoreUrls_NoThreads(self):
@ -278,41 +276,12 @@ class Configuration(UserDict.UserDict):
for log in self.data["fileoutput"]:
log.endOfOutput(linknumber=self.data['linknumber'])
def connectNntp_NoThreads(self):
    """Open the shared NNTP connection in single-threaded mode.

    Connects at most once: if "nntp" is already cached in self.data the
    existing connection is reused.  No locking is needed because this
    variant is installed only when threading is disabled (see the
    assignment to self.connectNntp in the no-threads setup).
    """
    if not self.data.has_key("nntp"):
        self._do_connectNntp()
def connectNntp_Threads(self):
    """Open the shared NNTP connection in multi-threaded mode.

    Thread-safe variant of connectNntp_NoThreads: the connect happens
    under self.dataLock so two checker threads cannot open two
    connections at once.
    """
    if not self.data.has_key("nntp"):
        # Acquire *before* the try block: if acquire() itself raised we
        # must not release() a lock we never held.
        self.dataLock.acquire()
        try:
            # Re-check under the lock: another thread may have connected
            # while we were waiting for it (the unlocked check above is
            # only a fast path).
            if not self.data.has_key("nntp"):
                self._do_connectNntp()
        finally:
            self.dataLock.release()
def incrementLinknumber_Threads(self):
    """Increment the shared link counter under the data lock.

    self.data['linknumber'] is shared between checker threads, so the
    read-modify-write must be serialized.
    """
    # Acquire *before* the try block: if acquire() raised we must not
    # release() a lock we never held.
    self.dataLock.acquire()
    try:
        self.data['linknumber'] = self.data['linknumber'] + 1
    finally:
        self.dataLock.release()
def _do_connectNntp(self):
    """Open the NNTP connection and cache it in self.data["nntp"].

    This is done only once per checking task.  While the server answers
    a permission error whose text starts with "505" the connect is
    retried after a random pause; any other permission error is
    re-raised.
    """
    import nntplib
    timeout = 1
    while timeout:
        try:
            self.data["nntp"] = nntplib.NNTP(self.data["nntpserver"] or "")
            timeout = 0
        except nntplib.error_perm:
            value = sys.exc_info()[1]
            # str() the exception value: it need not be a string object
            # (the regex line below already says str(value)), and
            # "NNTP: "+value would raise TypeError and mask the error.
            debug("NNTP: "+str(value)+"\n")
            if re.compile("^505").search(str(value)):
                import whrandom
                time.sleep(whrandom.randint(10,20))
            else:
                raise
def hasMoreUrls_Threads(self):
    """Return true while the shared URL queue still holds entries."""
    queue_drained = self.urls.empty()
    return not queue_drained
@ -320,7 +289,7 @@ class Configuration(UserDict.UserDict):
def finished_Threads(self):
    """Return whether all checker threads are done and no URLs remain.

    Sleeps briefly first so worker threads get a chance to run before
    the poll, then lets the threader reap finished threads.
    """
    time.sleep(0.1)
    self.threader.reduceThreads()
    # SOURCE showed both the old live debug("finished?\n") call and this
    # commented-out replacement (diff residue); keep only the new form.
    #debug("finished?\n")
    return self.threader.finished() and self.urls.empty()
def finish_Threads(self):

View file

@ -15,10 +15,10 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
import re,string,time,nntplib,linkcheck
from HostCheckingUrlData import HostCheckingUrlData
import re,string,time,nntplib,urlparse,linkcheck
from linkcheck import _
from UrlData import ExcList
from UrlData import ExcList,UrlData
debug = linkcheck.Config.debug
ExcList.extend([nntplib.error_reply,
nntplib.error_temp,
@ -26,32 +26,65 @@ ExcList.extend([nntplib.error_reply,
nntplib.error_proto,
])
nntp_re = re.compile("^news:[\w.\-]+$")
class NntpUrlData(HostCheckingUrlData):
class NntpUrlData(UrlData):
"Url link with NNTP scheme"
def buildUrl(self):
    # NOTE(review): this span interleaves removed and added lines of a
    # rendered diff.  The next call and the self.host assignment below
    # look like the *old* HostCheckingUrlData-based implementation (the
    # class no longer derives from it) -- verify against the repository
    # which lines actually survive in the new version.
    HostCheckingUrlData.buildUrl(self)
    # NOTE(review): this regex rejects nntp://host/... forms that the
    # new checkConnection clearly expects -- confirm whether this check
    # was kept, changed or removed.
    if not nntp_re.match(self.urlName):
        raise linkcheck.error, _("Illegal NNTP link syntax")
    self.host = string.lower(self.urlName[5:])
    # use nntp instead of news to comply with the unofficial internet
    # draft of Alfred Gilman which unifies (s)news and nntp URLs
    # note: we use this only internally (for parsing and caching)
    if string.lower(self.urlName[:4])=='news':
        self.url = 'nntp'+self.urlName[4:]
    else:
        self.url = self.urlName
    # parse the normalized URL; urlTuple[1] is the netloc (server) and
    # urlTuple[2] the path, both used by checkConnection
    self.urlTuple = urlparse.urlparse(self.url)
    debug("DEBUG: %s\n" % `self.urlTuple`)
def checkConnection(self, config):
    """Check the news URL over NNTP.

    The server is taken from the URL itself (nntp://server/...) with the
    configured "nntpserver" as fallback; without any server the URL is
    skipped with a warning.  A path containing '@' is treated as an
    article message id and checked with STAT, otherwise the group name
    (with any trailing article span such as /1-5 stripped) is queried
    with GROUP.
    """
    # The stale pre-commit lines (config.connectNntp()/self.host based)
    # that the rendered diff interleaved here are dropped.
    nntpserver = self.urlTuple[1] or config["nntpserver"]
    if not nntpserver:
        self.setWarning(_("No NNTP server specified, skipping this URL"))
        return
    nntp = self._connectNntp(nntpserver)
    group = self.urlTuple[2]
    # strip the leading slash of the path component
    if group[:1]=='/':
        group = group[1:]
    if '@' in group:
        # request the article by message id
        resp,number,id = nntp.stat("<"+group+">")
        # fixed typo "Articel" and moved the % formatting outside _()
        # so the literal format string is looked up for translation
        self.setInfo(_("Article number %s found") % number)
    else:
        # split off trailing article span (e.g. "group/1-5")
        group = string.split(group,'/',1)[0]
        # query the *stripped* group name -- the original passed the raw
        # self.urlTuple[2], silently ignoring the stripping above
        resp,count,first,last,name = nntp.group(group)
        self.setInfo(_("Group %s has %s articles, range %s to %s") % \
            (name, count, first, last))
def _connectNntp(self, nntpserver):
    """Connect to the given NNTP server and return the connection.

    While the server answers a permission error whose text starts with
    "505" the connect is retried after a random pause; any other
    permission error is re-raised.
    """
    # Local import: this module's imports (re,string,time,nntplib,
    # urlparse,linkcheck) do not include sys, so sys.exc_info() below
    # would otherwise raise a NameError on the error path.
    import sys
    timeout = 1
    while timeout:
        try:
            nntp = nntplib.NNTP(nntpserver)
            timeout = 0
        except nntplib.error_perm:
            value = sys.exc_info()[1]
            # str() the exception value: it need not be a string object,
            # and "NNTP: "+value would raise TypeError and mask the error
            debug("NNTP: "+str(value)+"\n")
            if re.compile("^505").search(str(value)):
                import whrandom
                time.sleep(whrandom.randint(10,20))
            else:
                raise
    return nntp
def getCacheKey(self):
    """Return the cache key for this URL.

    Uses the normalized self.url (news:/snews: rewritten to nntp: in
    buildUrl) so equivalent spellings share one cache entry.
    """
    # The stale 'return "news:"+HostCheckingUrlData.getCacheKey(self)'
    # line from the old implementation (diff residue) is dropped; it
    # made this return unreachable.
    return self.url
def __str__(self):
    """Return a log representation: the type tag plus the original URL."""
    # The stale HostCheckingUrlData.__str__-based return from the old
    # implementation (diff residue) is dropped; it shadowed this line.
    return "NNTP link\n"+self.urlName

View file

@ -18,6 +18,7 @@
import sys,re,string,urlparse,urllib,time
import Config,StringUtil,linkcheck
from linkcheck import _
debug = linkcheck.Config.debug
ExcList = [
IOError,
@ -136,7 +137,7 @@ class UrlData:
def logMe(self, config):
Config.debug("DEBUG: logging url\n")
debug("DEBUG: logging url\n")
config.incrementLinknumber()
if config["verbose"] or not self.valid or \
(self.warningString and config["warnings"]):
@ -144,11 +145,11 @@ class UrlData:
def check(self, config):
Config.debug(Config.DebugDelim+"Checking\n"+str(self)+"\n"+\
debug(Config.DebugDelim+"Checking\n"+str(self)+"\n"+\
Config.DebugDelim)
t = time.time()
# check syntax
Config.debug("DEBUG: checking syntax\n")
debug("DEBUG: checking syntax\n")
if not self.urlName or self.urlName=="":
self.setError(_("URL is null or empty"))
self.logMe(config)
@ -163,7 +164,7 @@ class UrlData:
return
# check the cache
Config.debug("DEBUG: checking cache\n")
debug("DEBUG: checking cache\n")
if config.urlCache_has_key(self.getCacheKey()):
self.copyFrom(config.urlCache_get(self.getCacheKey()))
self.cached = 1
@ -171,14 +172,14 @@ class UrlData:
return
# apply filter
Config.debug("DEBUG: checking filter\n")
debug("DEBUG: checking filter\n")
if self.extern and (config["strict"] or self.extern[1]):
self.setWarning(_("outside of domain filter, checked only syntax"))
self.logMe(config)
return
# check connection
Config.debug("DEBUG: checking connection\n")
debug("DEBUG: checking connection\n")
try:
self.checkConnection(config)
if self.urlTuple and config["anchors"]:
@ -190,12 +191,12 @@ class UrlData:
# check content
warningregex = config["warningregex"]
if warningregex and self.valid:
Config.debug("DEBUG: checking content\n")
debug("DEBUG: checking content\n")
self.checkContent(warningregex)
self.checktime = time.time() - t
# check recursion
Config.debug("DEBUG: checking recursion\n")
debug("DEBUG: checking recursion\n")
if self.allowsRecursion(config):
self.parseUrl(config)
self.closeConnection()
@ -280,7 +281,7 @@ class UrlData:
self.data = self.urlConnection.read()
self.downloadtime = time.time() - t
self._init_html_comments()
Config.debug("DEBUG: comment spans %s\n" % self.html_comments)
debug("DEBUG: comment spans %s\n" % self.html_comments)
return self.data
@ -309,8 +310,8 @@ class UrlData:
def parseUrl(self, config):
Config.debug(Config.DebugDelim+"Parsing recursively into\n"+\
str(self)+"\n"+Config.DebugDelim)
debug(Config.DebugDelim+"Parsing recursively into\n"+\
str(self)+"\n"+Config.DebugDelim)
# search for a possible base reference
bases = self.searchInForTag(re.compile(_linkMatcher % ("base",
"href"), re.VERBOSE))
@ -397,7 +398,7 @@ def GetUrlDataFrom(urlName,
return JavascriptUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.search("^https:", name):
return HttpsUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.search("^news:", name):
if re.search("^(s?news|nntp):", name):
return NntpUrlData(urlName, recursionLevel, parentName, baseRef, line)
# assume local file
return FileUrlData(urlName, recursionLevel, parentName, baseRef, line)

View file

@ -1,6 +1,10 @@
<a href="news:comp.os.linux.misc">
<a href="news:de.comp.os.unix.linux.misc">
<a href="news:comp.lang.python">
<a href="news:ist.garantiert.nix">
<a href="snews:de.comp.os.unix.linux.misc">
<a href="news:">
<a href="news:§$%&/´`(§%">
<a href="nntp://news.rz.uni-sb.de/comp.lang.python">
<a href="nntp://news.rz.uni-sb.de/comp.lang.python/1-5">
<a href="nntp://news.rz.uni-sb.de/EFGJG4.7A@deshaw.com">
<a href="nntp://news.rz.uni-sb.de/">
<a href="news:comp.lang.python/1-5">