diff --git a/debian/changelog b/debian/changelog
index d737f566..03ecb064 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -5,8 +5,10 @@ linkchecker (1.2.4) unstable; urgency=low
   * fix for HTTP HEAD requests from bad/dumb servers
   * CGI script fixes
   * LinkChecker Online HTML pages added
+  * news: link support
+  * fixed parsing of extern? config file options
 
- -- Bastian Kleineidam  Tue, 11 Jul 2000 18:15:07 +0200
+ -- Bastian Kleineidam  Fri, 14 Jul 2000 05:23:08 +0200
 
 linkchecker (1.2.3) unstable; urgency=low
 
diff --git a/linkcheck/Config.py b/linkcheck/Config.py
index a0f7e2d3..040289a9 100644
--- a/linkcheck/Config.py
+++ b/linkcheck/Config.py
@@ -188,7 +188,6 @@ class Configuration(UserDict.UserDict):
         self.logLock = None
         self.urls = []
         self.threader = None
-        self.connectNntp = self.connectNntp_NoThreads
         self.dataLock = None
 
     def enableThreading(self, num):
@@ -217,7 +216,6 @@ class Configuration(UserDict.UserDict):
         self.logLock = Lock()
         self.urls = Queue.Queue(0)
         self.threader = Threader.Threader(num)
-        self.connectNntp = self.connectNntp_Threads
         self.dataLock = Lock()
 
     def hasMoreUrls_NoThreads(self):
@@ -278,41 +276,12 @@ class Configuration(UserDict.UserDict):
         for log in self.data["fileoutput"]:
             log.endOfOutput(linknumber=self.data['linknumber'])
 
-    def connectNntp_NoThreads(self):
-        if not self.data.has_key("nntp"):
-            self._do_connectNntp()
-
-    def connectNntp_Threads(self):
-        if not self.data.has_key("nntp"):
-            try:
-                self.dataLock.acquire()
-                self._do_connectNntp()
-            finally:
-                self.dataLock.release()
-
     def incrementLinknumber_Threads(self):
         try:
             self.dataLock.acquire()
             self.data['linknumber'] = self.data['linknumber'] + 1
         finally:
             self.dataLock.release()
-
-    def _do_connectNntp(self):
-        """This is done only once per checking task."""
-        import nntplib
-        timeout = 1
-        while timeout:
-            try:
-                self.data["nntp"]=nntplib.NNTP(self.data["nntpserver"] or "")
-                timeout = 0
-            except nntplib.error_perm:
-                value = sys.exc_info()[1]
-                debug("NNTP: "+value+"\n")
-                if re.compile("^505").search(str(value)):
-                    import whrandom
-                    time.sleep(whrandom.randint(10,20))
-                else:
-                    raise
 
     def hasMoreUrls_Threads(self):
         return not self.urls.empty()
@@ -320,7 +289,7 @@ class Configuration(UserDict.UserDict):
     def finished_Threads(self):
         time.sleep(0.1)
         self.threader.reduceThreads()
-        debug("finished?\n")
+        #debug("finished?\n")
         return self.threader.finished() and self.urls.empty()
 
     def finish_Threads(self):
diff --git a/linkcheck/NntpUrlData.py b/linkcheck/NntpUrlData.py
index 044b685d..b48b008d 100644
--- a/linkcheck/NntpUrlData.py
+++ b/linkcheck/NntpUrlData.py
@@ -15,10 +15,10 @@ along with this program; if not, write to the Free Software
 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 """
 
-import re,string,time,nntplib,linkcheck
-from HostCheckingUrlData import HostCheckingUrlData
+import re,string,time,nntplib,urlparse,linkcheck
 from linkcheck import _
-from UrlData import ExcList
+from UrlData import ExcList,UrlData
+debug = linkcheck.Config.debug
 
 ExcList.extend([nntplib.error_reply,
                 nntplib.error_temp,
@@ -26,32 +26,65 @@ ExcList.extend([nntplib.error_reply,
                 nntplib.error_proto,
                ])
 
-nntp_re = re.compile("^news:[\w.\-]+$")
-
-class NntpUrlData(HostCheckingUrlData):
+class NntpUrlData(UrlData):
     "Url link with NNTP scheme"
 
     def buildUrl(self):
-        HostCheckingUrlData.buildUrl(self)
-        if not nntp_re.match(self.urlName):
-            raise linkcheck.error, _("Illegal NNTP link syntax")
-        self.host = string.lower(self.urlName[5:])
+        # use nntp instead of news to comply with the unofficial internet
+        # draft of Alfred Gilman which unifies (s)news and nntp URLs
+        # note: we use this only internally (for parsing and caching)
+        if string.lower(self.urlName[:4])=='news':
+            self.url = 'nntp'+self.urlName[4:]
+        else:
+            self.url = self.urlName
+        self.urlTuple = urlparse.urlparse(self.url)
+        debug("DEBUG: %s\n" % `self.urlTuple`)
 
     def checkConnection(self, config):
-        if not config["nntpserver"]:
-            self.setWarning(_("No NNTP server specified, checked only syntax"))
-        config.connectNntp()
-        nntp = config["nntp"]
-        resp,count,first,last,name = nntp.group(self.host)
-        self.setInfo(_("Group %s has %s articles, range %s to %s") % \
-                     (name, count, first, last))
+        nntpserver = self.urlTuple[1] or config["nntpserver"]
+        if not nntpserver:
+            self.setWarning(_("No NNTP server specified, skipping this URL"))
+            return
+        nntp = self._connectNntp(nntpserver)
+        group = self.urlTuple[2]
+        if group[:1]=='/':
+            group = group[1:]
+        if '@' in group:
+            # request article
+            resp,number,id = nntp.stat("<"+group+">")
+            self.setInfo(_('Articel number %s found' % number))
+        else:
+            # split off trailing articel span
+            group = string.split(group,'/',1)[0]
+            # request group info
+            resp,count,first,last,name = nntp.group(self.urlTuple[2])
+            self.setInfo(_("Group %s has %s articles, range %s to %s") % \
+                         (name, count, first, last))
+
+
+    def _connectNntp(self, nntpserver):
+        """This is done only once per checking task."""
+        timeout = 1
+        while timeout:
+            try:
+                nntp=nntplib.NNTP(nntpserver)
+                timeout = 0
+            except nntplib.error_perm:
+                value = sys.exc_info()[1]
+                debug("NNTP: "+value+"\n")
+                if re.compile("^505").search(str(value)):
+                    import whrandom
+                    time.sleep(whrandom.randint(10,20))
+                else:
+                    raise
+        return nntp
 
     def getCacheKey(self):
-        return "news:"+HostCheckingUrlData.getCacheKey(self)
+        return self.url
 
     def __str__(self):
-        return "NNTP link\n"+HostCheckingUrlData.__str__(self)
+        return "NNTP link\n"+self.urlName
diff --git a/linkcheck/UrlData.py b/linkcheck/UrlData.py
index 62dc244e..e665a559 100644
--- a/linkcheck/UrlData.py
+++ b/linkcheck/UrlData.py
@@ -18,6 +18,7 @@ import sys,re,string,urlparse,urllib,time
 import Config,StringUtil,linkcheck
 from linkcheck import _
+debug = linkcheck.Config.debug
 
 ExcList = [
     IOError,
@@ -136,7 +137,7 @@ class UrlData:
 
     def logMe(self, config):
-        Config.debug("DEBUG: logging url\n")
+        debug("DEBUG: logging url\n")
         config.incrementLinknumber()
         if config["verbose"] or not self.valid or \
            (self.warningString and config["warnings"]):
@@ -144,11 +145,11 @@ class UrlData:
 
     def check(self, config):
-        Config.debug(Config.DebugDelim+"Checking\n"+str(self)+"\n"+\
+        debug(Config.DebugDelim+"Checking\n"+str(self)+"\n"+\
               Config.DebugDelim)
         t = time.time()
         # check syntax
-        Config.debug("DEBUG: checking syntax\n")
+        debug("DEBUG: checking syntax\n")
         if not self.urlName or self.urlName=="":
             self.setError(_("URL is null or empty"))
             self.logMe(config)
@@ -163,7 +164,7 @@ class UrlData:
             return
 
         # check the cache
-        Config.debug("DEBUG: checking cache\n")
+        debug("DEBUG: checking cache\n")
         if config.urlCache_has_key(self.getCacheKey()):
             self.copyFrom(config.urlCache_get(self.getCacheKey()))
             self.cached = 1
@@ -171,14 +172,14 @@ class UrlData:
             return
 
         # apply filter
-        Config.debug("DEBUG: checking filter\n")
+        debug("DEBUG: checking filter\n")
         if self.extern and (config["strict"] or self.extern[1]):
             self.setWarning(_("outside of domain filter, checked only syntax"))
             self.logMe(config)
             return
 
         # check connection
-        Config.debug("DEBUG: checking connection\n")
+        debug("DEBUG: checking connection\n")
         try:
             self.checkConnection(config)
             if self.urlTuple and config["anchors"]:
@@ -190,12 +191,12 @@ class UrlData:
         # check content
         warningregex = config["warningregex"]
         if warningregex and self.valid:
-            Config.debug("DEBUG: checking content\n")
+            debug("DEBUG: checking content\n")
             self.checkContent(warningregex)
 
         self.checktime = time.time() - t
         # check recursion
-        Config.debug("DEBUG: checking recursion\n")
+        debug("DEBUG: checking recursion\n")
         if self.allowsRecursion(config):
             self.parseUrl(config)
         self.closeConnection()
@@ -280,7 +281,7 @@ class UrlData:
         self.data = self.urlConnection.read()
         self.downloadtime = time.time() - t
         self._init_html_comments()
-        Config.debug("DEBUG: comment spans %s\n" % self.html_comments)
+        debug("DEBUG: comment spans %s\n" % self.html_comments)
         return self.data
 
@@ -309,8 +310,8 @@ class UrlData:
 
     def parseUrl(self, config):
-        Config.debug(Config.DebugDelim+"Parsing recursively into\n"+\
-                     str(self)+"\n"+Config.DebugDelim)
+        debug(Config.DebugDelim+"Parsing recursively into\n"+\
+              str(self)+"\n"+Config.DebugDelim)
         # search for a possible base reference
         bases = self.searchInForTag(re.compile(_linkMatcher % ("base", "href"),
                                     re.VERBOSE))
@@ -397,7 +398,7 @@ def GetUrlDataFrom(urlName,
         return JavascriptUrlData(urlName, recursionLevel, parentName, baseRef, line)
     if re.search("^https:", name):
         return HttpsUrlData(urlName, recursionLevel, parentName, baseRef, line)
-    if re.search("^news:", name):
+    if re.search("^(s?news|nntp):", name):
         return NntpUrlData(urlName, recursionLevel, parentName, baseRef, line)
     # assume local file
     return FileUrlData(urlName, recursionLevel, parentName, baseRef, line)
diff --git a/test/news.html b/test/news.html
index 6f3dbfb0..9dbbfef2 100644
--- a/test/news.html
+++ b/test/news.html
@@ -1,6 +1,10 @@
-
-
-
+
+
+
+
+
+
+