nntp: support

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@136 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2000-07-14 10:57:25 +00:00
parent dd4544ed81
commit 2ded7acf7f
5 changed files with 76 additions and 67 deletions

4
debian/changelog vendored
View file

@ -5,8 +5,10 @@ linkchecker (1.2.4) unstable; urgency=low
* fix for HTTP HEAD requests from bad/dumb servers
* CGI script fixes
* LinkChecker Online HTML pages added
* news: link support
* fixed parsing of extern? config file options
-- Bastian Kleineidam <calvin@users.sourceforge.net> Tue, 11 Jul 2000 18:15:07 +0200
-- Bastian Kleineidam <calvin@users.sourceforge.net> Fri, 14 Jul 2000 05:23:08 +0200
linkchecker (1.2.3) unstable; urgency=low

View file

@ -188,7 +188,6 @@ class Configuration(UserDict.UserDict):
self.logLock = None
self.urls = []
self.threader = None
self.connectNntp = self.connectNntp_NoThreads
self.dataLock = None
def enableThreading(self, num):
@ -217,7 +216,6 @@ class Configuration(UserDict.UserDict):
self.logLock = Lock()
self.urls = Queue.Queue(0)
self.threader = Threader.Threader(num)
self.connectNntp = self.connectNntp_Threads
self.dataLock = Lock()
def hasMoreUrls_NoThreads(self):
@ -278,41 +276,12 @@ class Configuration(UserDict.UserDict):
for log in self.data["fileoutput"]:
log.endOfOutput(linknumber=self.data['linknumber'])
def connectNntp_NoThreads(self):
    """Open the shared NNTP connection in single-threaded mode.

    Connects at most once: if "nntp" is already cached in self.data the
    existing connection is reused.  No locking is needed because this
    variant is installed only when threading is disabled (see the
    assignment to self.connectNntp in the no-threads setup).
    """
    if not self.data.has_key("nntp"):
        self._do_connectNntp()
def connectNntp_Threads(self):
    """Open the shared NNTP connection in multi-threaded mode.

    Thread-safe variant of connectNntp_NoThreads: the connect happens
    under self.dataLock so two checker threads cannot open two
    connections at once.
    """
    if not self.data.has_key("nntp"):
        # Acquire *before* the try block: if acquire() itself raised we
        # must not release() a lock we never held.
        self.dataLock.acquire()
        try:
            # Re-check under the lock: another thread may have connected
            # while we were waiting for it (the unlocked check above is
            # only a fast path).
            if not self.data.has_key("nntp"):
                self._do_connectNntp()
        finally:
            self.dataLock.release()
def incrementLinknumber_Threads(self):
    """Increment the shared link counter under the data lock.

    self.data['linknumber'] is shared between checker threads, so the
    read-modify-write must be serialized.
    """
    # Acquire *before* the try block: if acquire() raised we must not
    # release() a lock we never held.
    self.dataLock.acquire()
    try:
        self.data['linknumber'] = self.data['linknumber'] + 1
    finally:
        self.dataLock.release()
def _do_connectNntp(self):
    """Open the NNTP connection and cache it in self.data["nntp"].

    This is done only once per checking task.  While the server answers
    a permission error whose text starts with "505" the connect is
    retried after a random pause; any other permission error is
    re-raised.
    """
    import nntplib
    timeout = 1
    while timeout:
        try:
            self.data["nntp"] = nntplib.NNTP(self.data["nntpserver"] or "")
            timeout = 0
        except nntplib.error_perm:
            value = sys.exc_info()[1]
            # str() the exception value: it need not be a string object
            # (the regex line below already says str(value)), and
            # "NNTP: "+value would raise TypeError and mask the error.
            debug("NNTP: "+str(value)+"\n")
            if re.compile("^505").search(str(value)):
                import whrandom
                time.sleep(whrandom.randint(10,20))
            else:
                raise
def hasMoreUrls_Threads(self):
    """Return true while the shared URL queue still holds entries."""
    queue_drained = self.urls.empty()
    return not queue_drained
@ -320,7 +289,7 @@ class Configuration(UserDict.UserDict):
def finished_Threads(self):
    """Return whether all checker threads are done and no URLs remain.

    Sleeps briefly first so worker threads get a chance to run before
    the poll, then lets the threader reap finished threads.
    """
    time.sleep(0.1)
    self.threader.reduceThreads()
    # SOURCE showed both the old live debug("finished?\n") call and this
    # commented-out replacement (diff residue); keep only the new form.
    #debug("finished?\n")
    return self.threader.finished() and self.urls.empty()
def finish_Threads(self):

View file

@ -15,10 +15,10 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
import re,string,time,nntplib,linkcheck
from HostCheckingUrlData import HostCheckingUrlData
import re,string,time,nntplib,urlparse,linkcheck
from linkcheck import _
from UrlData import ExcList
from UrlData import ExcList,UrlData
debug = linkcheck.Config.debug
ExcList.extend([nntplib.error_reply,
nntplib.error_temp,
@ -26,32 +26,65 @@ ExcList.extend([nntplib.error_reply,
nntplib.error_proto,
])
nntp_re = re.compile("^news:[\w.\-]+$")
class NntpUrlData(HostCheckingUrlData):
class NntpUrlData(UrlData):
"Url link with NNTP scheme"
def buildUrl(self):
    # NOTE(review): this span interleaves removed and added lines of a
    # rendered diff.  The next call and the self.host assignment below
    # look like the *old* HostCheckingUrlData-based implementation (the
    # class no longer derives from it) -- verify against the repository
    # which lines actually survive in the new version.
    HostCheckingUrlData.buildUrl(self)
    # NOTE(review): this regex rejects nntp://host/... forms that the
    # new checkConnection clearly expects -- confirm whether this check
    # was kept, changed or removed.
    if not nntp_re.match(self.urlName):
        raise linkcheck.error, _("Illegal NNTP link syntax")
    self.host = string.lower(self.urlName[5:])
    # use nntp instead of news to comply with the unofficial internet
    # draft of Alfred Gilman which unifies (s)news and nntp URLs
    # note: we use this only internally (for parsing and caching)
    if string.lower(self.urlName[:4])=='news':
        self.url = 'nntp'+self.urlName[4:]
    else:
        self.url = self.urlName
    # parse the normalized URL; urlTuple[1] is the netloc (server) and
    # urlTuple[2] the path, both used by checkConnection
    self.urlTuple = urlparse.urlparse(self.url)
    debug("DEBUG: %s\n" % `self.urlTuple`)
def checkConnection(self, config):
    """Check the news URL over NNTP.

    The server is taken from the URL itself (nntp://server/...) with the
    configured "nntpserver" as fallback; without any server the URL is
    skipped with a warning.  A path containing '@' is treated as an
    article message id and checked with STAT, otherwise the group name
    (with any trailing article span such as /1-5 stripped) is queried
    with GROUP.
    """
    # The stale pre-commit lines (config.connectNntp()/self.host based)
    # that the rendered diff interleaved here are dropped.
    nntpserver = self.urlTuple[1] or config["nntpserver"]
    if not nntpserver:
        self.setWarning(_("No NNTP server specified, skipping this URL"))
        return
    nntp = self._connectNntp(nntpserver)
    group = self.urlTuple[2]
    # strip the leading slash of the path component
    if group[:1]=='/':
        group = group[1:]
    if '@' in group:
        # request the article by message id
        resp,number,id = nntp.stat("<"+group+">")
        # fixed typo "Articel" and moved the % formatting outside _()
        # so the literal format string is looked up for translation
        self.setInfo(_("Article number %s found") % number)
    else:
        # split off trailing article span (e.g. "group/1-5")
        group = string.split(group,'/',1)[0]
        # query the *stripped* group name -- the original passed the raw
        # self.urlTuple[2], silently ignoring the stripping above
        resp,count,first,last,name = nntp.group(group)
        self.setInfo(_("Group %s has %s articles, range %s to %s") % \
            (name, count, first, last))
def _connectNntp(self, nntpserver):
    """Connect to the given NNTP server and return the connection.

    While the server answers a permission error whose text starts with
    "505" the connect is retried after a random pause; any other
    permission error is re-raised.
    """
    # Local import: this module's imports (re,string,time,nntplib,
    # urlparse,linkcheck) do not include sys, so sys.exc_info() below
    # would otherwise raise a NameError on the error path.
    import sys
    timeout = 1
    while timeout:
        try:
            nntp = nntplib.NNTP(nntpserver)
            timeout = 0
        except nntplib.error_perm:
            value = sys.exc_info()[1]
            # str() the exception value: it need not be a string object,
            # and "NNTP: "+value would raise TypeError and mask the error
            debug("NNTP: "+str(value)+"\n")
            if re.compile("^505").search(str(value)):
                import whrandom
                time.sleep(whrandom.randint(10,20))
            else:
                raise
    return nntp
def getCacheKey(self):
    """Return the cache key for this URL.

    Uses the normalized self.url (news:/snews: rewritten to nntp: in
    buildUrl) so equivalent spellings share one cache entry.
    """
    # The stale 'return "news:"+HostCheckingUrlData.getCacheKey(self)'
    # line from the old implementation (diff residue) is dropped; it
    # made this return unreachable.
    return self.url
def __str__(self):
    """Return a log representation: the type tag plus the original URL."""
    # The stale HostCheckingUrlData.__str__-based return from the old
    # implementation (diff residue) is dropped; it shadowed this line.
    return "NNTP link\n"+self.urlName

View file

@ -18,6 +18,7 @@
import sys,re,string,urlparse,urllib,time
import Config,StringUtil,linkcheck
from linkcheck import _
debug = linkcheck.Config.debug
ExcList = [
IOError,
@ -136,7 +137,7 @@ class UrlData:
def logMe(self, config):
Config.debug("DEBUG: logging url\n")
debug("DEBUG: logging url\n")
config.incrementLinknumber()
if config["verbose"] or not self.valid or \
(self.warningString and config["warnings"]):
@ -144,11 +145,11 @@ class UrlData:
def check(self, config):
Config.debug(Config.DebugDelim+"Checking\n"+str(self)+"\n"+\
debug(Config.DebugDelim+"Checking\n"+str(self)+"\n"+\
Config.DebugDelim)
t = time.time()
# check syntax
Config.debug("DEBUG: checking syntax\n")
debug("DEBUG: checking syntax\n")
if not self.urlName or self.urlName=="":
self.setError(_("URL is null or empty"))
self.logMe(config)
@ -163,7 +164,7 @@ class UrlData:
return
# check the cache
Config.debug("DEBUG: checking cache\n")
debug("DEBUG: checking cache\n")
if config.urlCache_has_key(self.getCacheKey()):
self.copyFrom(config.urlCache_get(self.getCacheKey()))
self.cached = 1
@ -171,14 +172,14 @@ class UrlData:
return
# apply filter
Config.debug("DEBUG: checking filter\n")
debug("DEBUG: checking filter\n")
if self.extern and (config["strict"] or self.extern[1]):
self.setWarning(_("outside of domain filter, checked only syntax"))
self.logMe(config)
return
# check connection
Config.debug("DEBUG: checking connection\n")
debug("DEBUG: checking connection\n")
try:
self.checkConnection(config)
if self.urlTuple and config["anchors"]:
@ -190,12 +191,12 @@ class UrlData:
# check content
warningregex = config["warningregex"]
if warningregex and self.valid:
Config.debug("DEBUG: checking content\n")
debug("DEBUG: checking content\n")
self.checkContent(warningregex)
self.checktime = time.time() - t
# check recursion
Config.debug("DEBUG: checking recursion\n")
debug("DEBUG: checking recursion\n")
if self.allowsRecursion(config):
self.parseUrl(config)
self.closeConnection()
@ -280,7 +281,7 @@ class UrlData:
self.data = self.urlConnection.read()
self.downloadtime = time.time() - t
self._init_html_comments()
Config.debug("DEBUG: comment spans %s\n" % self.html_comments)
debug("DEBUG: comment spans %s\n" % self.html_comments)
return self.data
@ -309,8 +310,8 @@ class UrlData:
def parseUrl(self, config):
Config.debug(Config.DebugDelim+"Parsing recursively into\n"+\
str(self)+"\n"+Config.DebugDelim)
debug(Config.DebugDelim+"Parsing recursively into\n"+\
str(self)+"\n"+Config.DebugDelim)
# search for a possible base reference
bases = self.searchInForTag(re.compile(_linkMatcher % ("base",
"href"), re.VERBOSE))
@ -397,7 +398,7 @@ def GetUrlDataFrom(urlName,
return JavascriptUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.search("^https:", name):
return HttpsUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.search("^news:", name):
if re.search("^(s?news|nntp):", name):
return NntpUrlData(urlName, recursionLevel, parentName, baseRef, line)
# assume local file
return FileUrlData(urlName, recursionLevel, parentName, baseRef, line)

View file

@ -1,6 +1,10 @@
<a href="news:comp.os.linux.misc">
<a href="news:de.comp.os.unix.linux.misc">
<a href="news:comp.lang.python">
<a href="news:ist.garantiert.nix">
<a href="snews:de.comp.os.unix.linux.misc">
<a href="news:">
<a href="news:§$%&/´`(§%">
<a href="nntp://news.rz.uni-sb.de/comp.lang.python">
<a href="nntp://news.rz.uni-sb.de/comp.lang.python/1-5">
<a href="nntp://news.rz.uni-sb.de/EFGJG4.7A@deshaw.com">
<a href="nntp://news.rz.uni-sb.de/">
<a href="news:comp.lang.python/1-5">