only check robots.txt for http

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1285 e7d03fd6-7b0d-0410-9947-9c21f3af8025
calvin 2004-04-03 16:34:58 +00:00
parent 67fabd5d8e
commit 8584d5bc8e
3 changed files with 10 additions and 2 deletions
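The commit message is the whole rationale: robots.txt is an HTTP convention, so consulting it for ftp:, mailto:, or file: URLs is meaningless. A minimal sketch of that rule (not LinkChecker's actual code; it uses Python 3's stdlib urllib.robotparser, and the function name and user agent are illustrative):

from urllib import robotparser
from urllib.parse import urlsplit

def robots_txt_allows(url, user_agent="LinkChecker"):
    # robots.txt only exists for HTTP(S); every other scheme is allowed.
    parts = urlsplit(url)
    if parts.scheme not in ("http", "https"):
        return True
    rp = robotparser.RobotFileParser()
    rp.set_url("%s://%s/robots.txt" % (parts.scheme, parts.netloc))
    rp.read()  # network fetch of the robots.txt file
    return rp.can_fetch(user_agent, url)

print(robots_txt_allows("mailto:calvin@example.com"))  # True, no fetch needed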


@@ -388,6 +388,10 @@ class HttpUrlData (ProxyUrlData):
         return True

+    def isHttp (self):
+        return True
+
     def isParseable (self):
         if not (self.valid and self.headers):
             return False
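This hunk gives HttpUrlData an isHttp() override returning True; the matching base-class default (added in the next hunk) returns False, so callers can ask any UrlData object about its scheme without isinstance checks. A self-contained sketch of the pattern, with the class and method names taken from the diff and everything else illustrative:

class UrlData(object):
    def isHttp(self):
        # Default: most URL types are not HTTP.
        return False

class HttpUrlData(UrlData):
    def isHttp(self):
        # Override: this subclass handles http:// URLs.
        return True

for url_data in (UrlData(), HttpUrlData()):
    print(type(url_data).__name__, url_data.isHttp())
# UrlData False
# HttpUrlData True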


@@ -218,6 +218,10 @@ class UrlData (object):
         return False

+    def isHttp (self):
+        return False
+
     def setWarning (self, s):
         if self.warningString:
             self.warningString += "\n"+s
@@ -455,7 +459,7 @@ class UrlData (object):
     def contentAllowsRobots (self):
-        if not self.isHtml():
+        if not self.isHttp():
             return True
         h = MetaRobotsFinder(self.getContent())
         p = htmlsax.parser(h)
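contentAllowsRobots() now bails out early for non-HTTP URLs before handing the page to MetaRobotsFinder. The real finder is built on LinkChecker's internal htmlsax parser; the sketch below only approximates what it presumably does (scan for a meta robots tag and report whether the page forbids following links) using the Python 3 stdlib:

from html.parser import HTMLParser

class MetaRobotsFinder(HTMLParser):
    # Stand-in for LinkChecker's MetaRobotsFinder, not the real thing.
    follow = True

    def handle_starttag(self, tag, attrs):
        d = dict(attrs)
        if tag == "meta" and (d.get("name") or "").lower() == "robots":
            self.follow = "nofollow" not in (d.get("content") or "").lower()

def content_allows_robots(content):
    finder = MetaRobotsFinder()
    finder.feed(content)
    return finder.follow

print(content_allows_robots('<meta name="robots" content="nofollow">'))  # False
print(content_allows_robots('<p>no robots meta tag here</p>'))           # True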


@@ -88,7 +88,7 @@ def checkUrls (config):
 def printStatus (config, curtime, start_time):
-    tocheck = config.urls.qsize()
+    tocheck = len(config.urls)
     links = config['linknumber']
     active = config.threader.active_threads()
     duration = strduration(curtime - start_time)
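The third file swaps Queue.qsize() for len(), which suggests config.urls changed from a Queue.Queue to a plain list-like container (that change itself is not shown in this diff). The difference in a nutshell, assuming nothing else about the surrounding code:

import queue

q = queue.Queue()
q.put("http://example.com/")
print(q.qsize())  # 1; Queue has no __len__, so len(q) would raise TypeError

urls = ["http://example.com/"]
print(len(urls))  # 1; plain sequences support len() directly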