Add a basic robots.txt check method (HttpUrl.allows_robots)

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2001 e7d03fd6-7b0d-0410-9947-9c21f3af8025
2026-04-19 22:01:00 +00:00 · 2004-11-18 00:55:47 +00:00 · 2004-11-18 00:55:47 +00:00 · 89616a4bba
commit 89616a4bba
parent 0af9a489df
2 changed files with 9 additions and 8 deletions
--- a/linkcheck/checker/cache.py
+++ b/linkcheck/checker/cache.py
@ -182,15 +182,11 @@ class Cache (object):
        finally:
            self.lock.release()

-    def robots_txt_allows_url (self, url_data):
+    def robots_txt_allows_url (self, roboturl, url, user, password):
        """ask robots.txt allowance"""
        self.lock.acquire()
        try:
-            roboturl = url_data.get_robots_txt_url()
-            linkcheck.log.debug(linkcheck.LOG_CACHE,
-                       "robots.txt url %r of %r", roboturl, url_data.url)
            if roboturl not in self.robots_txt:
-                user, password = url_data.get_user_password()
                rp = linkcheck.robotparser2.RobotFileParser(
                                                user=user, password=password)
                rp.set_url(roboturl)
@ -198,8 +194,7 @@ class Cache (object):
                self.robots_txt[roboturl] = rp
            else:
                rp = self.robots_txt[roboturl]
-            return rp.can_fetch(linkcheck.configuration.UserAgent,
-                                url_data.url)
+            return rp.can_fetch(linkcheck.configuration.UserAgent, url)
        finally:
            self.lock.release()

--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@ -66,6 +66,12 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
            self.urlparts[2] = '/'
            self.url = urlparse.urlunsplit(self.urlparts)

+    def allows_robots (self, url):  # truthy if robots.txt permits fetching url
+        roboturl = self.get_robots_txt_url()  # key under which the cache stores the parsed robots.txt
+        user, password = self.get_user_password()  # credentials handed to the robots.txt parser
+        return self.consumer.cache.robots_txt_allows_url(roboturl, url,
+                                                         user, password)
+
    def check_connection (self):
        """
        Check a URL with HTTP protocol.
@ -115,7 +121,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
        self.headers = None
        self.auth = None
        self.cookies = []
-        if not self.consumer.cache.robots_txt_allows_url(self):
+        if not self.allows_robots(self.url):
            self.add_warning(
                       _("Access denied by robots.txt, checked only syntax"))
            return