Support checking Sitemap: URLs in robots.txt files.

Bastian Kleineidam 2014-03-01 20:25:19 +01:00
parent 0f0d79c7e0
commit 6f205a2574
5 changed files with 31 additions and 12 deletions
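For orientation, here is a minimal sketch (not part of the commit itself) of what the change means at the parser level: a Sitemap: line in robots.txt now ends up as a (URL, line number) tuple in RobotFileParser.sitemap_urls, which the robots.txt cache then queues for checking. The sketch assumes the parser can be constructed without arguments and that parse() accepts a list of lines, as the updated test at the end of this commit does:

    from linkcheck import robotparser2

    lines = [
        "Sitemap: http://example.com/sitemap.xml",
        "User-agent: *",
        "Disallow: /private/",
    ]
    rp = robotparser2.RobotFileParser()
    rp.parse(lines)
    # With this commit, each entry is a (url, linenumber) tuple:
    print(rp.sitemap_urls)  # [('http://example.com/sitemap.xml', 1)]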

@@ -7,6 +7,7 @@ Features:
 - checking: Add options to limit the number of requests per second,
   allowed URL schemes and maximum file or download size.
   Closes: GH bug #397, #465, #420
+- checking: Support checking Sitemap: URLs in robots.txt files.
 - gui: UI language can be changed dynamically.
   Closes: GH bug #391

@@ -21,6 +21,7 @@ from .. import robotparser2, configuration
 from ..containers import LFUCache
 from ..decorators import synchronized
 from ..lock import get_lock
+from ..checker import get_url_from
 
 # lock objects
@@ -42,27 +43,46 @@ class RobotsTxt (object):
         self.hits = self.misses = 0
         self.roboturl_locks = {}
 
-    def allows_url (self, roboturl, url, proxy, user, password):
+    def allows_url (self, url_data):
         """Ask robots.txt allowance."""
+        roboturl = url_data.get_robots_txt_url()
         with self.get_lock(roboturl):
-            return self._allows_url(roboturl, url, proxy, user, password)
+            return self._allows_url(url_data, roboturl)
 
-    def _allows_url (self, roboturl, url, proxy, user, password):
+    def _allows_url (self, url_data, roboturl):
         """Ask robots.txt allowance. Assumes only single thread per robots.txt
         URL calls this function."""
+        user, password = url_data.get_user_password()
         with cache_lock:
             if roboturl in self.cache:
                 self.hits += 1
                 rp = self.cache[roboturl]
-                return rp.can_fetch(self.useragent, url)
+                return rp.can_fetch(self.useragent, url_data.url)
             self.misses += 1
-        rp = robotparser2.RobotFileParser(proxy=proxy, user=user,
+        rp = robotparser2.RobotFileParser(proxy=url_data.proxy, user=user,
                                           password=password)
         rp.set_url(roboturl)
         rp.read()
         with cache_lock:
             self.cache[roboturl] = rp
-        return rp.can_fetch(self.useragent, url)
+        self.add_sitemap_urls(rp, url_data, roboturl)
+        return rp.can_fetch(self.useragent, url_data.url)
+
+    def add_sitemap_urls(self, rp, url_data, roboturl):
+        """Add sitemap URLs to queue."""
+        if not rp.sitemap_urls:
+            return
+        rec_level = url_data.aggregate.config["recursionlevel"]
+        if rec_level >= 0 and url_data.recursion_level >= rec_level:
+            return
+        for sitemap_url, line in rp.sitemap_urls:
+            sitemap_url_data = get_url_from(sitemap_url,
+                url_data.recursion_level+1, url_data.aggregate,
+                parent_url=roboturl, line=line,
+                parent_content_type=url_data.content_type)
+            if sitemap_url_data.has_result or not sitemap_url_data.extern[1]:
+                # Only queue URLs which have a result or are not strict extern.
+                url_data.aggregate.urlqueue.put(sitemap_url_data)
 
     @synchronized(robot_lock)
     def get_lock(self, roboturl):

@@ -60,10 +60,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         @return: True if access is granted, otherwise False
         @rtype: bool
         """
-        roboturl = self.get_robots_txt_url()
-        user, password = self.get_user_password()
-        rb = self.aggregate.robots_txt
-        return rb.allows_url(roboturl, self.url, self.proxy, user, password)
+        return self.aggregate.robots_txt.allows_url(self)
 
     def add_size_info (self):
         """Get size of URL content from HTTP header."""

@@ -50,6 +50,7 @@ class RobotFileParser (object):
         self.disallow_all = False
         self.allow_all = False
         self.last_checked = 0
+        # list of tuples (sitemap url, line number)
         self.sitemap_urls = []
 
     def mtime (self):
@@ -185,7 +186,7 @@ class RobotFileParser (object):
                     # Note that sitemap URLs must be absolute according to
                     # http://www.sitemaps.org/protocol.html#submit_robots
                     # But this should be checked by the calling layer.
-                    self.sitemap_urls.append(line[1])
+                    self.sitemap_urls.append((line[1], linenumber))
                 else:
                     log.debug(LOG_CHECK, "%r line %d: unknown key %r", self.url, linenumber, line[0])
                     pass
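The comment in the hunk above notes that Sitemap: URLs must be absolute and that this should be verified by the calling layer. Purely as an illustration (this helper is not part of the commit), such a check could look roughly like the following, using only the standard library:

    try:
        from urllib.parse import urlsplit   # Python 3
    except ImportError:
        from urlparse import urlsplit       # Python 2

    def is_absolute_url(url):
        """Return True if the URL has both a scheme and a host part."""
        parts = urlsplit(url)
        return bool(parts.scheme and parts.netloc)

    print(is_absolute_url("http://example.com/sitemap.xml"))  # True
    print(is_absolute_url("/sitemap.xml"))                    # False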

@@ -297,4 +297,4 @@ class TestRobotsTxt (unittest.TestCase):
         ]
         self.rp.parse(lines)
         self.assertTrue(len(self.rp.sitemap_urls) > 0)
-        self.assertTrue(self.rp.sitemap_urls[0] == "bla")
+        self.assertTrue(self.rp.sitemap_urls[0] == ("bla", 1))