Parse sitemap URLs in robots.txt files.

This commit is contained in:
Bastian Kleineidam 2014-03-01 19:57:57 +01:00
parent 78a99717fe
commit 0e4d6f6e1a
2 changed files with 14 additions and 0 deletions

View file

@ -50,6 +50,7 @@ class RobotFileParser (object):
self.disallow_all = False
self.allow_all = False
self.last_checked = 0
self.sitemap_urls = []
def mtime (self):
"""Returns the time the robots.txt file was last fetched.
@ -180,6 +181,11 @@ class RobotFileParser (object):
except ValueError:
log.debug(LOG_CHECK, "%r line %d: invalid delay number %r", self.url, linenumber, line[1])
pass
elif line[0] == "sitemap":
# Note that sitemap URLs must be absolute according to
# http://www.sitemaps.org/protocol.html#submit_robots
# But this should be checked by the calling layer.
self.sitemap_urls.append(line[1])
else:
log.debug(LOG_CHECK, "%r line %d: unknown key %r", self.url, linenumber, line[0])
pass

View file

@ -290,3 +290,11 @@ class TestRobotsTxt (unittest.TestCase):
self.check_url("spam", "/cgi-bin/foo/bar", False)
self.check_url("spam", "/cgi-bin?a=1", False)
self.check_url("spam", "/", True)
def test_sitemap(self):
    """A Sitemap line in robots.txt is collected verbatim into sitemap_urls.

    Note: the parser does not validate the URL (per the code comment in
    RobotFileParser.parse, absoluteness is checked by the calling layer),
    so even a non-URL token like "bla" must be stored as-is.
    """
    lines = [
        "Sitemap: bla",
    ]
    self.rp.parse(lines)
    # assertEqual pins both the length and the content in one assertion
    # and yields a readable diff on failure, unlike assertTrue(x == y).
    self.assertEqual(self.rp.sitemap_urls, ["bla"])