diff --git a/linkcheck/robotparser2.py b/linkcheck/robotparser2.py
index 7bd52192..49e5a006 100644
--- a/linkcheck/robotparser2.py
+++ b/linkcheck/robotparser2.py
@@ -50,6 +50,7 @@ class RobotFileParser (object):
         self.disallow_all = False
         self.allow_all = False
         self.last_checked = 0
+        self.sitemap_urls = []
 
     def mtime (self):
         """Returns the time the robots.txt file was last fetched.
@@ -180,6 +181,11 @@ class RobotFileParser (object):
                     except ValueError:
                         log.debug(LOG_CHECK, "%r line %d: invalid delay number %r", self.url, linenumber, line[1])
                         pass
+                elif line[0] == "sitemap":
+                    # Note that sitemap URLs must be absolute according to
+                    # http://www.sitemaps.org/protocol.html#submit_robots
+                    # But this should be checked by the calling layer.
+                    self.sitemap_urls.append(line[1])
                 else:
                     log.debug(LOG_CHECK, "%r line %d: unknown key %r", self.url, linenumber, line[0])
                     pass
diff --git a/tests/test_robotstxt.py b/tests/test_robotstxt.py
index 4901675f..529b3e3d 100644
--- a/tests/test_robotstxt.py
+++ b/tests/test_robotstxt.py
@@ -290,3 +290,11 @@ class TestRobotsTxt (unittest.TestCase):
         self.check_url("spam", "/cgi-bin/foo/bar", False)
         self.check_url("spam", "/cgi-bin?a=1", False)
         self.check_url("spam", "/", True)
+
+    def test_sitemap(self):
+        lines = [
+            "Sitemap: bla",
+        ]
+        self.rp.parse(lines)
+        self.assertTrue(len(self.rp.sitemap_urls) > 0)
+        self.assertTrue(self.rp.sitemap_urls[0] == "bla")