Parse sitemap URLs in robots.txt files.

This commit is contained in:
Bastian Kleineidam 2014-03-01 19:57:57 +01:00
parent 78a99717fe
commit 0e4d6f6e1a
2 changed files with 14 additions and 0 deletions

View file

@ -50,6 +50,7 @@ class RobotFileParser (object):
self.disallow_all = False
self.allow_all = False
self.last_checked = 0
self.sitemap_urls = []
def mtime (self):
"""Returns the time the robots.txt file was last fetched.
@ -180,6 +181,11 @@ class RobotFileParser (object):
except ValueError:
log.debug(LOG_CHECK, "%r line %d: invalid delay number %r", self.url, linenumber, line[1])
pass
elif line[0] == "sitemap":
# Note that sitemap URLs must be absolute according to
# http://www.sitemaps.org/protocol.html#submit_robots
# But this should be checked by the calling layer.
self.sitemap_urls.append(line[1])
else:
log.debug(LOG_CHECK, "%r line %d: unknown key %r", self.url, linenumber, line[0])
pass

View file

@ -290,3 +290,11 @@ class TestRobotsTxt (unittest.TestCase):
self.check_url("spam", "/cgi-bin/foo/bar", False)
self.check_url("spam", "/cgi-bin?a=1", False)
self.check_url("spam", "/", True)
def test_sitemap(self):
    """A Sitemap line in robots.txt is collected verbatim into sitemap_urls.

    Note: the parser does not validate the URL (per the code comment in
    RobotFileParser.parse, absoluteness is checked by the calling layer),
    so even a non-URL token like "bla" must be stored as-is.
    """
    lines = [
        "Sitemap: bla",
    ]
    self.rp.parse(lines)
    # assertEqual pins both the length and the content in one assertion
    # and yields a readable diff on failure, unlike assertTrue(x == y).
    self.assertEqual(self.rp.sitemap_urls, ["bla"])