mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-14 09:33:09 +00:00
Parse sitemap URLs in robots.txt files.
This commit is contained in:
parent
78a99717fe
commit
0e4d6f6e1a
2 changed files with 14 additions and 0 deletions
|
|
@ -50,6 +50,7 @@ class RobotFileParser (object):
|
|||
self.disallow_all = False
|
||||
self.allow_all = False
|
||||
self.last_checked = 0
|
||||
self.sitemap_urls = []
|
||||
|
||||
def mtime (self):
|
||||
"""Returns the time the robots.txt file was last fetched.
|
||||
|
|
@ -180,6 +181,11 @@ class RobotFileParser (object):
|
|||
except ValueError:
|
||||
log.debug(LOG_CHECK, "%r line %d: invalid delay number %r", self.url, linenumber, line[1])
|
||||
pass
|
||||
elif line[0] == "sitemap":
|
||||
# Note that sitemap URLs must be absolute according to
|
||||
# http://www.sitemaps.org/protocol.html#submit_robots
|
||||
# But this should be checked by the calling layer.
|
||||
self.sitemap_urls.append(line[1])
|
||||
else:
|
||||
log.debug(LOG_CHECK, "%r line %d: unknown key %r", self.url, linenumber, line[0])
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -290,3 +290,11 @@ class TestRobotsTxt (unittest.TestCase):
|
|||
self.check_url("spam", "/cgi-bin/foo/bar", False)
|
||||
self.check_url("spam", "/cgi-bin?a=1", False)
|
||||
self.check_url("spam", "/", True)
|
||||
|
||||
def test_sitemap(self):
    """A Sitemap line in robots.txt is collected verbatim into sitemap_urls.

    Note: sitemap URLs must be absolute per the sitemaps.org protocol,
    but the parser stores them as-is — validation is the caller's job —
    so a bare token like "bla" is accepted here.
    """
    lines = [
        "Sitemap: bla",
    ]
    self.rp.parse(lines)
    # assertEqual on the whole list checks both length and content in one
    # step and yields a readable diff on failure, unlike
    # assertTrue(len(...) > 0) followed by assertTrue(x == y).
    self.assertEqual(self.rp.sitemap_urls, ["bla"])
|
||||
|
|
|
|||
Loading…
Reference in a new issue