From 6f205a25745b22fb3d91b906243a4f2317929464 Mon Sep 17 00:00:00 2001
From: Bastian Kleineidam
Date: Sat, 1 Mar 2014 20:25:19 +0100
Subject: [PATCH] Support checking Sitemap: URLs in robots.txt files.

---
 doc/changelog.txt             |  1 +
 linkcheck/cache/robots_txt.py | 32 ++++++++++++++++++++++++++------
 linkcheck/checker/httpurl.py  |  5 +----
 linkcheck/robotparser2.py     |  3 ++-
 tests/test_robotstxt.py       |  2 +-
 5 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/doc/changelog.txt b/doc/changelog.txt
index 1c36c5d3..2f0cf038 100644
--- a/doc/changelog.txt
+++ b/doc/changelog.txt
@@ -7,6 +7,7 @@ Features:
 - checking: Add options to limit the number of requests per second,
   allowed URL schemes and maximum file or download size.
   Closes: GH bug #397, #465, #420
+- checking: Support checking Sitemap: URLs in robots.txt files.
 - gui: UI language can be changed dynamically.
   Closes: GH bug #391
 
diff --git a/linkcheck/cache/robots_txt.py b/linkcheck/cache/robots_txt.py
index 47162dc3..dc430a0b 100644
--- a/linkcheck/cache/robots_txt.py
+++ b/linkcheck/cache/robots_txt.py
@@ -21,6 +21,7 @@ from .. import robotparser2, configuration
 from ..containers import LFUCache
 from ..decorators import synchronized
 from ..lock import get_lock
+from ..checker import get_url_from
 
 
 # lock objects
@@ -42,27 +43,46 @@ class RobotsTxt (object):
         self.hits = self.misses = 0
         self.roboturl_locks = {}
 
-    def allows_url (self, roboturl, url, proxy, user, password):
+    def allows_url (self, url_data):
         """Ask robots.txt allowance."""
+        roboturl = url_data.get_robots_txt_url()
         with self.get_lock(roboturl):
-            return self._allows_url(roboturl, url, proxy, user, password)
+            return self._allows_url(url_data, roboturl)
 
-    def _allows_url (self, roboturl, url, proxy, user, password):
+    def _allows_url (self, url_data, roboturl):
         """Ask robots.txt allowance.
         Assumes only single thread per robots.txt URL calls this function."""
+        user, password = url_data.get_user_password()
         with cache_lock:
             if roboturl in self.cache:
                 self.hits += 1
                 rp = self.cache[roboturl]
-                return rp.can_fetch(self.useragent, url)
+                return rp.can_fetch(self.useragent, url_data.url)
             self.misses += 1
-        rp = robotparser2.RobotFileParser(proxy=proxy, user=user,
+        rp = robotparser2.RobotFileParser(proxy=url_data.proxy, user=user,
             password=password)
         rp.set_url(roboturl)
         rp.read()
         with cache_lock:
             self.cache[roboturl] = rp
-        return rp.can_fetch(self.useragent, url)
+        self.add_sitemap_urls(rp, url_data, roboturl)
+        return rp.can_fetch(self.useragent, url_data.url)
+
+    def add_sitemap_urls(self, rp, url_data, roboturl):
+        """Add sitemap URLs to queue."""
+        if not rp.sitemap_urls:
+            return
+        rec_level = url_data.aggregate.config["recursionlevel"]
+        if rec_level >= 0 and url_data.recursion_level >= rec_level:
+            return
+        for sitemap_url, line in rp.sitemap_urls:
+            sitemap_url_data = get_url_from(sitemap_url,
+                url_data.recursion_level+1, url_data.aggregate,
+                parent_url=roboturl, line=line,
+                parent_content_type=url_data.content_type)
+            if sitemap_url_data.has_result or not sitemap_url_data.extern[1]:
+                # Only queue URLs which have a result or are not strict extern.
+                url_data.aggregate.urlqueue.put(sitemap_url_data)
 
     @synchronized(robot_lock)
     def get_lock(self, roboturl):
diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py
index 1e43d9c1..9ac311dc 100644
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -60,10 +60,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         @return: True if access is granted, otherwise False
         @rtype: bool
         """
-        roboturl = self.get_robots_txt_url()
-        user, password = self.get_user_password()
-        rb = self.aggregate.robots_txt
-        return rb.allows_url(roboturl, self.url, self.proxy, user, password)
+        return self.aggregate.robots_txt.allows_url(self)
 
     def add_size_info (self):
         """Get size of URL content from HTTP header."""
diff --git a/linkcheck/robotparser2.py b/linkcheck/robotparser2.py
index 1adf208a..9455b835 100644
--- a/linkcheck/robotparser2.py
+++ b/linkcheck/robotparser2.py
@@ -50,6 +50,7 @@ class RobotFileParser (object):
         self.disallow_all = False
         self.allow_all = False
         self.last_checked = 0
+        # list of tuples (sitemap url, line number)
         self.sitemap_urls = []
 
     def mtime (self):
@@ -185,7 +186,7 @@
                     # Note that sitemap URLs must be absolute according to
                     # http://www.sitemaps.org/protocol.html#submit_robots
                     # But this should be checked by the calling layer.
-                    self.sitemap_urls.append(line[1])
+                    self.sitemap_urls.append((line[1], linenumber))
                 else:
                     log.debug(LOG_CHECK, "%r line %d: unknown key %r", self.url, linenumber, line[0])
                     pass
diff --git a/tests/test_robotstxt.py b/tests/test_robotstxt.py
index 529b3e3d..8b528349 100644
--- a/tests/test_robotstxt.py
+++ b/tests/test_robotstxt.py
@@ -297,4 +297,4 @@ class TestRobotsTxt (unittest.TestCase):
         ]
         self.rp.parse(lines)
         self.assertTrue(len(self.rp.sitemap_urls) > 0)
-        self.assertTrue(self.rp.sitemap_urls[0] == "bla")
+        self.assertTrue(self.rp.sitemap_urls[0] == ("bla", 1))
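
For illustration, a minimal sketch of the new sitemap_urls format after
this patch. The robots.txt body below is made up; robotparser2 and
RobotFileParser are the module and class touched above, and the expected
line number assumes parse() counts every input line, as the test does:

    from linkcheck import robotparser2

    # Made-up robots.txt content, for illustration only.
    lines = [
        "User-agent: *",
        "Disallow: /private/",
        "Sitemap: http://example.com/sitemap.xml",
    ]
    rp = robotparser2.RobotFileParser()
    rp.parse(lines)
    # Each entry is now a (url, linenumber) tuple instead of a bare URL
    # string, so a broken sitemap URL can be reported together with the
    # robots.txt line it came from.
    print(rp.sitemap_urls)  # expected: [('http://example.com/sitemap.xml', 3)]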