From 6f205a25745b22fb3d91b906243a4f2317929464 Mon Sep 17 00:00:00 2001
From: Bastian Kleineidam
Date: Sat, 1 Mar 2014 20:25:19 +0100
Subject: [PATCH] Support checking Sitemap: URLs in robots.txt files.

---
 doc/changelog.txt             |  1 +
 linkcheck/cache/robots_txt.py | 32 ++++++++++++++++++++++++++------
 linkcheck/checker/httpurl.py  |  5 +----
 linkcheck/robotparser2.py     |  3 ++-
 tests/test_robotstxt.py       |  2 +-
 5 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/doc/changelog.txt b/doc/changelog.txt
index 1c36c5d3..2f0cf038 100644
--- a/doc/changelog.txt
+++ b/doc/changelog.txt
@@ -7,6 +7,7 @@ Features:
 - checking: Add options to limit the number of requests per second,
   allowed URL schemes and maximum file or download size.
   Closes: GH bug #397, #465, #420
+- checking: Support checking Sitemap: URLs in robots.txt files.
 - gui: UI language can be changed dynamically.
   Closes: GH bug #391
 
diff --git a/linkcheck/cache/robots_txt.py b/linkcheck/cache/robots_txt.py
index 47162dc3..dc430a0b 100644
--- a/linkcheck/cache/robots_txt.py
+++ b/linkcheck/cache/robots_txt.py
@@ -21,6 +21,7 @@ from .. import robotparser2, configuration
 from ..containers import LFUCache
 from ..decorators import synchronized
 from ..lock import get_lock
+from ..checker import get_url_from
 
 
 # lock objects
@@ -42,27 +43,46 @@ class RobotsTxt (object):
         self.hits = self.misses = 0
         self.roboturl_locks = {}
 
-    def allows_url (self, roboturl, url, proxy, user, password):
+    def allows_url (self, url_data):
         """Ask robots.txt allowance."""
+        roboturl = url_data.get_robots_txt_url()
         with self.get_lock(roboturl):
-            return self._allows_url(roboturl, url, proxy, user, password)
+            return self._allows_url(url_data, roboturl)
 
-    def _allows_url (self, roboturl, url, proxy, user, password):
+    def _allows_url (self, url_data, roboturl):
         """Ask robots.txt allowance.
         Assumes only single thread per robots.txt URL calls this function."""
+        user, password = url_data.get_user_password()
         with cache_lock:
             if roboturl in self.cache:
                 self.hits += 1
                 rp = self.cache[roboturl]
-                return rp.can_fetch(self.useragent, url)
+                return rp.can_fetch(self.useragent, url_data.url)
             self.misses += 1
-        rp = robotparser2.RobotFileParser(proxy=proxy, user=user,
+        rp = robotparser2.RobotFileParser(proxy=url_data.proxy, user=user,
             password=password)
         rp.set_url(roboturl)
         rp.read()
         with cache_lock:
             self.cache[roboturl] = rp
-        return rp.can_fetch(self.useragent, url)
+        self.add_sitemap_urls(rp, url_data, roboturl)
+        return rp.can_fetch(self.useragent, url_data.url)
+
+    def add_sitemap_urls(self, rp, url_data, roboturl):
+        """Add sitemap URLs to queue."""
+        if not rp.sitemap_urls:
+            return
+        rec_level = url_data.aggregate.config["recursionlevel"]
+        if rec_level >= 0 and url_data.recursion_level >= rec_level:
+            return
+        for sitemap_url, line in rp.sitemap_urls:
+            sitemap_url_data = get_url_from(sitemap_url,
+                url_data.recursion_level+1, url_data.aggregate,
+                parent_url=roboturl, line=line,
+                parent_content_type=url_data.content_type)
+            if sitemap_url_data.has_result or not sitemap_url_data.extern[1]:
+                # Only queue URLs which have a result or are not strict extern.
+                url_data.aggregate.urlqueue.put(sitemap_url_data)
 
     @synchronized(robot_lock)
     def get_lock(self, roboturl):
diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py
index 1e43d9c1..9ac311dc 100644
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -60,10 +60,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         @return: True if access is granted, otherwise False
         @rtype: bool
         """
-        roboturl = self.get_robots_txt_url()
-        user, password = self.get_user_password()
-        rb = self.aggregate.robots_txt
-        return rb.allows_url(roboturl, self.url, self.proxy, user, password)
+        return self.aggregate.robots_txt.allows_url(self)
 
     def add_size_info (self):
         """Get size of URL content from HTTP header."""
diff --git a/linkcheck/robotparser2.py b/linkcheck/robotparser2.py
index 1adf208a..9455b835 100644
--- a/linkcheck/robotparser2.py
+++ b/linkcheck/robotparser2.py
@@ -50,6 +50,7 @@ class RobotFileParser (object):
         self.disallow_all = False
         self.allow_all = False
         self.last_checked = 0
+        # list of tuples (sitemap url, line number)
         self.sitemap_urls = []
 
     def mtime (self):
@@ -185,7 +186,7 @@
                     # Note that sitemap URLs must be absolute according to
                     # http://www.sitemaps.org/protocol.html#submit_robots
                     # But this should be checked by the calling layer.
-                    self.sitemap_urls.append(line[1])
+                    self.sitemap_urls.append((line[1], linenumber))
                 else:
                     log.debug(LOG_CHECK, "%r line %d: unknown key %r", self.url, linenumber, line[0])
                     pass
diff --git a/tests/test_robotstxt.py b/tests/test_robotstxt.py
index 529b3e3d..8b528349 100644
--- a/tests/test_robotstxt.py
+++ b/tests/test_robotstxt.py
@@ -297,4 +297,4 @@ class TestRobotsTxt (unittest.TestCase):
         ]
         self.rp.parse(lines)
         self.assertTrue(len(self.rp.sitemap_urls) > 0)
-        self.assertTrue(self.rp.sitemap_urls[0] == "bla")
+        self.assertTrue(self.rp.sitemap_urls[0] == ("bla", 1))
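
For illustration, a minimal sketch of the new sitemap_urls format after
this patch. The robots.txt body below is made up; robotparser2 and
RobotFileParser are the module and class touched above, and the expected
line number assumes parse() counts every input line, as the test does:

    from linkcheck import robotparser2

    # Made-up robots.txt content, for illustration only.
    lines = [
        "User-agent: *",
        "Disallow: /private/",
        "Sitemap: http://example.com/sitemap.xml",
    ]
    rp = robotparser2.RobotFileParser()
    rp.parse(lines)
    # Each entry is now a (url, linenumber) tuple instead of a bare URL
    # string, so a broken sitemap URL can be reported together with the
    # robots.txt line it came from.
    print(rp.sitemap_urls)  # expected: [('http://example.com/sitemap.xml', 3)]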