Support checking Sitemap: URLs in robots.txt files.

Bastian Kleineidam 2014-03-01 20:25:19 +01:00
parent 0f0d79c7e0
commit 6f205a2574
5 changed files with 31 additions and 12 deletions
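For orientation, here is a minimal sketch (not part of the commit itself) of what the change means at the parser level: a Sitemap: line in robots.txt now ends up as a (URL, line number) tuple in RobotFileParser.sitemap_urls, which the robots.txt cache then queues for checking. The sketch assumes the parser can be constructed without arguments and that parse() accepts a list of lines, as the updated test at the end of this commit does:

    from linkcheck import robotparser2

    lines = [
        "Sitemap: http://example.com/sitemap.xml",
        "User-agent: *",
        "Disallow: /private/",
    ]
    rp = robotparser2.RobotFileParser()
    rp.parse(lines)
    # With this commit, each entry is a (url, linenumber) tuple:
    print(rp.sitemap_urls)  # [('http://example.com/sitemap.xml', 1)]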

@@ -7,6 +7,7 @@ Features:
 - checking: Add options to limit the number of requests per second,
   allowed URL schemes and maximum file or download size.
   Closes: GH bug #397, #465, #420
+- checking: Support checking Sitemap: URLs in robots.txt files.
 - gui: UI language can be changed dynamically.
   Closes: GH bug #391

@@ -21,6 +21,7 @@ from .. import robotparser2, configuration
 from ..containers import LFUCache
 from ..decorators import synchronized
 from ..lock import get_lock
+from ..checker import get_url_from
 
 # lock objects
@@ -42,27 +43,46 @@ class RobotsTxt (object):
         self.hits = self.misses = 0
         self.roboturl_locks = {}
 
-    def allows_url (self, roboturl, url, proxy, user, password):
+    def allows_url (self, url_data):
         """Ask robots.txt allowance."""
+        roboturl = url_data.get_robots_txt_url()
         with self.get_lock(roboturl):
-            return self._allows_url(roboturl, url, proxy, user, password)
+            return self._allows_url(url_data, roboturl)
 
-    def _allows_url (self, roboturl, url, proxy, user, password):
+    def _allows_url (self, url_data, roboturl):
         """Ask robots.txt allowance. Assumes only single thread per robots.txt
         URL calls this function."""
+        user, password = url_data.get_user_password()
         with cache_lock:
             if roboturl in self.cache:
                 self.hits += 1
                 rp = self.cache[roboturl]
-                return rp.can_fetch(self.useragent, url)
+                return rp.can_fetch(self.useragent, url_data.url)
             self.misses += 1
-        rp = robotparser2.RobotFileParser(proxy=proxy, user=user,
+        rp = robotparser2.RobotFileParser(proxy=url_data.proxy, user=user,
                                           password=password)
         rp.set_url(roboturl)
         rp.read()
         with cache_lock:
             self.cache[roboturl] = rp
-        return rp.can_fetch(self.useragent, url)
+        self.add_sitemap_urls(rp, url_data, roboturl)
+        return rp.can_fetch(self.useragent, url_data.url)
+
+    def add_sitemap_urls(self, rp, url_data, roboturl):
+        """Add sitemap URLs to queue."""
+        if not rp.sitemap_urls:
+            return
+        rec_level = url_data.aggregate.config["recursionlevel"]
+        if rec_level >= 0 and url_data.recursion_level >= rec_level:
+            return
+        for sitemap_url, line in rp.sitemap_urls:
+            sitemap_url_data = get_url_from(sitemap_url,
+                url_data.recursion_level+1, url_data.aggregate,
+                parent_url=roboturl, line=line,
+                parent_content_type=url_data.content_type)
+            if sitemap_url_data.has_result or not sitemap_url_data.extern[1]:
+                # Only queue URLs which have a result or are not strict extern.
+                url_data.aggregate.urlqueue.put(sitemap_url_data)
 
     @synchronized(robot_lock)
     def get_lock(self, roboturl):

@@ -60,10 +60,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         @return: True if access is granted, otherwise False
         @rtype: bool
         """
-        roboturl = self.get_robots_txt_url()
-        user, password = self.get_user_password()
-        rb = self.aggregate.robots_txt
-        return rb.allows_url(roboturl, self.url, self.proxy, user, password)
+        return self.aggregate.robots_txt.allows_url(self)
 
     def add_size_info (self):
         """Get size of URL content from HTTP header."""

@@ -50,6 +50,7 @@ class RobotFileParser (object):
         self.disallow_all = False
         self.allow_all = False
         self.last_checked = 0
+        # list of tuples (sitemap url, line number)
         self.sitemap_urls = []
 
     def mtime (self):
@@ -185,7 +186,7 @@ class RobotFileParser (object):
                     # Note that sitemap URLs must be absolute according to
                     # http://www.sitemaps.org/protocol.html#submit_robots
                     # But this should be checked by the calling layer.
-                    self.sitemap_urls.append(line[1])
+                    self.sitemap_urls.append((line[1], linenumber))
                 else:
                     log.debug(LOG_CHECK, "%r line %d: unknown key %r", self.url, linenumber, line[0])
                     pass
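The comment in the hunk above notes that Sitemap: URLs must be absolute and that this should be verified by the calling layer. Purely as an illustration (this helper is not part of the commit), such a check could look roughly like the following, using only the standard library:

    try:
        from urllib.parse import urlsplit   # Python 3
    except ImportError:
        from urlparse import urlsplit       # Python 2

    def is_absolute_url(url):
        """Return True if the URL has both a scheme and a host part."""
        parts = urlsplit(url)
        return bool(parts.scheme and parts.netloc)

    print(is_absolute_url("http://example.com/sitemap.xml"))  # True
    print(is_absolute_url("/sitemap.xml"))                    # False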

@@ -297,4 +297,4 @@ class TestRobotsTxt (unittest.TestCase):
         ]
         self.rp.parse(lines)
         self.assertTrue(len(self.rp.sitemap_urls) > 0)
-        self.assertTrue(self.rp.sitemap_urls[0] == "bla")
+        self.assertTrue(self.rp.sitemap_urls[0] == ("bla", 1))