Refactor recursion checks.

This commit is contained in:
Bastian Kleineidam 2014-04-10 17:50:55 +02:00
parent 08fbd891ef
commit 22caa9367a
2 changed files with 13 additions and 12 deletions

View file

@@ -69,12 +69,7 @@ class RobotsTxt (object):
def add_sitemap_urls(self, rp, url_data, roboturl):
    """Add sitemap URLs from a parsed robots.txt to the check queue.

    @param rp: parsed robots.txt object; C{rp.sitemap_urls} is a list of
        (sitemap_url, line_number) pairs
    @param url_data: URL item whose recursion/extern status gates queueing
    @param roboturl: URL of the robots.txt file (kept for interface
        compatibility; not used here)
    """
    # Skip when there is nothing to queue, or when the recursion limit
    # or extern status forbids following further URLs.  The combined
    # check replaces the previous inline recursion-level/extern tests.
    if not rp.sitemap_urls or not url_data.allows_simple_recursion():
        return
    for sitemap_url, line in rp.sitemap_urls:
        url_data.add_url(sitemap_url, line=line)

View file

@@ -519,6 +519,17 @@ class UrlBase (object):
maxbytes=strformat.strsize(maxbytes)),
tag=WARN_URL_CONTENT_SIZE_TOO_LARGE)
def allows_simple_recursion(self):
    """Return True if recursion level and extern status allow recursing."""
    max_level = self.aggregate.config["recursionlevel"]
    # A negative configured level means "no limit"; otherwise stop once
    # this URL's depth has reached it.
    if 0 <= max_level <= self.recursion_level:
        log.debug(LOG_CHECK, "... no, maximum recursion level reached.")
        return False
    # External URLs are never recursed into.
    if self.extern[0]:
        log.debug(LOG_CHECK, "... no, extern.")
        return False
    return True
def allows_recursion (self):
"""
Return True iff we can recurse into the url's content.
@@ -530,12 +541,7 @@ class UrlBase (object):
if not self.can_get_content():
log.debug(LOG_CHECK, "... no, cannot get content.")
return False
rec_level = self.aggregate.config["recursionlevel"]
if rec_level >= 0 and self.recursion_level >= rec_level:
log.debug(LOG_CHECK, "... no, maximum recursion level reached.")
return False
if self.extern[0]:
log.debug(LOG_CHECK, "... no, extern.")
if not self.allows_simple_recursion():
return False
if self.size > self.aggregate.config["maxfilesizeparse"]:
log.debug(LOG_CHECK, "... no, maximum parse size.")