From 22caa9367aea2e9eed9855b76a2c8ce0c3b362cb Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Thu, 10 Apr 2014 17:50:55 +0200 Subject: [PATCH] Refactor recursion checks. --- linkcheck/cache/robots_txt.py | 7 +------ linkcheck/checker/urlbase.py | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/linkcheck/cache/robots_txt.py b/linkcheck/cache/robots_txt.py index fab3262e..d8e58590 100644 --- a/linkcheck/cache/robots_txt.py +++ b/linkcheck/cache/robots_txt.py @@ -69,12 +69,7 @@ class RobotsTxt (object): def add_sitemap_urls(self, rp, url_data, roboturl): """Add sitemap URLs to queue.""" - if not rp.sitemap_urls: - return - rec_level = url_data.aggregate.config["recursionlevel"] - if rec_level >= 0 and url_data.recursion_level >= rec_level: - return - if url_data.extern[0]: + if not rp.sitemap_urls or not url_data.allows_simple_recursion(): return for sitemap_url, line in rp.sitemap_urls: url_data.add_url(sitemap_url, line=line) diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index 23f4b7fa..7fef2eb0 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -519,6 +519,17 @@ class UrlBase (object): maxbytes=strformat.strsize(maxbytes)), tag=WARN_URL_CONTENT_SIZE_TOO_LARGE) + def allows_simple_recursion(self): + """Check recursion level and extern status.""" + rec_level = self.aggregate.config["recursionlevel"] + if rec_level >= 0 and self.recursion_level >= rec_level: + log.debug(LOG_CHECK, "... no, maximum recursion level reached.") + return False + if self.extern[0]: + log.debug(LOG_CHECK, "... no, extern.") + return False + return True + def allows_recursion (self): """ Return True iff we can recurse into the url's content. @@ -530,12 +541,7 @@ class UrlBase (object): if not self.can_get_content(): log.debug(LOG_CHECK, "... no, cannot get content.") return False - rec_level = self.aggregate.config["recursionlevel"] - if rec_level >= 0 and self.recursion_level >= rec_level: - log.debug(LOG_CHECK, "... no, maximum recursion level reached.") - return False - if self.extern[0]: - log.debug(LOG_CHECK, "... no, extern.") + if not self.allows_simple_recursion(): return False if self.size > self.aggregate.config["maxfilesizeparse"]: log.debug(LOG_CHECK, "... no, maximum parse size.")