From eaa538c814f31ad86a84843cb1e7777c66370c2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Dlouh=C3=BD?= Date: Wed, 9 Nov 2016 16:21:45 +0100 Subject: [PATCH] don't check one url multiple times --- linkcheck/cache/results.py | 4 ++++ linkcheck/cache/urlqueue.py | 7 +++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/linkcheck/cache/results.py b/linkcheck/cache/results.py index bd0d9b4b..eb9bac75 100644 --- a/linkcheck/cache/results.py +++ b/linkcheck/cache/results.py @@ -59,6 +59,10 @@ class ResultCache(object): """Non-thread-safe function for fast containment checks.""" return key in self.cache + def has_non_empty_result(self, key): + """Non-thread-safe lookup: return the value cached under key (not a bool); it is falsy when the key is absent or the stored value is None/empty.""" + return self.cache.get(key) + def __len__(self): """Get number of cached elements. This is not thread-safe and is likely to change before the returned value is used.""" diff --git a/linkcheck/cache/urlqueue.py b/linkcheck/cache/urlqueue.py index 53c28280..9401b623 100644 --- a/linkcheck/cache/urlqueue.py +++ b/linkcheck/cache/urlqueue.py @@ -120,7 +120,9 @@ class UrlQueue (object): log.debug(LOG_CACHE, "queueing %s", url_data.url) key = url_data.cache_url cache = url_data.aggregate.result_cache - if url_data.has_result or cache.has_result(key): + if cache.has_result(key): + return + if url_data.has_result: self.queue.appendleft(url_data) else: assert key is not None, "no result for None key: %s" % url_data @@ -131,6 +133,7 @@ class UrlQueue (object): self.cleanup() self.queue.append(url_data) self.unfinished_tasks += 1 + cache.add_result(key, None) # register a None placeholder for this key so that a later put of the same URL hits the has_result() early return instead of being queued again def cleanup(self): """Move cached elements to top.""" @@ -139,7 +142,7 @@ class UrlQueue (object): for i, url_data in enumerate(self.queue): key = url_data.cache_url cache = url_data.aggregate.result_cache - if cache.has_result(key): + if cache.has_non_empty_result(key): cached.append(i) for pos in cached: self._move_to_top(pos)