diff --git a/linkcheck/cache/results.py b/linkcheck/cache/results.py
index bd0d9b4b..eb9bac75 100644
--- a/linkcheck/cache/results.py
+++ b/linkcheck/cache/results.py
@@ -59,6 +59,10 @@ class ResultCache(object):
         """Non-thread-safe function for fast containment checks."""
         return key in self.cache
 
+    def has_non_empty_result(self, key):
+        """Non-thread-safe check for a cached result that is not empty."""
+        return bool(self.cache.get(key))
+
     def __len__(self):
         """Get number of cached elements. This is not thread-safe and is
         likely to change before the returned value is used."""
diff --git a/linkcheck/cache/urlqueue.py b/linkcheck/cache/urlqueue.py
index 53c28280..9401b623 100644
--- a/linkcheck/cache/urlqueue.py
+++ b/linkcheck/cache/urlqueue.py
@@ -120,7 +120,9 @@ class UrlQueue (object):
         log.debug(LOG_CACHE, "queueing %s", url_data.url)
         key = url_data.cache_url
         cache = url_data.aggregate.result_cache
-        if url_data.has_result or cache.has_result(key):
+        if cache.has_result(key):
+            return
+        if url_data.has_result:
             self.queue.appendleft(url_data)
         else:
             assert key is not None, "no result for None key: %s" % url_data
@@ -131,6 +133,8 @@ class UrlQueue (object):
                 self.cleanup()
             self.queue.append(url_data)
         self.unfinished_tasks += 1
+        # Cache a None placeholder so this URL is not checked multiple times.
+        cache.add_result(key, None)
 
     def cleanup(self):
         """Move cached elements to top."""
@@ -139,7 +143,7 @@ class UrlQueue (object):
         for i, url_data in enumerate(self.queue):
             key = url_data.cache_url
             cache = url_data.aggregate.result_cache
-            if cache.has_result(key):
+            if cache.has_non_empty_result(key):
                 cached.append(i)
         for pos in cached:
             self._move_to_top(pos)