don't check one url multiple times

This commit is contained in:
Petr Dlouhý 2016-11-09 16:21:45 +01:00
parent bf66006375
commit eaa538c814
2 changed files with 9 additions and 2 deletions

View file

@ -59,6 +59,10 @@ class ResultCache(object):
"""Non-thread-safe function for fast containment checks."""
return key in self.cache
def has_non_empty_result(self, key):
    """Non-thread-safe check whether *key* is cached with a truthy value.

    Returns the cached value itself (truthy when a real result exists),
    or a falsy value / None when the key is absent or was cached with a
    placeholder — unlike has_result(), which only tests membership.
    """
    return self.cache.get(key)
def __len__(self):
"""Get number of cached elements. This is not thread-safe and is
likely to change before the returned value is used."""

View file

@ -120,7 +120,9 @@ class UrlQueue (object):
log.debug(LOG_CACHE, "queueing %s", url_data.url)
key = url_data.cache_url
cache = url_data.aggregate.result_cache
if url_data.has_result or cache.has_result(key):
if cache.has_result(key):
return
if url_data.has_result:
self.queue.appendleft(url_data)
else:
assert key is not None, "no result for None key: %s" % url_data
@ -131,6 +133,7 @@ class UrlQueue (object):
self.cleanup()
self.queue.append(url_data)
self.unfinished_tasks += 1
cache.add_result(key, None) # add a None value to the cache to prevent checking this URL multiple times
def cleanup(self):
"""Move cached elements to top."""
@ -139,7 +142,7 @@ class UrlQueue (object):
for i, url_data in enumerate(self.queue):
key = url_data.cache_url
cache = url_data.aggregate.result_cache
if cache.has_result(key):
if cache.has_non_empty_result(key):
cached.append(i)
for pos in cached:
self._move_to_top(pos)