Merge pull request #39 from PetrDlouhy/fix/cache

Fix cache: Don't check one url multiple times
This commit is contained in:
anarcat 2017-03-14 09:26:07 -04:00 committed by GitHub
commit 5471b63ceb
11 changed files with 183 additions and 47 deletions

View file

@ -59,6 +59,10 @@ class ResultCache(object):
"""Non-thread-safe function for fast containment checks."""
return key in self.cache
def has_non_empty_result(self, key):
"""Non-thread-safe function for fast containment checks."""
return self.cache.get(key)
def __len__(self):
"""Get number of cached elements. This is not thread-safe and is
likely to change before the returned value is used."""

View file

@ -120,7 +120,9 @@ class UrlQueue (object):
log.debug(LOG_CACHE, "queueing %s", url_data.url)
key = url_data.cache_url
cache = url_data.aggregate.result_cache
if url_data.has_result or cache.has_result(key):
if cache.has_result(key):
return
if url_data.has_result:
self.queue.appendleft(url_data)
else:
assert key is not None, "no result for None key: %s" % url_data
@ -131,6 +133,7 @@ class UrlQueue (object):
self.cleanup()
self.queue.append(url_data)
self.unfinished_tasks += 1
cache.add_result(key, None) # add none value to cache to prevent checking this url multiple times
def cleanup(self):
"""Move cached elements to top."""
@ -139,7 +142,7 @@ class UrlQueue (object):
for i, url_data in enumerate(self.queue):
key = url_data.cache_url
cache = url_data.aggregate.result_cache
if cache.has_result(key):
if cache.has_non_empty_result(key):
cached.append(i)
for pos in cached:
self._move_to_top(pos)

0
tests/cache/__init__.py vendored Normal file
View file

174
tests/cache/test_urlqueue.py vendored Normal file
View file

@ -0,0 +1,174 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2017 Petr Dlouhý
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import unittest
from collections import namedtuple
from linkcheck.cache.results import ResultCache
from linkcheck.cache.urlqueue import Empty, NUM_PUTS_CLEANUP, UrlQueue
UrlData = namedtuple('UrlData', 'url cache_url aggregate has_result')
Aggregate = namedtuple('Aggregate', 'result_cache')
class TestUrlQueue(unittest.TestCase):
def setUp(self):
self.result_cache = ResultCache()
self.urlqueue = UrlQueue()
self.urldata1 = UrlData(
url="Foo",
cache_url="Foo",
aggregate=Aggregate(
result_cache=self.result_cache,
),
has_result=True,
)
def test_max_allowed_urls_bad_value(self):
with self.assertRaises(ValueError):
UrlQueue(max_allowed_urls=0)
with self.assertRaises(ValueError):
UrlQueue(max_allowed_urls=-1)
def test_qsize(self):
""" Test qsize() """
self.assertEqual(self.urlqueue.qsize(), 0)
self.urlqueue.put(self.urldata1)
self.assertEqual(self.urlqueue.qsize(), 1)
def test_empty(self):
""" Test empty() """
self.assertEqual(self.urlqueue.empty(), True)
self.urlqueue.put(self.urldata1)
self.assertEqual(self.urlqueue.empty(), False)
def test_get_empty(self):
""" Test, that get() with empty queue throws Empty """
with self.assertRaises(Empty):
self.assertEqual(self.urlqueue.get(0), None)
def test_get_negative_timeout(self):
""" Test, that get() with negative timeout throws ValueError """
with self.assertRaises(ValueError):
self.assertEqual(self.urlqueue.get(-1), None)
def test_put_get(self):
"""
Test, that after put() we can get()
the item and it can be get only once
"""
self.urlqueue.put(self.urldata1)
cached_item = (
self.result_cache.get_result(self.urldata1)
)
self.assertEqual(cached_item, None)
self.assertEqual(self.urlqueue.get(), self.urldata1)
with self.assertRaises(Empty):
self.assertEqual(self.urlqueue.get(0), None)
def test_put_has_result_false(self):
"""
Test, that element with has_result=False
is put() on the end of queue
"""
self.urlqueue.put(self.urldata1)
urldata = UrlData(
url="Bar",
cache_url="Bar",
aggregate=Aggregate(
result_cache=self.result_cache,
),
has_result=False,
)
self.urlqueue.put(urldata)
self.assertEqual(self.urlqueue.get(), self.urldata1)
self.assertEqual(self.urlqueue.get(), urldata)
with self.assertRaises(Empty):
self.assertEqual(self.urlqueue.get(0), None)
def test_put_has_result_true(self):
"""
Test, that element with has_result=True
is put() on the beginning of queue
"""
self.urlqueue.put(self.urldata1)
urldata = UrlData(
url="Bar",
cache_url="Bar",
aggregate=Aggregate(
result_cache=self.result_cache,
),
has_result=True,
)
self.urlqueue.put(urldata)
self.assertEqual(self.urlqueue.get(), urldata)
self.assertEqual(self.urlqueue.get(), self.urldata1)
with self.assertRaises(Empty):
self.assertEqual(self.urlqueue.get(0), None)
def test_put_cache(self):
"""
Test, that making put() on two elements with same
cache_url adds only one element
"""
self.urlqueue.put(self.urldata1)
urldata = UrlData(
url="Bar",
cache_url="Foo",
aggregate=Aggregate(
result_cache=self.result_cache,
),
has_result=True,
)
self.urlqueue.put(urldata)
self.assertEqual(self.urlqueue.qsize(), 1)
self.assertEqual(self.urlqueue.get(), self.urldata1)
with self.assertRaises(Empty):
self.assertEqual(self.urlqueue.get(0), None)
def test_cleanup(self):
"""
Test, that after adding NUM_PUTS_CLEANUP elements
the queue is cleaned up.
Whether the cleanup is was performed is determined,
that element in cache is now on top of the queue.
"""
for i in range(NUM_PUTS_CLEANUP - 1):
self.urlqueue.put(
UrlData(
url="Bar",
cache_url="Bar address %s" % i,
aggregate=Aggregate(
result_cache=self.result_cache,
),
has_result=False,
),
)
self.assertEqual(self.urlqueue.qsize(), NUM_PUTS_CLEANUP - 1)
urldata = UrlData(
url="Bar",
cache_url="Bar address",
aggregate=Aggregate(
result_cache=self.result_cache,
),
has_result=False,
)
self.result_cache.add_result("Bar address 2", "asdf")
self.urlqueue.put(urldata)
self.assertEqual(self.urlqueue.qsize(), NUM_PUTS_CLEANUP)
self.assertEqual(self.urlqueue.get().cache_url, "Bar address 2")

View file

@ -10,9 +10,3 @@ real url javascript:loadthis()
name javascript url
info Javascript URL ignored.
valid
url file.html
cache key file://%(curdir)s/%(datadir)s/file.html
real url file://%(curdir)s/%(datadir)s/file.html
name relative url
valid

View file

@ -9,11 +9,6 @@ real url clsid:12345-67890
info Clsid URL ignored.
valid
url
cache key http://localhost:%(port)d/%(datadir)s/http.html
real url http://localhost:%(port)d/%(datadir)s/http.html
valid
url dns://www.example.org
cache key dns://www.example.org
real url dns://www.example.org

View file

@ -8,9 +8,3 @@ cache key http://www.example.com/
real url http://www.example.com/
name l1
valid
url HTTP://WWW.EXAMPLE.COM/
cache key http://www.example.com/
real url http://www.example.com/
name l2
valid

View file

@ -38,9 +38,3 @@ cache key http://www.example.com/
real url http://www.example.com/
name ok example
valid
url //www.example.com/
cache key http://www.example.com/
real url http://www.example.com/
name no scheme example
valid

View file

@ -4,11 +4,6 @@ real url file://%(curdir)s/%(datadir)s/misc.html
name %(datadir)s/misc.html
valid
url
cache key file://%(curdir)s/%(datadir)s/misc.html
real url file://%(curdir)s/%(datadir)s/misc.html
valid
url http://www.example.com/
cache key http://www.example.com/
real url http://www.example.com/
@ -19,11 +14,6 @@ cache key file://%(curdir)s/%(datadir)s/favicon.ico
real url file://%(curdir)s/%(datadir)s/favicon.ico
valid
url favicon.ico
cache key file://%(curdir)s/%(datadir)s/favicon.ico
real url file://%(curdir)s/%(datadir)s/favicon.ico
valid
url test.swf
cache key file://%(curdir)s/%(datadir)s/test.swf
real url file://%(curdir)s/%(datadir)s/test.swf

View file

@ -169,10 +169,5 @@ class TestFile (LinkCheckTest):
u"real url %s" % nurl3,
u"name t.txt",
u"valid",
u"url t.txt",
u"cache key %s" % nurl3,
u"real url %s" % nurl3,
u"name External link",
u"valid",
]
self.direct(url, resultlines, recursionlevel=2)

View file

@ -57,13 +57,6 @@ class TestHttpRedirect (HttpServerTest):
u"real url %s" % rurl,
u"info Redirected to `%s'." % rurl,
u"valid",
u"url newurl.html",
u"cache key %s" % nurl,
u"real url %s" % rurl,
u"name Recursive Redirect",
# XXX the info is copied from the cached result
u"info Redirected to `%s'." % rurl,
u"valid",
]
self.direct(url, resultlines, recursionlevel=99)