mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-13 09:03:11 +00:00
Merge pull request #39 from PetrDlouhy/fix/cache
Fix cache: Don't check one url multiple times
This commit is contained in:
commit
5471b63ceb
11 changed files with 183 additions and 47 deletions
4
linkcheck/cache/results.py
vendored
4
linkcheck/cache/results.py
vendored
|
|
@ -59,6 +59,10 @@ class ResultCache(object):
|
|||
"""Non-thread-safe function for fast containment checks."""
|
||||
return key in self.cache
|
||||
|
||||
def has_non_empty_result(self, key):
|
||||
"""Non-thread-safe function for fast containment checks."""
|
||||
return self.cache.get(key)
|
||||
|
||||
def __len__(self):
|
||||
"""Get number of cached elements. This is not thread-safe and is
|
||||
likely to change before the returned value is used."""
|
||||
|
|
|
|||
7
linkcheck/cache/urlqueue.py
vendored
7
linkcheck/cache/urlqueue.py
vendored
|
|
@ -120,7 +120,9 @@ class UrlQueue (object):
|
|||
log.debug(LOG_CACHE, "queueing %s", url_data.url)
|
||||
key = url_data.cache_url
|
||||
cache = url_data.aggregate.result_cache
|
||||
if url_data.has_result or cache.has_result(key):
|
||||
if cache.has_result(key):
|
||||
return
|
||||
if url_data.has_result:
|
||||
self.queue.appendleft(url_data)
|
||||
else:
|
||||
assert key is not None, "no result for None key: %s" % url_data
|
||||
|
|
@ -131,6 +133,7 @@ class UrlQueue (object):
|
|||
self.cleanup()
|
||||
self.queue.append(url_data)
|
||||
self.unfinished_tasks += 1
|
||||
cache.add_result(key, None) # add none value to cache to prevent checking this url multiple times
|
||||
|
||||
def cleanup(self):
|
||||
"""Move cached elements to top."""
|
||||
|
|
@ -139,7 +142,7 @@ class UrlQueue (object):
|
|||
for i, url_data in enumerate(self.queue):
|
||||
key = url_data.cache_url
|
||||
cache = url_data.aggregate.result_cache
|
||||
if cache.has_result(key):
|
||||
if cache.has_non_empty_result(key):
|
||||
cached.append(i)
|
||||
for pos in cached:
|
||||
self._move_to_top(pos)
|
||||
|
|
|
|||
0
tests/cache/__init__.py
vendored
Normal file
0
tests/cache/__init__.py
vendored
Normal file
174
tests/cache/test_urlqueue.py
vendored
Normal file
174
tests/cache/test_urlqueue.py
vendored
Normal file
|
|
@ -0,0 +1,174 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2017 Petr Dlouhý
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
import unittest
|
||||
from collections import namedtuple
|
||||
|
||||
from linkcheck.cache.results import ResultCache
|
||||
from linkcheck.cache.urlqueue import Empty, NUM_PUTS_CLEANUP, UrlQueue
|
||||
|
||||
UrlData = namedtuple('UrlData', 'url cache_url aggregate has_result')
|
||||
Aggregate = namedtuple('Aggregate', 'result_cache')
|
||||
|
||||
|
||||
class TestUrlQueue(unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.result_cache = ResultCache()
|
||||
self.urlqueue = UrlQueue()
|
||||
self.urldata1 = UrlData(
|
||||
url="Foo",
|
||||
cache_url="Foo",
|
||||
aggregate=Aggregate(
|
||||
result_cache=self.result_cache,
|
||||
),
|
||||
has_result=True,
|
||||
)
|
||||
|
||||
def test_max_allowed_urls_bad_value(self):
|
||||
with self.assertRaises(ValueError):
|
||||
UrlQueue(max_allowed_urls=0)
|
||||
with self.assertRaises(ValueError):
|
||||
UrlQueue(max_allowed_urls=-1)
|
||||
|
||||
def test_qsize(self):
|
||||
""" Test qsize() """
|
||||
self.assertEqual(self.urlqueue.qsize(), 0)
|
||||
self.urlqueue.put(self.urldata1)
|
||||
self.assertEqual(self.urlqueue.qsize(), 1)
|
||||
|
||||
def test_empty(self):
|
||||
""" Test empty() """
|
||||
self.assertEqual(self.urlqueue.empty(), True)
|
||||
self.urlqueue.put(self.urldata1)
|
||||
self.assertEqual(self.urlqueue.empty(), False)
|
||||
|
||||
def test_get_empty(self):
|
||||
""" Test, that get() with empty queue throws Empty """
|
||||
with self.assertRaises(Empty):
|
||||
self.assertEqual(self.urlqueue.get(0), None)
|
||||
|
||||
def test_get_negative_timeout(self):
|
||||
""" Test, that get() with negative timeout throws ValueError """
|
||||
with self.assertRaises(ValueError):
|
||||
self.assertEqual(self.urlqueue.get(-1), None)
|
||||
|
||||
def test_put_get(self):
|
||||
"""
|
||||
Test, that after put() we can get()
|
||||
the item and it can be get only once
|
||||
"""
|
||||
self.urlqueue.put(self.urldata1)
|
||||
cached_item = (
|
||||
self.result_cache.get_result(self.urldata1)
|
||||
)
|
||||
self.assertEqual(cached_item, None)
|
||||
self.assertEqual(self.urlqueue.get(), self.urldata1)
|
||||
with self.assertRaises(Empty):
|
||||
self.assertEqual(self.urlqueue.get(0), None)
|
||||
|
||||
def test_put_has_result_false(self):
|
||||
"""
|
||||
Test, that element with has_result=False
|
||||
is put() on the end of queue
|
||||
"""
|
||||
self.urlqueue.put(self.urldata1)
|
||||
urldata = UrlData(
|
||||
url="Bar",
|
||||
cache_url="Bar",
|
||||
aggregate=Aggregate(
|
||||
result_cache=self.result_cache,
|
||||
),
|
||||
has_result=False,
|
||||
)
|
||||
self.urlqueue.put(urldata)
|
||||
self.assertEqual(self.urlqueue.get(), self.urldata1)
|
||||
self.assertEqual(self.urlqueue.get(), urldata)
|
||||
with self.assertRaises(Empty):
|
||||
self.assertEqual(self.urlqueue.get(0), None)
|
||||
|
||||
def test_put_has_result_true(self):
|
||||
"""
|
||||
Test, that element with has_result=True
|
||||
is put() on the beginning of queue
|
||||
"""
|
||||
self.urlqueue.put(self.urldata1)
|
||||
urldata = UrlData(
|
||||
url="Bar",
|
||||
cache_url="Bar",
|
||||
aggregate=Aggregate(
|
||||
result_cache=self.result_cache,
|
||||
),
|
||||
has_result=True,
|
||||
)
|
||||
self.urlqueue.put(urldata)
|
||||
self.assertEqual(self.urlqueue.get(), urldata)
|
||||
self.assertEqual(self.urlqueue.get(), self.urldata1)
|
||||
with self.assertRaises(Empty):
|
||||
self.assertEqual(self.urlqueue.get(0), None)
|
||||
|
||||
def test_put_cache(self):
|
||||
"""
|
||||
Test, that making put() on two elements with same
|
||||
cache_url adds only one element
|
||||
"""
|
||||
self.urlqueue.put(self.urldata1)
|
||||
urldata = UrlData(
|
||||
url="Bar",
|
||||
cache_url="Foo",
|
||||
aggregate=Aggregate(
|
||||
result_cache=self.result_cache,
|
||||
),
|
||||
has_result=True,
|
||||
)
|
||||
self.urlqueue.put(urldata)
|
||||
self.assertEqual(self.urlqueue.qsize(), 1)
|
||||
self.assertEqual(self.urlqueue.get(), self.urldata1)
|
||||
with self.assertRaises(Empty):
|
||||
self.assertEqual(self.urlqueue.get(0), None)
|
||||
|
||||
def test_cleanup(self):
|
||||
"""
|
||||
Test, that after adding NUM_PUTS_CLEANUP elements
|
||||
the queue is cleaned up.
|
||||
Whether the cleanup is was performed is determined,
|
||||
that element in cache is now on top of the queue.
|
||||
"""
|
||||
for i in range(NUM_PUTS_CLEANUP - 1):
|
||||
self.urlqueue.put(
|
||||
UrlData(
|
||||
url="Bar",
|
||||
cache_url="Bar address %s" % i,
|
||||
aggregate=Aggregate(
|
||||
result_cache=self.result_cache,
|
||||
),
|
||||
has_result=False,
|
||||
),
|
||||
)
|
||||
self.assertEqual(self.urlqueue.qsize(), NUM_PUTS_CLEANUP - 1)
|
||||
urldata = UrlData(
|
||||
url="Bar",
|
||||
cache_url="Bar address",
|
||||
aggregate=Aggregate(
|
||||
result_cache=self.result_cache,
|
||||
),
|
||||
has_result=False,
|
||||
)
|
||||
self.result_cache.add_result("Bar address 2", "asdf")
|
||||
self.urlqueue.put(urldata)
|
||||
self.assertEqual(self.urlqueue.qsize(), NUM_PUTS_CLEANUP)
|
||||
self.assertEqual(self.urlqueue.get().cache_url, "Bar address 2")
|
||||
|
|
@ -10,9 +10,3 @@ real url javascript:loadthis()
|
|||
name javascript url
|
||||
info Javascript URL ignored.
|
||||
valid
|
||||
|
||||
url file.html
|
||||
cache key file://%(curdir)s/%(datadir)s/file.html
|
||||
real url file://%(curdir)s/%(datadir)s/file.html
|
||||
name relative url
|
||||
valid
|
||||
|
|
|
|||
|
|
@ -9,11 +9,6 @@ real url clsid:12345-67890
|
|||
info Clsid URL ignored.
|
||||
valid
|
||||
|
||||
url
|
||||
cache key http://localhost:%(port)d/%(datadir)s/http.html
|
||||
real url http://localhost:%(port)d/%(datadir)s/http.html
|
||||
valid
|
||||
|
||||
url dns://www.example.org
|
||||
cache key dns://www.example.org
|
||||
real url dns://www.example.org
|
||||
|
|
|
|||
|
|
@ -8,9 +8,3 @@ cache key http://www.example.com/
|
|||
real url http://www.example.com/
|
||||
name l1
|
||||
valid
|
||||
|
||||
url HTTP://WWW.EXAMPLE.COM/
|
||||
cache key http://www.example.com/
|
||||
real url http://www.example.com/
|
||||
name l2
|
||||
valid
|
||||
|
|
|
|||
|
|
@ -38,9 +38,3 @@ cache key http://www.example.com/
|
|||
real url http://www.example.com/
|
||||
name ok example
|
||||
valid
|
||||
|
||||
url //www.example.com/
|
||||
cache key http://www.example.com/
|
||||
real url http://www.example.com/
|
||||
name no scheme example
|
||||
valid
|
||||
|
|
|
|||
|
|
@ -4,11 +4,6 @@ real url file://%(curdir)s/%(datadir)s/misc.html
|
|||
name %(datadir)s/misc.html
|
||||
valid
|
||||
|
||||
url
|
||||
cache key file://%(curdir)s/%(datadir)s/misc.html
|
||||
real url file://%(curdir)s/%(datadir)s/misc.html
|
||||
valid
|
||||
|
||||
url http://www.example.com/
|
||||
cache key http://www.example.com/
|
||||
real url http://www.example.com/
|
||||
|
|
@ -19,11 +14,6 @@ cache key file://%(curdir)s/%(datadir)s/favicon.ico
|
|||
real url file://%(curdir)s/%(datadir)s/favicon.ico
|
||||
valid
|
||||
|
||||
url favicon.ico
|
||||
cache key file://%(curdir)s/%(datadir)s/favicon.ico
|
||||
real url file://%(curdir)s/%(datadir)s/favicon.ico
|
||||
valid
|
||||
|
||||
url test.swf
|
||||
cache key file://%(curdir)s/%(datadir)s/test.swf
|
||||
real url file://%(curdir)s/%(datadir)s/test.swf
|
||||
|
|
|
|||
|
|
@ -169,10 +169,5 @@ class TestFile (LinkCheckTest):
|
|||
u"real url %s" % nurl3,
|
||||
u"name t.txt",
|
||||
u"valid",
|
||||
u"url t.txt",
|
||||
u"cache key %s" % nurl3,
|
||||
u"real url %s" % nurl3,
|
||||
u"name External link",
|
||||
u"valid",
|
||||
]
|
||||
self.direct(url, resultlines, recursionlevel=2)
|
||||
|
|
|
|||
|
|
@ -57,13 +57,6 @@ class TestHttpRedirect (HttpServerTest):
|
|||
u"real url %s" % rurl,
|
||||
u"info Redirected to `%s'." % rurl,
|
||||
u"valid",
|
||||
u"url newurl.html",
|
||||
u"cache key %s" % nurl,
|
||||
u"real url %s" % rurl,
|
||||
u"name Recursive Redirect",
|
||||
# XXX the info is copied from the cached result
|
||||
u"info Redirected to `%s'." % rurl,
|
||||
u"valid",
|
||||
]
|
||||
self.direct(url, resultlines, recursionlevel=99)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue