From eaa538c814f31ad86a84843cb1e7777c66370c2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Dlouh=C3=BD?= Date: Wed, 9 Nov 2016 16:21:45 +0100 Subject: [PATCH 1/3] don't check one url multiple times --- linkcheck/cache/results.py | 4 ++++ linkcheck/cache/urlqueue.py | 7 +++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/linkcheck/cache/results.py b/linkcheck/cache/results.py index bd0d9b4b..eb9bac75 100644 --- a/linkcheck/cache/results.py +++ b/linkcheck/cache/results.py @@ -59,6 +59,10 @@ class ResultCache(object): """Non-thread-safe function for fast containment checks.""" return key in self.cache + def has_non_empty_result(self, key): + """Non-thread-safe function for fast containment checks.""" + return self.cache.get(key) + def __len__(self): """Get number of cached elements. This is not thread-safe and is likely to change before the returned value is used.""" diff --git a/linkcheck/cache/urlqueue.py b/linkcheck/cache/urlqueue.py index 53c28280..9401b623 100644 --- a/linkcheck/cache/urlqueue.py +++ b/linkcheck/cache/urlqueue.py @@ -120,7 +120,9 @@ class UrlQueue (object): log.debug(LOG_CACHE, "queueing %s", url_data.url) key = url_data.cache_url cache = url_data.aggregate.result_cache - if url_data.has_result or cache.has_result(key): + if cache.has_result(key): + return + if url_data.has_result: self.queue.appendleft(url_data) else: assert key is not None, "no result for None key: %s" % url_data @@ -131,6 +133,7 @@ class UrlQueue (object): self.cleanup() self.queue.append(url_data) self.unfinished_tasks += 1 + cache.add_result(key, None) # add none value to cache to prevent checking this url multiple times def cleanup(self): """Move cached elements to top.""" @@ -139,7 +142,7 @@ class UrlQueue (object): for i, url_data in enumerate(self.queue): key = url_data.cache_url cache = url_data.aggregate.result_cache - if cache.has_result(key): + if cache.has_non_empty_result(key): cached.append(i) for pos in cached: self._move_to_top(pos) From 3b8fe41206812bb330ec908b42e7c6e097eb329a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Dlouh=C3=BD?= Date: Thu, 26 Jan 2017 05:37:37 +0100 Subject: [PATCH 2/3] add tests for urlqueue --- tests/cache/__init__.py | 0 tests/cache/test_urlqueue.py | 174 +++++++++++++++++++++++++++++++++++ 2 files changed, 174 insertions(+) create mode 100644 tests/cache/__init__.py create mode 100644 tests/cache/test_urlqueue.py diff --git a/tests/cache/__init__.py b/tests/cache/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/cache/test_urlqueue.py b/tests/cache/test_urlqueue.py new file mode 100644 index 00000000..de10679e --- /dev/null +++ b/tests/cache/test_urlqueue.py @@ -0,0 +1,174 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2017 Petr DlouhĂ˝ +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +import unittest +from collections import namedtuple + +from linkcheck.cache.results import ResultCache +from linkcheck.cache.urlqueue import Empty, NUM_PUTS_CLEANUP, UrlQueue + +UrlData = namedtuple('UrlData', 'url cache_url aggregate has_result') +Aggregate = namedtuple('Aggregate', 'result_cache') + + +class TestUrlQueue(unittest.TestCase): + + def setUp(self): + self.result_cache = ResultCache() + self.urlqueue = UrlQueue() + self.urldata1 = UrlData( + url="Foo", + cache_url="Foo", + aggregate=Aggregate( + result_cache=self.result_cache, + ), + has_result=True, + ) + + def test_max_allowed_urls_bad_value(self): + with self.assertRaises(ValueError): + UrlQueue(max_allowed_urls=0) + with self.assertRaises(ValueError): + UrlQueue(max_allowed_urls=-1) + + def test_qsize(self): + """ Test qsize() """ + self.assertEqual(self.urlqueue.qsize(), 0) + self.urlqueue.put(self.urldata1) + self.assertEqual(self.urlqueue.qsize(), 1) + + def test_empty(self): + """ Test empty() """ + self.assertEqual(self.urlqueue.empty(), True) + self.urlqueue.put(self.urldata1) + self.assertEqual(self.urlqueue.empty(), False) + + def test_get_empty(self): + """ Test, that get() with empty queue throws Empty """ + with self.assertRaises(Empty): + self.assertEqual(self.urlqueue.get(0), None) + + def test_get_negative_timeout(self): + """ Test, that get() with negative timeout throws ValueError """ + with self.assertRaises(ValueError): + self.assertEqual(self.urlqueue.get(-1), None) + + def test_put_get(self): + """ + Test, that after put() we can get() + the item and it can be get only once + """ + self.urlqueue.put(self.urldata1) + cached_item = ( + self.result_cache.get_result(self.urldata1) + ) + self.assertEqual(cached_item, None) + self.assertEqual(self.urlqueue.get(), self.urldata1) + with self.assertRaises(Empty): + self.assertEqual(self.urlqueue.get(0), None) + + def test_put_has_result_false(self): + """ + Test, that element with has_result=False + is put() on the end of queue + """ + self.urlqueue.put(self.urldata1) + urldata = UrlData( + url="Bar", + cache_url="Bar", + aggregate=Aggregate( + result_cache=self.result_cache, + ), + has_result=False, + ) + self.urlqueue.put(urldata) + self.assertEqual(self.urlqueue.get(), self.urldata1) + self.assertEqual(self.urlqueue.get(), urldata) + with self.assertRaises(Empty): + self.assertEqual(self.urlqueue.get(0), None) + + def test_put_has_result_true(self): + """ + Test, that element with has_result=True + is put() on the beginning of queue + """ + self.urlqueue.put(self.urldata1) + urldata = UrlData( + url="Bar", + cache_url="Bar", + aggregate=Aggregate( + result_cache=self.result_cache, + ), + has_result=True, + ) + self.urlqueue.put(urldata) + self.assertEqual(self.urlqueue.get(), urldata) + self.assertEqual(self.urlqueue.get(), self.urldata1) + with self.assertRaises(Empty): + self.assertEqual(self.urlqueue.get(0), None) + + def test_put_cache(self): + """ + Test, that making put() on two elements with same + cache_url adds only one element + """ + self.urlqueue.put(self.urldata1) + urldata = UrlData( + url="Bar", + cache_url="Foo", + aggregate=Aggregate( + result_cache=self.result_cache, + ), + has_result=True, + ) + self.urlqueue.put(urldata) + self.assertEqual(self.urlqueue.qsize(), 1) + self.assertEqual(self.urlqueue.get(), self.urldata1) + with self.assertRaises(Empty): + self.assertEqual(self.urlqueue.get(0), None) + + def test_cleanup(self): + """ + Test, that after adding NUM_PUTS_CLEANUP elements + the queue is cleaned up. + Whether the cleanup is was performed is determined, + that element in cache is now on top of the queue. + """ + for i in range(NUM_PUTS_CLEANUP - 1): + self.urlqueue.put( + UrlData( + url="Bar", + cache_url="Bar address %s" % i, + aggregate=Aggregate( + result_cache=self.result_cache, + ), + has_result=False, + ), + ) + self.assertEqual(self.urlqueue.qsize(), NUM_PUTS_CLEANUP - 1) + urldata = UrlData( + url="Bar", + cache_url="Bar address", + aggregate=Aggregate( + result_cache=self.result_cache, + ), + has_result=False, + ) + self.result_cache.add_result("Bar address 2", "asdf") + self.urlqueue.put(urldata) + self.assertEqual(self.urlqueue.qsize(), NUM_PUTS_CLEANUP) + self.assertEqual(self.urlqueue.get().cache_url, "Bar address 2") From f5100138ff2ff5785c0a91cdf9531ae0446c928e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Dlouh=C3=BD?= Date: Tue, 14 Feb 2017 10:47:15 +0100 Subject: [PATCH 3/3] fix tests that fail because of changed linkchecker output --- tests/checker/data/file.html.result | 6 ------ tests/checker/data/http.html.result | 5 ----- tests/checker/data/http_lowercase.html.result | 6 ------ tests/checker/data/http_slash.html.result | 6 ------ tests/checker/data/misc.html.result | 10 ---------- tests/checker/test_file.py | 5 ----- tests/checker/test_http_redirect.py | 7 ------- 7 files changed, 45 deletions(-) diff --git a/tests/checker/data/file.html.result b/tests/checker/data/file.html.result index 702e8994..a43ee61d 100644 --- a/tests/checker/data/file.html.result +++ b/tests/checker/data/file.html.result @@ -10,9 +10,3 @@ real url javascript:loadthis() name javascript url info Javascript URL ignored. valid - -url file.html -cache key file://%(curdir)s/%(datadir)s/file.html -real url file://%(curdir)s/%(datadir)s/file.html -name relative url -valid diff --git a/tests/checker/data/http.html.result b/tests/checker/data/http.html.result index 1a816a2e..344c4a57 100644 --- a/tests/checker/data/http.html.result +++ b/tests/checker/data/http.html.result @@ -9,11 +9,6 @@ real url clsid:12345-67890 info Clsid URL ignored. valid -url -cache key http://localhost:%(port)d/%(datadir)s/http.html -real url http://localhost:%(port)d/%(datadir)s/http.html -valid - url dns://www.example.org cache key dns://www.example.org real url dns://www.example.org diff --git a/tests/checker/data/http_lowercase.html.result b/tests/checker/data/http_lowercase.html.result index f79a8d3f..f3e62b9e 100644 --- a/tests/checker/data/http_lowercase.html.result +++ b/tests/checker/data/http_lowercase.html.result @@ -8,9 +8,3 @@ cache key http://www.example.com/ real url http://www.example.com/ name l1 valid - -url HTTP://WWW.EXAMPLE.COM/ -cache key http://www.example.com/ -real url http://www.example.com/ -name l2 -valid diff --git a/tests/checker/data/http_slash.html.result b/tests/checker/data/http_slash.html.result index 9b2ec171..837bf676 100644 --- a/tests/checker/data/http_slash.html.result +++ b/tests/checker/data/http_slash.html.result @@ -38,9 +38,3 @@ cache key http://www.example.com/ real url http://www.example.com/ name ok example valid - -url //www.example.com/ -cache key http://www.example.com/ -real url http://www.example.com/ -name no scheme example -valid diff --git a/tests/checker/data/misc.html.result b/tests/checker/data/misc.html.result index 9d77f8a6..e1868d8f 100644 --- a/tests/checker/data/misc.html.result +++ b/tests/checker/data/misc.html.result @@ -4,11 +4,6 @@ real url file://%(curdir)s/%(datadir)s/misc.html name %(datadir)s/misc.html valid -url -cache key file://%(curdir)s/%(datadir)s/misc.html -real url file://%(curdir)s/%(datadir)s/misc.html -valid - url http://www.example.com/ cache key http://www.example.com/ real url http://www.example.com/ @@ -19,11 +14,6 @@ cache key file://%(curdir)s/%(datadir)s/favicon.ico real url file://%(curdir)s/%(datadir)s/favicon.ico valid -url favicon.ico -cache key file://%(curdir)s/%(datadir)s/favicon.ico -real url file://%(curdir)s/%(datadir)s/favicon.ico -valid - url test.swf cache key file://%(curdir)s/%(datadir)s/test.swf real url file://%(curdir)s/%(datadir)s/test.swf diff --git a/tests/checker/test_file.py b/tests/checker/test_file.py index ef42017f..3d77898c 100644 --- a/tests/checker/test_file.py +++ b/tests/checker/test_file.py @@ -169,10 +169,5 @@ class TestFile (LinkCheckTest): u"real url %s" % nurl3, u"name t.txt", u"valid", - u"url t.txt", - u"cache key %s" % nurl3, - u"real url %s" % nurl3, - u"name External link", - u"valid", ] self.direct(url, resultlines, recursionlevel=2) diff --git a/tests/checker/test_http_redirect.py b/tests/checker/test_http_redirect.py index 36df0739..f212d98e 100644 --- a/tests/checker/test_http_redirect.py +++ b/tests/checker/test_http_redirect.py @@ -57,13 +57,6 @@ class TestHttpRedirect (HttpServerTest): u"real url %s" % rurl, u"info Redirected to `%s'." % rurl, u"valid", - u"url newurl.html", - u"cache key %s" % nurl, - u"real url %s" % rurl, - u"name Recursive Redirect", - # XXX the info is copied from the cached result - u"info Redirected to `%s'." % rurl, - u"valid", ] self.direct(url, resultlines, recursionlevel=99)