diff --git a/linkcheck/cache/results.py b/linkcheck/cache/results.py new file mode 100644 index 00000000..84156fc1 --- /dev/null +++ b/linkcheck/cache/results.py @@ -0,0 +1,57 @@ +# -*- coding: iso-8859-1 -*- +# Copyright (C) 2014 Bastian Kleineidam +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +""" +Cache check results. +""" +from ..decorators import synchronized +from ..lock import get_lock + + +# lock object +cache_lock = get_lock("results_cache_lock") + + +class ResultCache(object): + """ + Thread-safe cache of UrlData.to_wire() results. + the cache is limited in size since we rather recheck the same URL + multiple times instead of running out of memory. 
+ format: {cache key (string) -> result (UrlData.to_wire())} + """ + + def __init__(self, max_size=10000): + """Initialize result cache.""" + # mapping {URL -> cached result} + self.cache = {} + self.max_size = max_size + self.hits = self.misses = 0 + + @synchronized(cache_lock) + def get_result(self, key): + """Return cached result or None if not found.""" + if key in self.cache: + self.hits += 1 + return self.cache[key] + self.misses += 1 + return None + + @synchronized(cache_lock) + def add_result(self, key, result): + if len(self.cache) > self.max_size: + return + self.cache[key] = result + diff --git a/linkcheck/cache/urlqueue.py b/linkcheck/cache/urlqueue.py index fe3f26a5..8f46ef36 100644 --- a/linkcheck/cache/urlqueue.py +++ b/linkcheck/cache/urlqueue.py @@ -119,7 +119,7 @@ class UrlQueue (object): """ if self.shutdown or self.allowed_puts == 0: return True - if url_data.cache_url_key is not None and url_data.cache_url_key in self.seen: + if url_data.cache_key is not None and url_data.cache_key in self.seen: return True return False @@ -134,14 +134,15 @@ class UrlQueue (object): return self.allowed_puts -= 1 log.debug(LOG_CACHE, "queueing %s", url_data) - key = url_data.cache_url_key + key = url_data.cache_key if key is not None: if key in self.seen: # don't check duplicate URLs return self.seen.add(key) self.unfinished_tasks += 1 - if url_data.has_result: + if url_data.has_result or \ + (key and key[1] in url_data.aggregate.result_cache.cache): self.queue.appendleft(url_data) else: self.queue.append(url_data) diff --git a/linkcheck/checker/mailtourl.py b/linkcheck/checker/mailtourl.py index 5abaa22c..190b1a04 100644 --- a/linkcheck/checker/mailtourl.py +++ b/linkcheck/checker/mailtourl.py @@ -288,14 +288,13 @@ class MailtoUrl (urlbase.UrlBase): pass self.set_result(_("Valid mail address syntax")) - def set_cache_keys (self): + def set_cache_key(self): """ The cache key is a comma separated list of emails. 
""" emails = u",".join(sorted(self.addresses)) - self.cache_url_key = u"%s:%s" % (self.scheme, emails) - assert isinstance(self.cache_url_key, unicode), self.cache_url_key - # cache_content_key remains None, recursion is not allowed + cache_url = u"%s:%s" % (self.scheme, emails) + self.cache_key = (self.parent_url, cache_url) def can_get_content (self): """ diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index b3103f06..afb44da6 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -180,9 +180,8 @@ class UrlBase (object): self.url_connection = None # data of url content, (data == None) means no data is available self.data = None - # cache keys, are set by build_url() calling set_cache_keys() - self.cache_url_key = None - self.cache_content_key = None + # cache key is set by build_url() calling set_cache_key() + self.cache_key = None # extern flags (is_extern, is_strict) self.extern = None # flag if the result should be cached @@ -276,19 +275,14 @@ class UrlBase (object): if s not in self.info: self.info.append(s) - def set_cache_keys (self): - """ - Set keys for URL checking and content recursion. - """ - # remove anchor from content cache key since we assume + def set_cache_key (self): + """Set key for URL cache checking. 
A cache key consists of + a tuple (source url, target url).""" + # remove anchor from cached target url since we assume # URLs with different anchors to have the same content - self.cache_content_key = urlutil.urlunsplit(self.urlparts[:4]+[u'']) - assert isinstance(self.cache_content_key, unicode), self - log.debug(LOG_CACHE, "Content cache key %r", self.cache_content_key) - # construct cache key - self.cache_url_key = self.cache_content_key - assert isinstance(self.cache_url_key, unicode), self - log.debug(LOG_CACHE, "URL cache key %r", self.cache_url_key) + cache_url = urlutil.urlunsplit(self.urlparts[:4]+[u'']) + self.cache_key = (self.parent_url, cache_url) + log.debug(LOG_CACHE, "URL cache key %r", self.cache_key) def check_syntax (self): """ @@ -310,7 +304,7 @@ class UrlBase (object): except tuple(ExcSyntaxList) as msg: self.set_result(unicode_safe(msg), valid=False) else: - self.set_cache_keys() + self.set_cache_key() def check_url_warnings(self): """Check URL name and length.""" @@ -675,8 +669,8 @@ class UrlBase (object): assert isinstance(self.name, unicode), self if self.anchor is not None: assert isinstance(self.anchor, unicode), self - if self.cache_url_key is not None: - assert isinstance(self.cache_url_key, unicode), self + if self.cache_key is not None: + assert isinstance(self.cache_key, tuple), self return sep.join([ u"%s link" % self.scheme, u"base_url=%r" % self.base_url, @@ -688,7 +682,7 @@ class UrlBase (object): u"column=%d" % self.column, u"name=%r" % self.name, u"anchor=%r" % self.anchor, - u"cache_key=%r" % self.cache_url_key, + u"cache_key=%r" % self.cache_key, ]) def get_intern_pattern (self, url=None): @@ -733,7 +727,7 @@ class UrlBase (object): return u"<%s>" % self.serialized(sep=u", ") def to_wire_dict (self): - """Return a simplified transport object for logging. + """Return a simplified transport object for logging and caching. 
The transport object must contain these attributes: - url_data.valid: bool @@ -764,7 +758,7 @@ class UrlBase (object): Line number of this URL at parent document, or -1 - url_data.column: int Column number of this URL at parent document, or -1 - - url_data.cache_url_key: unicode + - url_data.cache_key: unicode Cache key for this URL. - url_data.content_type: unicode MIME content type for URL content. @@ -790,7 +784,7 @@ class UrlBase (object): info=self.info, line=self.line, column=self.column, - cache_url_key=self.cache_url_key, + cache_key=self.cache_key, content_type=self.get_content_type(), level=self.recursion_level, modified=self.modified, @@ -821,7 +815,7 @@ urlDataAttr = [ 'modified', 'line', 'column', - 'cache_url_key', + 'cache_key', 'content_type', 'level', ] diff --git a/linkcheck/director/__init__.py b/linkcheck/director/__init__.py index 2c3db4bc..4f4508e9 100644 --- a/linkcheck/director/__init__.py +++ b/linkcheck/director/__init__.py @@ -22,7 +22,7 @@ import thread import time from .. import log, LOG_CHECK, LinkCheckerInterrupt, dummy, \ fileutil, strformat, plugins -from ..cache import urlqueue, robots_txt +from ..cache import urlqueue, robots_txt, results from . 
import aggregator, console @@ -207,4 +207,6 @@ def get_aggregate (config): _urlqueue = urlqueue.UrlQueue(max_allowed_puts=config["maxnumurls"]) _robots_txt = robots_txt.RobotsTxt() plugin_manager = plugins.PluginManager(config) - return aggregator.Aggregate(config, _urlqueue, _robots_txt, plugin_manager) + result_cache = results.ResultCache() + return aggregator.Aggregate(config, _urlqueue, _robots_txt, plugin_manager, + result_cache) diff --git a/linkcheck/director/aggregator.py b/linkcheck/director/aggregator.py index 4f373d0e..85059e3c 100644 --- a/linkcheck/director/aggregator.py +++ b/linkcheck/director/aggregator.py @@ -44,7 +44,8 @@ def new_request_session(config): class Aggregate (object): """Store thread-safe data collections for checker threads.""" - def __init__ (self, config, urlqueue, robots_txt, plugin_manager): + def __init__ (self, config, urlqueue, robots_txt, plugin_manager, + result_cache): """Store given link checking objects.""" self.config = config self.urlqueue = urlqueue @@ -53,6 +54,7 @@ class Aggregate (object): self.request_sessions = {} self.robots_txt = robots_txt self.plugin_manager = plugin_manager + self.result_cache = result_cache self.times = {} requests_per_second = config["maxrequestspersecond"] self.wait_time_min = 1.0 / requests_per_second diff --git a/linkcheck/director/checker.py b/linkcheck/director/checker.py index b90a4890..ed140bf8 100644 --- a/linkcheck/director/checker.py +++ b/linkcheck/director/checker.py @@ -17,6 +17,7 @@ """ URL checking functions. """ +import copy from . 
import task from ..cache import urlqueue @@ -28,7 +29,7 @@ def check_url (urlqueue, logger): try: if not url_data.has_result: url_data.check() - logger.log_url(url_data) + logger.log_url(url_data.to_wire()) finally: urlqueue.task_done(url_data) @@ -53,7 +54,7 @@ class Checker (task.LoggedCheckedTask): def check_url (self): """Try to get URL data from queue and check it.""" try: - url_data = self.urlqueue.get(timeout=0.1) + url_data = self.urlqueue.get(timeout=0.2) if url_data is not None: try: self.check_url_data(url_data) @@ -70,6 +71,17 @@ class Checker (task.LoggedCheckedTask): else: url = url_data.url.encode("ascii", "replace") self.setName("CheckThread-%s" % url) - if not url_data.has_result: - url_data.check() - self.logger.log_url(url_data) + if url_data.has_result: + self.logger.log_url(url_data.to_wire()) + else: + cache = url_data.aggregate.result_cache + key = url_data.cache_key[1] + result = cache.get_result(key) + if result is None: + url_data.check() + result = url_data.to_wire() + cache.add_result(key, result) + else: + result = copy.copy(result) + result.parent_url = url_data.parent_url + self.logger.log_url(result) diff --git a/linkcheck/director/logger.py b/linkcheck/director/logger.py index 5f0a9609..22fe8a70 100644 --- a/linkcheck/director/logger.py +++ b/linkcheck/director/logger.py @@ -60,9 +60,8 @@ class Logger (object): do_print = self.do_print(url_data) # Only send a transport object to the loggers, not the complete # object instance. - transport = url_data.to_wire() for log in self.loggers: - log.log_filter_url(transport, do_print) + log.log_filter_url(url_data, do_print) @synchronized(_lock) def log_internal_error (self): diff --git a/linkcheck/logger/blacklist.py b/linkcheck/logger/blacklist.py index 3d11e151..1f041fa5 100644 --- a/linkcheck/logger/blacklist.py +++ b/linkcheck/logger/blacklist.py @@ -55,7 +55,7 @@ class BlacklistLogger (_Logger): """ Put invalid url in blacklist, delete valid url from blacklist. 
""" - key = url_data.cache_url_key + key = url_data.cache_key if key in self.blacklist: if url_data.valid: del self.blacklist[key] diff --git a/tests/checker/__init__.py b/tests/checker/__init__.py index 13a7024f..0aeb2f88 100644 --- a/tests/checker/__init__.py +++ b/tests/checker/__init__.py @@ -67,7 +67,8 @@ class TestLogger (linkcheck.logger._Logger): url = u"url %s" % url_data.base_url self.result.append(url) if self.has_part('cachekey'): - self.result.append(u"cache key %s" % url_data.cache_url_key) + cache_key = url_data.cache_key[1] if url_data.cache_key else None + self.result.append(u"cache key %s" % cache_key) if self.has_part('realurl'): self.result.append(u"real url %s" % url_data.url) if self.has_part('name') and url_data.name: diff --git a/tests/checker/data/file.html.result b/tests/checker/data/file.html.result index a43ee61d..167c5fe7 100644 --- a/tests/checker/data/file.html.result +++ b/tests/checker/data/file.html.result @@ -4,6 +4,12 @@ real url file://%(curdir)s/%(datadir)s/file.html name %(datadir)s/file.html valid +url file.html +cache key file://%(curdir)s/%(datadir)s/file.html +real url file://%(curdir)s/%(datadir)s/file.html +name relative url +valid + url javascript:loadthis() cache key javascript:loadthis() real url javascript:loadthis() diff --git a/tests/checker/data/http.html.result b/tests/checker/data/http.html.result index 27debb66..b3c33b71 100644 --- a/tests/checker/data/http.html.result +++ b/tests/checker/data/http.html.result @@ -37,6 +37,11 @@ real url http://localhost:%(port)d/?q=%%C3%%BC name html entities valid +url +cache key http://localhost:%(port)d/tests/checker/data/http.html +real url http://localhost:%(port)d/tests/checker/data/http.html +valid + url file.css cache key http://localhost:%(port)d/tests/checker/data/file.css real url http://localhost:%(port)d/tests/checker/data/file.css diff --git a/tests/checker/ftpserver.py b/tests/checker/ftpserver.py index ea35a99b..5e8680fa 100644 --- a/tests/checker/ftpserver.py 
+++ b/tests/checker/ftpserver.py @@ -20,6 +20,7 @@ Define http test support classes for LinkChecker tests. import os import time import threading +import pytest from ftplib import FTP from . import LinkCheckTest diff --git a/tests/checker/test_file.py b/tests/checker/test_file.py index c8fe837a..f5211c49 100644 --- a/tests/checker/test_file.py +++ b/tests/checker/test_file.py @@ -145,6 +145,11 @@ class TestFile (LinkCheckTest): u"cache key %s" % nurl, u"real url %s" % nurl, u"valid", + u"url bl.html", + u"cache key %s" % nurl, + u"real url %s" % nurl, + u"name link", + u"valid", u"url el.html", u"cache key %s" % nurl2, u"real url %s" % nurl2, diff --git a/tests/checker/test_news.py b/tests/checker/test_news.py index 00a89e39..95659bde 100644 --- a/tests/checker/test_news.py +++ b/tests/checker/test_news.py @@ -30,7 +30,7 @@ NNTP_INFO = u"200 news.uni-stuttgart.de InterNetNews NNRP server " \ NNTP_TIMEOUT_SECS = 30 # disabled for now until some stable news server comes up -class TestNews (LinkCheckTest): +class _TestNews (LinkCheckTest): """Test nntp: and news: link checking.""" def newstest (self, url, resultlines):