Merge branch 'caching'

Bastian Kleineidam 2014-03-04 07:21:42 +01:00
commit 978b24f2d7
15 changed files with 127 additions and 43 deletions

linkcheck/cache/results.py (new file, 57 lines)

@@ -0,0 +1,57 @@
+# -*- coding: iso-8859-1 -*-
+# Copyright (C) 2014 Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+"""
+Cache check results.
+"""
+from ..decorators import synchronized
+from ..lock import get_lock
+
+# lock object
+cache_lock = get_lock("results_cache_lock")
+
+
+class ResultCache(object):
+    """
+    Thread-safe cache of UrlData.to_wire() results.
+    The cache is limited in size, since we would rather recheck the same
+    URL multiple times than run out of memory.
+    Format: {cache key (string) -> result (UrlData.to_wire())}
+    """
+
+    def __init__(self, max_size=10000):
+        """Initialize result cache."""
+        # mapping {URL -> cached result}
+        self.cache = {}
+        self.max_size = max_size
+        self.hits = self.misses = 0
+
+    @synchronized(cache_lock)
+    def get_result(self, key):
+        """Return cached result or None if not found."""
+        if key in self.cache:
+            self.hits += 1
+            return self.cache[key]
+        self.misses += 1
+        return None
+
+    @synchronized(cache_lock)
+    def add_result(self, key, result):
+        """Add a result to the cache; ignored when the cache is full."""
+        if len(self.cache) > self.max_size:
+            return
+        self.cache[key] = result
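The new cache exposes only get_result() and add_result(). A minimal usage sketch (assuming the linkcheck package is importable; the key and result values below are invented placeholders, while real callers store UrlData.to_wire() objects keyed by the anchor-stripped target URL):

from linkcheck.cache.results import ResultCache

cache = ResultCache(max_size=100)
key = u"http://example.com/page.html"    # placeholder cache key
assert cache.get_result(key) is None     # first lookup is a miss
cache.add_result(key, {u"valid": True})  # placeholder result value
assert cache.get_result(key) == {u"valid": True}
assert (cache.hits, cache.misses) == (1, 1)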


@@ -119,7 +119,7 @@ class UrlQueue (object):
         """
         if self.shutdown or self.allowed_puts == 0:
             return True
-        if url_data.cache_url_key is not None and url_data.cache_url_key in self.seen:
+        if url_data.cache_key is not None and url_data.cache_key in self.seen:
             return True
         return False

@@ -134,14 +134,15 @@
             return
         self.allowed_puts -= 1
         log.debug(LOG_CACHE, "queueing %s", url_data)
-        key = url_data.cache_url_key
+        key = url_data.cache_key
         if key is not None:
             if key in self.seen:
                 # don't check duplicate URLs
                 return
             self.seen.add(key)
         self.unfinished_tasks += 1
-        if url_data.has_result:
+        if url_data.has_result or \
+           (key and key[1] in url_data.aggregate.result_cache.cache):
             self.queue.appendleft(url_data)
         else:
             self.queue.append(url_data)
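Note the appendleft(): URLs whose result is already known (has_result, or a hit in the shared result cache) cost no network traffic, so they jump to the front of the deque and checker threads drain them before starting slow fresh checks. A standalone deque illustration (URLs invented):

from collections import deque

queue = deque()
queue.append(u"http://example.com/fresh.html")       # normal work: back of the queue
queue.appendleft(u"http://example.com/cached.html")  # known result: front of the queue
assert queue.popleft() == u"http://example.com/cached.html"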


@@ -288,14 +288,13 @@ class MailtoUrl (urlbase.UrlBase):
                 pass
         self.set_result(_("Valid mail address syntax"))

-    def set_cache_keys (self):
+    def set_cache_key(self):
         """
         The cache key is a comma separated list of emails.
         """
         emails = u",".join(sorted(self.addresses))
-        self.cache_url_key = u"%s:%s" % (self.scheme, emails)
-        assert isinstance(self.cache_url_key, unicode), self.cache_url_key
-        # cache_content_key remains None, recursion is not allowed
+        cache_url = u"%s:%s" % (self.scheme, emails)
+        self.cache_key = (self.parent_url, cache_url)

     def can_get_content (self):
         """


@@ -180,9 +180,8 @@ class UrlBase (object):
         self.url_connection = None
         # data of url content, (data == None) means no data is available
         self.data = None
-        # cache keys, are set by build_url() calling set_cache_keys()
-        self.cache_url_key = None
-        self.cache_content_key = None
+        # cache key is set by build_url() calling set_cache_key()
+        self.cache_key = None
         # extern flags (is_extern, is_strict)
         self.extern = None
         # flag if the result should be cached

@@ -276,19 +275,14 @@
         if s not in self.info:
             self.info.append(s)

-    def set_cache_keys (self):
-        """
-        Set keys for URL checking and content recursion.
-        """
-        # remove anchor from content cache key since we assume
+    def set_cache_key (self):
+        """Set key for URL cache checking. A cache key consists of
+        a tuple (source url, target url)."""
+        # remove anchor from cached target url since we assume
         # URLs with different anchors to have the same content
-        self.cache_content_key = urlutil.urlunsplit(self.urlparts[:4]+[u''])
-        assert isinstance(self.cache_content_key, unicode), self
-        log.debug(LOG_CACHE, "Content cache key %r", self.cache_content_key)
-        # construct cache key
-        self.cache_url_key = self.cache_content_key
-        assert isinstance(self.cache_url_key, unicode), self
-        log.debug(LOG_CACHE, "URL cache key %r", self.cache_url_key)
+        cache_url = urlutil.urlunsplit(self.urlparts[:4]+[u''])
+        self.cache_key = (self.parent_url, cache_url)
+        log.debug(LOG_CACHE, "URL cache key %r", self.cache_key)

     def check_syntax (self):
         """
@@ -310,7 +304,7 @@
         except tuple(ExcSyntaxList) as msg:
             self.set_result(unicode_safe(msg), valid=False)
         else:
-            self.set_cache_keys()
+            self.set_cache_key()

     def check_url_warnings(self):
         """Check URL name and length."""

@@ -675,8 +669,8 @@
         assert isinstance(self.name, unicode), self
         if self.anchor is not None:
             assert isinstance(self.anchor, unicode), self
-        if self.cache_url_key is not None:
-            assert isinstance(self.cache_url_key, unicode), self
+        if self.cache_key is not None:
+            assert isinstance(self.cache_key, unicode), self
         return sep.join([
             u"%s link" % self.scheme,
             u"base_url=%r" % self.base_url,

@@ -688,7 +682,7 @@
             u"column=%d" % self.column,
             u"name=%r" % self.name,
             u"anchor=%r" % self.anchor,
-            u"cache_key=%r" % self.cache_url_key,
+            u"cache_key=%r" % self.cache_key,
         ])

     def get_intern_pattern (self, url=None):

@@ -733,7 +727,7 @@
         return u"<%s>" % self.serialized(sep=u", ")

     def to_wire_dict (self):
-        """Return a simplified transport object for logging.
+        """Return a simplified transport object for logging and caching.

         The transport object must contain these attributes:

         - url_data.valid: bool

@@ -764,7 +758,7 @@
           Line number of this URL at parent document, or -1
         - url_data.column: int
           Column number of this URL at parent document, or -1
-        - url_data.cache_url_key: unicode
+        - url_data.cache_key: unicode
           Cache key for this URL.
         - url_data.content_type: unicode
           MIME content type for URL content.

@@ -790,7 +784,7 @@
             info=self.info,
             line=self.line,
             column=self.column,
-            cache_url_key=self.cache_url_key,
+            cache_key=self.cache_key,
             content_type=self.get_content_type(),
             level=self.recursion_level,
             modified=self.modified,

@@ -821,7 +815,7 @@ urlDataAttr = [
     'modified',
     'line',
     'column',
-    'cache_url_key',
+    'cache_key',
     'content_type',
     'level',
 ]


@@ -22,7 +22,7 @@ import thread
 import time
 from .. import log, LOG_CHECK, LinkCheckerInterrupt, dummy, \
     fileutil, strformat, plugins
-from ..cache import urlqueue, robots_txt
+from ..cache import urlqueue, robots_txt, results
 from . import aggregator, console

@@ -207,4 +207,6 @@ def get_aggregate (config):
     _urlqueue = urlqueue.UrlQueue(max_allowed_puts=config["maxnumurls"])
     _robots_txt = robots_txt.RobotsTxt()
     plugin_manager = plugins.PluginManager(config)
-    return aggregator.Aggregate(config, _urlqueue, _robots_txt, plugin_manager)
+    result_cache = results.ResultCache()
+    return aggregator.Aggregate(config, _urlqueue, _robots_txt, plugin_manager,
+        result_cache)


@@ -44,7 +44,8 @@ def new_request_session(config):
 class Aggregate (object):
     """Store thread-safe data collections for checker threads."""

-    def __init__ (self, config, urlqueue, robots_txt, plugin_manager):
+    def __init__ (self, config, urlqueue, robots_txt, plugin_manager,
+                  result_cache):
         """Store given link checking objects."""
         self.config = config
         self.urlqueue = urlqueue

@@ -53,6 +54,7 @@
         self.request_sessions = {}
         self.robots_txt = robots_txt
         self.plugin_manager = plugin_manager
+        self.result_cache = result_cache
         self.times = {}
         requests_per_second = config["maxrequestspersecond"]
         self.wait_time_min = 1.0 / requests_per_second


@@ -17,6 +17,7 @@
 """
 URL checking functions.
 """
+import copy
 from . import task
 from ..cache import urlqueue

@@ -28,7 +29,7 @@ def check_url (urlqueue, logger):
         try:
             if not url_data.has_result:
                 url_data.check()
-            logger.log_url(url_data)
+            logger.log_url(url_data.to_wire())
         finally:
             urlqueue.task_done(url_data)

@@ -53,7 +54,7 @@ class Checker (task.LoggedCheckedTask):
     def check_url (self):
         """Try to get URL data from queue and check it."""
         try:
-            url_data = self.urlqueue.get(timeout=0.1)
+            url_data = self.urlqueue.get(timeout=0.2)
             if url_data is not None:
                 try:
                     self.check_url_data(url_data)

@@ -70,6 +71,17 @@
         else:
             url = url_data.url.encode("ascii", "replace")
         self.setName("CheckThread-%s" % url)
-        if not url_data.has_result:
-            url_data.check()
-        self.logger.log_url(url_data)
+        if url_data.has_result:
+            self.logger.log_url(url_data.to_wire())
+        else:
+            cache = url_data.aggregate.result_cache
+            key = url_data.cache_key[1]
+            result = cache.get_result(key)
+            if result is None:
+                url_data.check()
+                result = url_data.to_wire()
+                cache.add_result(key, result)
+            else:
+                result = copy.copy(result)
+                result.parent_url = url_data.parent_url
+            self.logger.log_url(result)
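One subtlety in check_url_data(): cached results are shared objects, so a cache hit is shallow-copied before parent_url is rewritten; otherwise reporting a reused result under a new parent would mutate the cache entry itself. A standalone illustration with a hypothetical stand-in for a to_wire() transport object:

import copy

class Result(object):
    """Hypothetical stand-in for a UrlData.to_wire() result."""
    def __init__(self, url, parent_url):
        self.url = url
        self.parent_url = parent_url

cached = Result(u"http://example.com/a.html", u"http://example.com/index.html")
reused = copy.copy(cached)
reused.parent_url = u"http://example.com/other.html"  # report under the new parent
assert cached.parent_url == u"http://example.com/index.html"  # cache entry intact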


@@ -60,9 +60,8 @@ class Logger (object):
         do_print = self.do_print(url_data)
         # Only send a transport object to the loggers, not the complete
         # object instance.
-        transport = url_data.to_wire()
         for log in self.loggers:
-            log.log_filter_url(transport, do_print)
+            log.log_filter_url(url_data, do_print)

     @synchronized(_lock)
     def log_internal_error (self):


@@ -55,7 +55,7 @@ class BlacklistLogger (_Logger):
         """
         Put invalid url in blacklist, delete valid url from blacklist.
         """
-        key = url_data.cache_url_key
+        key = url_data.cache_key
         if key in self.blacklist:
             if url_data.valid:
                 del self.blacklist[key]
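Since cache_key is now a (parent url, target url) tuple instead of a plain string, the blacklist dict changes key shape but needs no other adjustment; tuples of strings are perfectly valid dict keys (values invented):

blacklist = {}
key = (u"http://example.com/index.html", u"http://example.com/broken.html")
blacklist[key] = 1
assert key in blacklist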


@@ -67,7 +67,8 @@ class TestLogger (linkcheck.logger._Logger):
             url = u"url %s" % url_data.base_url
             self.result.append(url)
         if self.has_part('cachekey'):
-            self.result.append(u"cache key %s" % url_data.cache_url_key)
+            cache_key = url_data.cache_key[1] if url_data.cache_key else None
+            self.result.append(u"cache key %s" % cache_key)
         if self.has_part('realurl'):
             self.result.append(u"real url %s" % url_data.url)
         if self.has_part('name') and url_data.name:


@@ -4,6 +4,12 @@ real url file://%(curdir)s/%(datadir)s/file.html
 name %(datadir)s/file.html
 valid
 url file.html
+cache key file://%(curdir)s/%(datadir)s/file.html
+real url file://%(curdir)s/%(datadir)s/file.html
+name relative url
+valid
+url javascript:loadthis()
+cache key javascript:loadthis()
 real url javascript:loadthis()


@@ -37,6 +37,11 @@ real url http://localhost:%(port)d/?q=%%C3%%BC
 name html entities
 valid
 url
+cache key http://localhost:%(port)d/tests/checker/data/http.html
+real url http://localhost:%(port)d/tests/checker/data/http.html
+valid
+url file.css
+cache key http://localhost:%(port)d/tests/checker/data/file.css
 real url http://localhost:%(port)d/tests/checker/data/file.css


@@ -20,6 +20,7 @@ Define http test support classes for LinkChecker tests.
 import os
 import time
 import threading
+import pytest
 from ftplib import FTP
 from . import LinkCheckTest


@@ -145,6 +145,11 @@ class TestFile (LinkCheckTest):
             u"cache key %s" % nurl,
             u"real url %s" % nurl,
             u"valid",
+            u"url bl.html",
+            u"cache key %s" % nurl,
+            u"real url %s" % nurl,
+            u"name link",
+            u"valid",
             u"url el.html",
             u"cache key %s" % nurl2,
             u"real url %s" % nurl2,


@@ -30,7 +30,7 @@ NNTP_INFO = u"200 news.uni-stuttgart.de InterNetNews NNRP server " \
 NNTP_TIMEOUT_SECS = 30

 # disabled for now until some stable news server comes up
-class TestNews (LinkCheckTest):
+class _TestNews (LinkCheckTest):
     """Test nntp: and news: link checking."""

     def newstest (self, url, resultlines):