mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-15 20:01:03 +00:00
Merge branch 'caching'
This commit is contained in:
commit
978b24f2d7
15 changed files with 127 additions and 43 deletions
57
linkcheck/cache/results.py
vendored
Normal file
57
linkcheck/cache/results.py
vendored
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
Cache check results.
|
||||
"""
|
||||
from ..decorators import synchronized
|
||||
from ..lock import get_lock
|
||||
|
||||
|
||||
# lock object
|
||||
cache_lock = get_lock("results_cache_lock")
|
||||
|
||||
|
||||
class ResultCache(object):
    """
    Thread-safe cache of UrlData.to_wire() results.

    The cache is limited in size since we rather recheck the same URL
    multiple times instead of running out of memory.
    Format: {cache key (string) -> result (UrlData.to_wire())}
    """

    def __init__(self, max_size=10000):
        """Initialize result cache.

        @param max_size: maximum number of cached entries
        """
        # mapping {cache key -> cached result}
        self.cache = {}
        self.max_size = max_size
        # simple lookup statistics for diagnostics
        self.hits = self.misses = 0

    @synchronized(cache_lock)
    def get_result(self, key):
        """Return cached result or None if not found.

        NOTE(review): a stored result of None is indistinguishable
        from a cache miss; callers are assumed never to cache None.
        """
        if key in self.cache:
            self.hits += 1
            return self.cache[key]
        self.misses += 1
        return None

    @synchronized(cache_lock)
    def add_result(self, key, result):
        """Cache given result under key, unless the cache is full.

        Entries are never evicted; once max_size is reached, new
        results are silently dropped.
        """
        # Use >= so the cache never exceeds max_size entries;
        # the original '>' comparison allowed max_size + 1 entries.
        if len(self.cache) >= self.max_size:
            return
        self.cache[key] = result
|
||||
|
||||
7
linkcheck/cache/urlqueue.py
vendored
7
linkcheck/cache/urlqueue.py
vendored
|
|
@ -119,7 +119,7 @@ class UrlQueue (object):
|
|||
"""
|
||||
if self.shutdown or self.allowed_puts == 0:
|
||||
return True
|
||||
if url_data.cache_url_key is not None and url_data.cache_url_key in self.seen:
|
||||
if url_data.cache_key is not None and url_data.cache_key in self.seen:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
|
@ -134,14 +134,15 @@ class UrlQueue (object):
|
|||
return
|
||||
self.allowed_puts -= 1
|
||||
log.debug(LOG_CACHE, "queueing %s", url_data)
|
||||
key = url_data.cache_url_key
|
||||
key = url_data.cache_key
|
||||
if key is not None:
|
||||
if key in self.seen:
|
||||
# don't check duplicate URLs
|
||||
return
|
||||
self.seen.add(key)
|
||||
self.unfinished_tasks += 1
|
||||
if url_data.has_result:
|
||||
if url_data.has_result or \
|
||||
(key and key[1] in url_data.aggregate.result_cache.cache):
|
||||
self.queue.appendleft(url_data)
|
||||
else:
|
||||
self.queue.append(url_data)
|
||||
|
|
|
|||
|
|
@ -288,14 +288,13 @@ class MailtoUrl (urlbase.UrlBase):
|
|||
pass
|
||||
self.set_result(_("Valid mail address syntax"))
|
||||
|
||||
def set_cache_keys (self):
|
||||
def set_cache_key(self):
|
||||
"""
|
||||
The cache key is a comma separated list of emails.
|
||||
"""
|
||||
emails = u",".join(sorted(self.addresses))
|
||||
self.cache_url_key = u"%s:%s" % (self.scheme, emails)
|
||||
assert isinstance(self.cache_url_key, unicode), self.cache_url_key
|
||||
# cache_content_key remains None, recursion is not allowed
|
||||
cache_url = u"%s:%s" % (self.scheme, emails)
|
||||
self.cache_key = (self.parent_url, cache_url)
|
||||
|
||||
def can_get_content (self):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -180,9 +180,8 @@ class UrlBase (object):
|
|||
self.url_connection = None
|
||||
# data of url content, (data == None) means no data is available
|
||||
self.data = None
|
||||
# cache keys, are set by build_url() calling set_cache_keys()
|
||||
self.cache_url_key = None
|
||||
self.cache_content_key = None
|
||||
# cache key is set by build_url() calling set_cache_key()
|
||||
self.cache_key = None
|
||||
# extern flags (is_extern, is_strict)
|
||||
self.extern = None
|
||||
# flag if the result should be cached
|
||||
|
|
@ -276,19 +275,14 @@ class UrlBase (object):
|
|||
if s not in self.info:
|
||||
self.info.append(s)
|
||||
|
||||
def set_cache_keys (self):
|
||||
"""
|
||||
Set keys for URL checking and content recursion.
|
||||
"""
|
||||
# remove anchor from content cache key since we assume
|
||||
def set_cache_key (self):
|
||||
"""Set key for URL cache checking. A cache key consists of
|
||||
a tuple (source url, target url)."""
|
||||
# remove anchor from cached target url since we assume
|
||||
# URLs with different anchors to have the same content
|
||||
self.cache_content_key = urlutil.urlunsplit(self.urlparts[:4]+[u''])
|
||||
assert isinstance(self.cache_content_key, unicode), self
|
||||
log.debug(LOG_CACHE, "Content cache key %r", self.cache_content_key)
|
||||
# construct cache key
|
||||
self.cache_url_key = self.cache_content_key
|
||||
assert isinstance(self.cache_url_key, unicode), self
|
||||
log.debug(LOG_CACHE, "URL cache key %r", self.cache_url_key)
|
||||
cache_url = urlutil.urlunsplit(self.urlparts[:4]+[u''])
|
||||
self.cache_key = (self.parent_url, cache_url)
|
||||
log.debug(LOG_CACHE, "URL cache key %r", self.cache_key)
|
||||
|
||||
def check_syntax (self):
|
||||
"""
|
||||
|
|
@ -310,7 +304,7 @@ class UrlBase (object):
|
|||
except tuple(ExcSyntaxList) as msg:
|
||||
self.set_result(unicode_safe(msg), valid=False)
|
||||
else:
|
||||
self.set_cache_keys()
|
||||
self.set_cache_key()
|
||||
|
||||
def check_url_warnings(self):
|
||||
"""Check URL name and length."""
|
||||
|
|
@ -675,8 +669,8 @@ class UrlBase (object):
|
|||
assert isinstance(self.name, unicode), self
|
||||
if self.anchor is not None:
|
||||
assert isinstance(self.anchor, unicode), self
|
||||
if self.cache_url_key is not None:
|
||||
assert isinstance(self.cache_url_key, unicode), self
|
||||
if self.cache_key is not None:
|
||||
assert isinstance(self.cache_key, unicode), self
|
||||
return sep.join([
|
||||
u"%s link" % self.scheme,
|
||||
u"base_url=%r" % self.base_url,
|
||||
|
|
@ -688,7 +682,7 @@ class UrlBase (object):
|
|||
u"column=%d" % self.column,
|
||||
u"name=%r" % self.name,
|
||||
u"anchor=%r" % self.anchor,
|
||||
u"cache_key=%r" % self.cache_url_key,
|
||||
u"cache_key=%r" % self.cache_key,
|
||||
])
|
||||
|
||||
def get_intern_pattern (self, url=None):
|
||||
|
|
@ -733,7 +727,7 @@ class UrlBase (object):
|
|||
return u"<%s>" % self.serialized(sep=u", ")
|
||||
|
||||
def to_wire_dict (self):
|
||||
"""Return a simplified transport object for logging.
|
||||
"""Return a simplified transport object for logging and caching.
|
||||
|
||||
The transport object must contain these attributes:
|
||||
- url_data.valid: bool
|
||||
|
|
@ -764,7 +758,7 @@ class UrlBase (object):
|
|||
Line number of this URL at parent document, or -1
|
||||
- url_data.column: int
|
||||
Column number of this URL at parent document, or -1
|
||||
- url_data.cache_url_key: unicode
|
||||
- url_data.cache_key: unicode
|
||||
Cache key for this URL.
|
||||
- url_data.content_type: unicode
|
||||
MIME content type for URL content.
|
||||
|
|
@ -790,7 +784,7 @@ class UrlBase (object):
|
|||
info=self.info,
|
||||
line=self.line,
|
||||
column=self.column,
|
||||
cache_url_key=self.cache_url_key,
|
||||
cache_key=self.cache_key,
|
||||
content_type=self.get_content_type(),
|
||||
level=self.recursion_level,
|
||||
modified=self.modified,
|
||||
|
|
@ -821,7 +815,7 @@ urlDataAttr = [
|
|||
'modified',
|
||||
'line',
|
||||
'column',
|
||||
'cache_url_key',
|
||||
'cache_key',
|
||||
'content_type',
|
||||
'level',
|
||||
]
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ import thread
|
|||
import time
|
||||
from .. import log, LOG_CHECK, LinkCheckerInterrupt, dummy, \
|
||||
fileutil, strformat, plugins
|
||||
from ..cache import urlqueue, robots_txt
|
||||
from ..cache import urlqueue, robots_txt, results
|
||||
from . import aggregator, console
|
||||
|
||||
|
||||
|
|
@ -207,4 +207,6 @@ def get_aggregate (config):
|
|||
_urlqueue = urlqueue.UrlQueue(max_allowed_puts=config["maxnumurls"])
|
||||
_robots_txt = robots_txt.RobotsTxt()
|
||||
plugin_manager = plugins.PluginManager(config)
|
||||
return aggregator.Aggregate(config, _urlqueue, _robots_txt, plugin_manager)
|
||||
result_cache = results.ResultCache()
|
||||
return aggregator.Aggregate(config, _urlqueue, _robots_txt, plugin_manager,
|
||||
result_cache)
|
||||
|
|
|
|||
|
|
@ -44,7 +44,8 @@ def new_request_session(config):
|
|||
class Aggregate (object):
|
||||
"""Store thread-safe data collections for checker threads."""
|
||||
|
||||
def __init__ (self, config, urlqueue, robots_txt, plugin_manager):
|
||||
def __init__ (self, config, urlqueue, robots_txt, plugin_manager,
|
||||
result_cache):
|
||||
"""Store given link checking objects."""
|
||||
self.config = config
|
||||
self.urlqueue = urlqueue
|
||||
|
|
@ -53,6 +54,7 @@ class Aggregate (object):
|
|||
self.request_sessions = {}
|
||||
self.robots_txt = robots_txt
|
||||
self.plugin_manager = plugin_manager
|
||||
self.result_cache = result_cache
|
||||
self.times = {}
|
||||
requests_per_second = config["maxrequestspersecond"]
|
||||
self.wait_time_min = 1.0 / requests_per_second
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@
|
|||
"""
|
||||
URL checking functions.
|
||||
"""
|
||||
import copy
|
||||
from . import task
|
||||
from ..cache import urlqueue
|
||||
|
||||
|
|
@ -28,7 +29,7 @@ def check_url (urlqueue, logger):
|
|||
try:
|
||||
if not url_data.has_result:
|
||||
url_data.check()
|
||||
logger.log_url(url_data)
|
||||
logger.log_url(url_data.to_wire())
|
||||
finally:
|
||||
urlqueue.task_done(url_data)
|
||||
|
||||
|
|
@ -53,7 +54,7 @@ class Checker (task.LoggedCheckedTask):
|
|||
def check_url (self):
|
||||
"""Try to get URL data from queue and check it."""
|
||||
try:
|
||||
url_data = self.urlqueue.get(timeout=0.1)
|
||||
url_data = self.urlqueue.get(timeout=0.2)
|
||||
if url_data is not None:
|
||||
try:
|
||||
self.check_url_data(url_data)
|
||||
|
|
@ -70,6 +71,17 @@ class Checker (task.LoggedCheckedTask):
|
|||
else:
|
||||
url = url_data.url.encode("ascii", "replace")
|
||||
self.setName("CheckThread-%s" % url)
|
||||
if not url_data.has_result:
|
||||
url_data.check()
|
||||
self.logger.log_url(url_data)
|
||||
if url_data.has_result:
|
||||
self.logger.log_url(url_data.to_wire())
|
||||
else:
|
||||
cache = url_data.aggregate.result_cache
|
||||
key = url_data.cache_key[1]
|
||||
result = cache.get_result(key)
|
||||
if result is None:
|
||||
url_data.check()
|
||||
result = url_data.to_wire()
|
||||
cache.add_result(key, result)
|
||||
else:
|
||||
result = copy.copy(result)
|
||||
result.parent_url = url_data.parent_url
|
||||
self.logger.log_url(result)
|
||||
|
|
|
|||
|
|
@ -60,9 +60,8 @@ class Logger (object):
|
|||
do_print = self.do_print(url_data)
|
||||
# Only send a transport object to the loggers, not the complete
|
||||
# object instance.
|
||||
transport = url_data.to_wire()
|
||||
for log in self.loggers:
|
||||
log.log_filter_url(transport, do_print)
|
||||
log.log_filter_url(url_data, do_print)
|
||||
|
||||
@synchronized(_lock)
|
||||
def log_internal_error (self):
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ class BlacklistLogger (_Logger):
|
|||
"""
|
||||
Put invalid url in blacklist, delete valid url from blacklist.
|
||||
"""
|
||||
key = url_data.cache_url_key
|
||||
key = url_data.cache_key
|
||||
if key in self.blacklist:
|
||||
if url_data.valid:
|
||||
del self.blacklist[key]
|
||||
|
|
|
|||
|
|
@ -67,7 +67,8 @@ class TestLogger (linkcheck.logger._Logger):
|
|||
url = u"url %s" % url_data.base_url
|
||||
self.result.append(url)
|
||||
if self.has_part('cachekey'):
|
||||
self.result.append(u"cache key %s" % url_data.cache_url_key)
|
||||
cache_key = url_data.cache_key[1] if url_data.cache_key else None
|
||||
self.result.append(u"cache key %s" % cache_key)
|
||||
if self.has_part('realurl'):
|
||||
self.result.append(u"real url %s" % url_data.url)
|
||||
if self.has_part('name') and url_data.name:
|
||||
|
|
|
|||
|
|
@ -4,6 +4,12 @@ real url file://%(curdir)s/%(datadir)s/file.html
|
|||
name %(datadir)s/file.html
|
||||
valid
|
||||
|
||||
url file.html
|
||||
cache key file://%(curdir)s/%(datadir)s/file.html
|
||||
real url file://%(curdir)s/%(datadir)s/file.html
|
||||
name relative url
|
||||
valid
|
||||
|
||||
url javascript:loadthis()
|
||||
cache key javascript:loadthis()
|
||||
real url javascript:loadthis()
|
||||
|
|
|
|||
|
|
@ -37,6 +37,11 @@ real url http://localhost:%(port)d/?q=%%C3%%BC
|
|||
name html entities
|
||||
valid
|
||||
|
||||
url
|
||||
cache key http://localhost:%(port)d/tests/checker/data/http.html
|
||||
real url http://localhost:%(port)d/tests/checker/data/http.html
|
||||
valid
|
||||
|
||||
url file.css
|
||||
cache key http://localhost:%(port)d/tests/checker/data/file.css
|
||||
real url http://localhost:%(port)d/tests/checker/data/file.css
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ Define http test support classes for LinkChecker tests.
|
|||
import os
|
||||
import time
|
||||
import threading
|
||||
import pytest
|
||||
from ftplib import FTP
|
||||
from . import LinkCheckTest
|
||||
|
||||
|
|
|
|||
|
|
@ -145,6 +145,11 @@ class TestFile (LinkCheckTest):
|
|||
u"cache key %s" % nurl,
|
||||
u"real url %s" % nurl,
|
||||
u"valid",
|
||||
u"url bl.html",
|
||||
u"cache key %s" % nurl,
|
||||
u"real url %s" % nurl,
|
||||
u"name link",
|
||||
u"valid",
|
||||
u"url el.html",
|
||||
u"cache key %s" % nurl2,
|
||||
u"real url %s" % nurl2,
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ NNTP_INFO = u"200 news.uni-stuttgart.de InterNetNews NNRP server " \
|
|||
NNTP_TIMEOUT_SECS = 30
|
||||
|
||||
# disabled for now until some stable news server comes up
|
||||
class TestNews (LinkCheckTest):
|
||||
class _TestNews (LinkCheckTest):
|
||||
"""Test nntp: and news: link checking."""
|
||||
|
||||
def newstest (self, url, resultlines):
|
||||
|
|
|
|||
Loading…
Reference in a new issue