Fix URL result caching.

This commit is contained in:
Bastian Kleineidam 2014-03-08 19:35:10 +01:00
parent 0113f06406
commit 6b334dc79b
18 changed files with 151 additions and 121 deletions

View file

@@ -7,7 +7,8 @@ Features:
Closes: GH bug #467
Fixes:
- checking: Fix internal error in debug output.
- checking: Fix internal errors in debug output.
Closes: GH bug #472
9.0 "The Wolf of Wall Street" (released 3.3.2014)

View file

@@ -33,28 +33,28 @@ class ResultCache(object):
format: {cache key (string) -> result (UrlData.towire())}
"""
def __init__(self, max_size=10000):
def __init__(self, max_size=100000):
"""Initialize result cache."""
# mapping {URL -> cached result}
self.cache = {}
self.max_size = max_size
self.hits = self.misses = 0
@synchronized(cache_lock)
def get_result(self, key):
"""Return cached result or None if not found."""
if key in self.cache:
self.hits += 1
return self.cache[key]
self.misses += 1
return None
return self.cache.get(key)
@synchronized(cache_lock)
def add_result(self, key, result):
"""Add result object to cache with given key.
The request is ignored when the cache is already full.
The request is ignored when the cache is already full or the key
is None.
"""
if len(self.cache) > self.max_size:
return
self.cache[key] = result
if key is not None:
self.cache[key] = result
def has_result(self, key):
"""Non-thread-safe function for fast containment checks."""
return key in self.cache

View file

@@ -80,9 +80,7 @@ class RobotsTxt (object):
url_data.recursion_level+1, url_data.aggregate,
parent_url=roboturl, line=line,
parent_content_type=url_data.content_type)
if sitemap_url_data.has_result or not sitemap_url_data.extern[1]:
# Only queue URLs which have a result or are not strict extern.
url_data.aggregate.urlqueue.put(sitemap_url_data)
url_data.aggregate.urlqueue.put(sitemap_url_data)
@synchronized(robot_lock)
def get_lock(self, roboturl):

View file

@@ -53,7 +53,6 @@ class UrlQueue (object):
self.unfinished_tasks = 0
self.finished_tasks = 0
self.in_progress = 0
self.seen = set()
self.shutdown = False
# Each put() decreases the number of allowed puts.
# This way we can restrict the number of URLs that are checked.
@@ -107,8 +106,6 @@
"""Put an item into the queue.
Block if necessary until a free slot is available.
"""
if self.put_denied(item):
return
with self.mutex:
self._put(item)
self.not_empty.notify()
@@ -117,35 +114,23 @@
"""Determine if put() will not append the item on the queue.
@return True (reliable) or False (unreliable)
"""
if self.shutdown or self.allowed_puts == 0:
return True
if url_data.cache_key is not None and url_data.cache_key in self.seen:
return True
return False
return self.shutdown or self.allowed_puts == 0
def _put (self, url_data):
"""Put URL in queue, increase number of unfished tasks."""
if self.shutdown:
# don't accept more URLs
if self.put_denied(url_data):
return
if self.allowed_puts is not None:
if self.allowed_puts == 0:
# no more puts allowed
return
self.allowed_puts -= 1
log.debug(LOG_CACHE, "queueing %s", url_data)
key = url_data.cache_key
if key is not None:
if key in self.seen:
# don't check duplicate URLs
return
self.seen.add(key)
self.unfinished_tasks += 1
if url_data.has_result or \
(key and key[1] in url_data.aggregate.result_cache.cache):
log.debug(LOG_CACHE, "queueing %s", url_data.url)
key = url_data.cache_url
cache = url_data.aggregate.result_cache
if url_data.has_result or cache.has_result(key):
self.queue.appendleft(url_data)
else:
assert key is not None, "no result for None key: %s" % url_data
self.queue.append(url_data)
self.unfinished_tasks += 1
def task_done (self, url_data):
"""
@@ -163,10 +148,7 @@
placed in the queue.
"""
with self.all_tasks_done:
log.debug(LOG_CACHE, "task_done %s", url_data)
# check for aliases (eg. through HTTP redirections)
if hasattr(url_data, "aliases") and url_data.aliases:
self.seen.update(url_data.aliases)
log.debug(LOG_CACHE, "task_done %s", url_data.url)
self.finished_tasks += 1
self.unfinished_tasks -= 1
self.in_progress -= 1

View file

@@ -44,8 +44,6 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
Initialize HTTP specific variables.
"""
super(HttpUrl, self).reset()
# URLs seen through redirections
self.aliases = []
# initialize check data
self.headers = {}
self.auth = None

View file

@@ -288,13 +288,12 @@ class MailtoUrl (urlbase.UrlBase):
pass
self.set_result(_("Valid mail address syntax"))
def set_cache_key(self):
def set_cache_url(self):
"""
The cache key is a comma separated list of emails.
The cache url is a comma separated list of emails.
"""
emails = u",".join(sorted(self.addresses))
cache_url = u"%s:%s" % (self.scheme, emails)
self.cache_key = (self.parent_url, cache_url)
self.cache_url = u"%s:%s" % (self.scheme, emails)
def can_get_content (self):
"""

View file

@@ -29,8 +29,8 @@ import select
from cStringIO import StringIO
from . import absolute_url, get_url_from
from .. import (log, LOG_CHECK, LOG_CACHE,
strformat, LinkCheckerError, url as urlutil, trace, get_link_pat, parser)
from .. import (log, LOG_CHECK,
strformat, LinkCheckerError, url as urlutil, trace, get_link_pat)
from ..HtmlParser import htmlsax
from ..htmlutil import linkparse
from ..network import iputil
@@ -182,8 +182,8 @@ class UrlBase (object):
self.url_connection = None
# data of url content, (data == None) means no data is available
self.data = None
# cache key is set by build_url() calling set_cache_key()
self.cache_key = None
# cache url is set by build_url() calling set_cache_url()
self.cache_url = None
# extern flags (is_extern, is_strict)
self.extern = None
# flag if the result should be cached
@@ -194,6 +194,8 @@
self.do_check_content = True
# MIME content type
self.content_type = u""
# URLs seen through redirections
self.aliases = []
def set_result (self, msg, valid=True, overwrite=False):
"""
@@ -282,14 +284,11 @@
if s not in self.info:
self.info.append(s)
def set_cache_key (self):
"""Set key for URL cache checking. A cache key consists of
a tuple (source url, target url)."""
def set_cache_url (self):
"""Set the URL to be used for caching."""
# remove anchor from cached target url since we assume
# URLs with different anchors to have the same content
cache_url = urlutil.urlunsplit(self.urlparts[:4]+[u''])
self.cache_key = (self.parent_url, cache_url)
log.debug(LOG_CACHE, "URL cache key %r", self.cache_key)
self.cache_url = urlutil.urlunsplit(self.urlparts[:4]+[u''])
def check_syntax (self):
"""
@@ -311,7 +310,7 @@
except tuple(ExcSyntaxList) as msg:
self.set_result(unicode_safe(msg), valid=False)
else:
self.set_cache_key()
self.set_cache_url()
def check_url_warnings(self):
"""Check URL name and length."""
@@ -414,15 +413,10 @@
raise KeyboardInterrupt(value)
else:
raise
finally:
# close/release possible open connection
self.close_connection()
def local_check (self):
"""Local check function can be overridden in subclasses."""
log.debug(LOG_CHECK, "Checking %s", self)
# start time for check
check_start = time.time()
log.debug(LOG_CHECK, "Checking %s", unicode(self))
# strict extern URLs should not be checked
assert not self.extern[1], 'checking strict extern URL'
# check connection
@@ -441,18 +435,23 @@
# idna.encode(host) failed
value = _('Bad hostname %(host)r: %(msg)s') % {'host': self.host, 'msg': str(value)}
self.set_result(unicode_safe(value), valid=False)
if self.do_check_content:
def check_content(self):
"""Check content of URL.
@return: True if content can be parsed, else False
"""
if self.do_check_content and self.valid:
# check content and recursion
try:
if self.valid and self.can_get_content():
if self.can_get_content():
self.aggregate.plugin_manager.run_content_plugins(self)
if self.allows_recursion():
parser.parse_url(self)
return True
except tuple(ExcList):
value = self.handle_exception()
self.add_warning(_("could not get content: %(msg)s") %
{"msg": str(value)}, tag=WARN_URL_ERROR_GETTING_CONTENT)
self.checktime = time.time() - check_start
return False
def close_connection (self):
"""
@@ -648,7 +647,7 @@
return self.aggregate.config.get_user_password(self.url)
def add_url (self, url, line=0, column=0, name=u"", base=None):
"""Queue URL data for checking."""
"""Add new URL to queue."""
if base:
base_ref = urlutil.url_norm(base)[0]
else:
@@ -656,9 +655,7 @@
url_data = get_url_from(url, self.recursion_level+1, self.aggregate,
parent_url=self.url, base_ref=base_ref, line=line, column=column,
name=name, parent_content_type=self.content_type)
if url_data.has_result or not url_data.extern[1]:
# Only queue URLs which have a result or are not strict extern.
self.aggregate.urlqueue.put(url_data)
self.aggregate.urlqueue.put(url_data)
def serialized (self, sep=os.linesep):
"""
@@ -674,11 +671,8 @@
assert isinstance(self.name, unicode), repr(self.name)
if self.anchor is not None:
assert isinstance(self.anchor, unicode), repr(self.anchor)
if self.cache_key is not None:
if self.cache_key[0] is not None:
assert isinstance(self.cache_key[0], unicode), repr(self.cache_key[0])
if self.cache_key[1] is not None:
assert isinstance(self.cache_key[1], unicode), repr(self.cache_key[1])
if self.cache_url is not None:
assert isinstance(self.cache_url, unicode), repr(self.cache_url)
return sep.join([
u"%s link" % self.scheme,
u"base_url=%r" % self.base_url,
@@ -690,7 +684,7 @@
u"column=%d" % self.column,
u"name=%r" % self.name,
u"anchor=%r" % self.anchor,
u"cache_key=%s" % repr(self.cache_key),
u"cache_url=%s" % self.cache_url,
])
def get_intern_pattern (self, url=None):
@@ -715,14 +709,23 @@
{"domain": msg}
self.set_result(res, valid=False)
def __str__ (self):
def __unicode__(self):
"""
Get URL info.
@return: URL info
@rtype: unicode
"""
return self.serialized()
def __str__(self):
"""
Get URL info.
@return: URL info, encoded with the output logger encoding
@rtype: string
"""
s = self.serialized()
s = unicode(self)
return self.aggregate.config['logger'].encode(s)
def __repr__ (self):
@@ -766,8 +769,8 @@
Line number of this URL at parent document, or -1
- url_data.column: int
Column number of this URL at parent document, or -1
- url_data.cache_key: unicode
Cache key for this URL.
- url_data.cache_url: unicode
Cache url for this URL.
- url_data.content_type: unicode
MIME content type for URL content.
- url_data.level: int
@@ -792,7 +795,7 @@
info=self.info,
line=self.line,
column=self.column,
cache_key=self.cache_key,
cache_url=self.cache_url,
content_type=self.content_type,
level=self.recursion_level,
modified=self.modified,
@@ -823,7 +826,7 @@ urlDataAttr = [
'modified',
'line',
'column',
'cache_key',
'cache_url',
'content_type',
'level',
]

View file

@@ -81,7 +81,7 @@ class Aggregate (object):
t.start()
else:
self.request_sessions[thread.get_ident()] = new_request_session(self.config)
checker.check_url(self.urlqueue, self.logger)
checker.check_urls(self.urlqueue, self.logger)
@synchronized(_threads_lock)
def add_request_session(self):

View file

@@ -18,23 +18,63 @@
URL checking functions.
"""
import copy
import time
from . import task
from ..cache import urlqueue
from .. import parser
def check_url (urlqueue, logger):
def check_urls (urlqueue, logger):
"""Check URLs without threading."""
while not urlqueue.empty():
url_data = urlqueue.get()
try:
if not url_data.has_result:
url_data.check()
logger.log_url(url_data.to_wire())
check_url(url_data, logger)
finally:
urlqueue.task_done(url_data)
class Checker (task.LoggedCheckedTask):
def check_url(url_data, logger):
"""Check a single URL with logging."""
if url_data.has_result:
logger.log_url(url_data.to_wire())
else:
cache = url_data.aggregate.result_cache
key = url_data.cache_url
result = cache.get_result(key)
if result is None:
# check
check_start = time.time()
try:
url_data.check()
do_parse = url_data.check_content()
url_data.checktime = time.time() - check_start
# Add result to cache
result = url_data.to_wire()
cache.add_result(key, result)
for alias in url_data.aliases:
# redirect aliases
cache.add_result(alias, result)
# parse content recursively
if do_parse:
parser.parse_url(url_data)
finally:
# close/release possible open connection
url_data.close_connection()
else:
# copy data from cache and adjust it
result = copy.copy(result)
result.parent_url = url_data.parent_url
result.base_ref = url_data.base_ref or u""
result.base_url = url_data.base_url or u""
result.line = url_data.line
result.column = url_data.column
result.level = url_data.recursion_level
result.name = url_data.name
logger.log_url(result)
class Checker(task.LoggedCheckedTask):
"""URL check thread."""
def __init__ (self, urlqueue, logger, add_request_session):
@@ -71,17 +111,4 @@ class Checker (task.LoggedCheckedTask):
else:
url = url_data.url.encode("ascii", "replace")
self.setName("CheckThread-%s" % url)
if url_data.has_result:
self.logger.log_url(url_data.to_wire())
else:
cache = url_data.aggregate.result_cache
key = url_data.cache_key[1]
result = cache.get_result(key)
if result is None:
url_data.check()
result = url_data.to_wire()
cache.add_result(key, result)
else:
result = copy.copy(result)
result.parent_url = url_data.parent_url
self.logger.log_url(result)
check_url(url_data, self.logger)

View file

@@ -55,7 +55,8 @@ class BlacklistLogger (_Logger):
"""
Put invalid url in blacklist, delete valid url from blacklist.
"""
key = url_data.cache_key
key = (url_data.parent_url, url_data.cache_url)
key = repr(key)
if key in self.blacklist:
if url_data.valid:
del self.blacklist[key]
@@ -90,7 +91,7 @@
"""
oldmask = os.umask(0077)
for key, value in self.blacklist.items():
self.write(u"%d %s%s" % (value, key, os.linesep))
self.write(u"%d %s%s" % (value, repr(key), os.linesep))
self.close_fileoutput()
# restore umask
os.umask(oldmask)

View file

@@ -67,7 +67,7 @@ class TestLogger (linkcheck.logger._Logger):
url = u"url %s" % url_data.base_url
self.result.append(url)
if self.has_part('cachekey'):
cache_key = url_data.cache_key[1] if url_data.cache_key else None
cache_key = url_data.cache_url if url_data.cache_url else None
self.result.append(u"cache key %s" % cache_key)
if self.has_part('realurl'):
self.result.append(u"real url %s" % url_data.url)

View file

@@ -1,2 +0,0 @@
<a href="bl.html">link</a>
<a href="el.html">External link</a>

View file

@@ -3,6 +3,11 @@ cache key http://localhost:%(port)d/%(datadir)s/http.html
real url http://localhost:%(port)d/%(datadir)s/http.html
valid
url
cache key http://localhost:%(port)d/%(datadir)s/http.html
real url http://localhost:%(port)d/%(datadir)s/http.html
valid
url dns://www.example.org
cache key dns://www.example.org
real url dns://www.example.org
@@ -37,11 +42,6 @@ real url http://localhost:%(port)d/?q=%%C3%%BC
name html entities
valid
url
cache key http://localhost:%(port)d/tests/checker/data/http.html
real url http://localhost:%(port)d/tests/checker/data/http.html
valid
url file.css
cache key http://localhost:%(port)d/tests/checker/data/file.css
real url http://localhost:%(port)d/tests/checker/data/file.css

View file

@@ -9,3 +9,8 @@ real url http://www.example.com/
name l1
valid
url HTTP://WWW.EXAMPLE.COM/
cache key http://www.example.com/
real url http://www.example.com/
name l2
valid

View file

@@ -39,3 +39,8 @@ real url http://www.example.com/
name ok example
valid
url //www.example.com/
cache key http://www.example.com/
real url http://www.example.com/
name no scheme example
valid

View file

@@ -19,6 +19,11 @@ cache key file://%(curdir)s/%(datadir)s/favicon.ico
real url file://%(curdir)s/%(datadir)s/favicon.ico
valid
url favicon.ico
cache key file://%(curdir)s/%(datadir)s/favicon.ico
real url file://%(curdir)s/%(datadir)s/favicon.ico
valid
url test.swf
cache key file://%(curdir)s/%(datadir)s/test.swf
real url file://%(curdir)s/%(datadir)s/test.swf

View file

@@ -134,7 +134,7 @@ class TestFile (LinkCheckTest):
self.direct(url, resultlines)
def test_good_dir_space (self):
url = u"file://%(curdir)s/%(datadir)s/a b/bl.html" % self.get_attrs()
url = u"file://%(curdir)s/%(datadir)s/a b/" % self.get_attrs()
nurl = self.norm(url)
url2 = u"file://%(curdir)s/%(datadir)s/a b/el.html" % self.get_attrs()
nurl2 = self.norm(url2)
@@ -145,15 +145,15 @@
u"cache key %s" % nurl,
u"real url %s" % nurl,
u"valid",
u"url bl.html",
u"cache key %s" % nurl,
u"real url %s" % nurl,
u"name link",
u"valid",
u"url el.html",
u"cache key %s" % nurl2,
u"real url %s" % nurl2,
u"name External link",
u"name el.html",
u"valid",
u"url t.txt",
u"cache key %s" % nurl3,
u"real url %s" % nurl3,
u"name t.txt",
u"valid",
u"url t.txt",
u"cache key %s" % nurl3,

View file

@@ -58,9 +58,11 @@ class TestHttpRedirect (HttpServerTest):
u"info Redirected to `%s'." % rurl,
u"valid",
u"url newurl.html",
u"cache key %s" % rurl,
u"cache key %s" % nurl,
u"real url %s" % rurl,
u"name Recursive Redirect",
# XXX the info is copied from the cached result
u"info Redirected to `%s'." % rurl,
u"valid",
]
self.direct(url, resultlines, recursionlevel=99)
@@ -103,3 +105,9 @@
u"error",
]
self.direct(url, resultlines, recursionlevel=99)
def redirect6(self):
#max_redirect = 10
# url = "http://httpbin.org/redirect/" + max_redirect --> valid
# url = "http://httpbin.org/redirect/" + (max_redirect+1) --> error
pass # XXX