From 57e3b05c886e540296c0b300ab26f86aeb1c42e0 Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Wed, 10 Mar 2010 00:00:12 +0100 Subject: [PATCH] limit cache sizes --- doc/changelog.txt | 8 +++- linkcheck/cache/addrinfo.py | 16 +++---- linkcheck/cache/geoip.py | 21 +++++++--- linkcheck/cache/robots_txt.py | 3 +- linkcheck/cache/urlqueue.py | 15 +++---- linkcheck/containers.py | 79 +++++++++++++++++++++++++++++++++-- tests/test_containers.py | 42 +++++++++++++++++++ 7 files changed, 155 insertions(+), 29 deletions(-) diff --git a/doc/changelog.txt b/doc/changelog.txt index 57a07e9a..47885055 100644 --- a/doc/changelog.txt +++ b/doc/changelog.txt @@ -1,10 +1,14 @@ 5.3 "" (released xx.xx.2010) Fixes: -- ftp: fix support for FTP ports other than the default. +- ftp: Fix support for FTP ports other than the default. + +Changes: +- checking: Caches are now size-restricted to limit the memory + usage. Features: -- ftp: detect and support UTF-8 filename encoding capability of FTP +- ftp: Detect and support UTF-8 filename encoding capability of FTP servers. diff --git a/linkcheck/cache/addrinfo.py b/linkcheck/cache/addrinfo.py index 7624c9fc..abbcdf0f 100644 --- a/linkcheck/cache/addrinfo.py +++ b/linkcheck/cache/addrinfo.py @@ -20,21 +20,23 @@ Cache for DNS lookups. 
import socket import sys from ..lock import get_lock +from ..containers import LFUCache from ..decorators import synchronized _lock = get_lock("addrinfo") -addrinfos = {} +addrinfos = LFUCache(size=10000) @synchronized(_lock) def getaddrinfo (host, port): - key = str(host) + u":" + str(port) - if key not in addrinfos: + key = u"%s:%s" % (unicode(host), unicode(port)) + if key in addrinfos: + value = addrinfos[key] + else: try: - addrinfos[key] = \ - socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM) + value = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM) except socket.error: - addrinfos[key] = sys.exc_info()[1] - value = addrinfos[key] + value = sys.exc_info()[1] + addrinfos[key] = value if isinstance(value, Exception): raise value return value diff --git a/linkcheck/cache/geoip.py b/linkcheck/cache/geoip.py index f812ca91..dbda00ac 100644 --- a/linkcheck/cache/geoip.py +++ b/linkcheck/cache/geoip.py @@ -19,6 +19,7 @@ Store and retrieve country names for IPs. """ import os from ..lock import get_lock +from ..containers import LFUCache from ..decorators import synchronized # I don't know if the geoip library is already thread-safe, but @@ -37,19 +38,27 @@ except ImportError: pass +country_cache = LFUCache(size=1000) + @synchronized(_lock) def get_country (host): - """ - Get translated country name. + """Get translated country name. 
@return: country string or None """ if geoip is None: + # no geoip available return None - c = geoip.country_code_by_name(host) - if c and c in countries: - return "%s, %s" % (c, countries[c]) - return None + if host in country_cache: + value = country_cache[host] + else: + c = geoip.country_code_by_name(host) + if c and c in countries: + value = "%s, %s" % (c, countries[c]) + else: + value = None + country_cache[host] = value + return value # GeoIP country map with {short name -> translated full name} entries diff --git a/linkcheck/cache/robots_txt.py b/linkcheck/cache/robots_txt.py index 7cbb09bc..cbf15906 100644 --- a/linkcheck/cache/robots_txt.py +++ b/linkcheck/cache/robots_txt.py @@ -18,6 +18,7 @@ Cache robots.txt contents. """ from .. import robotparser2, configuration, url as urlutil +from ..containers import LFUCache from ..decorators import synchronized from ..lock import get_lock @@ -33,7 +34,7 @@ class RobotsTxt (object): """ def __init__ (self): - self.cache = {} + self.cache = LFUCache(size=100) @synchronized(_lock) def allows_url (self, roboturl, url, proxy, user, password, callback=None): diff --git a/linkcheck/cache/urlqueue.py b/linkcheck/cache/urlqueue.py index 26ce097c..5452998f 100644 --- a/linkcheck/cache/urlqueue.py +++ b/linkcheck/cache/urlqueue.py @@ -22,6 +22,7 @@ import threading import collections from time import time as _time from .. 
import log, LOG_CACHE +from ..containers import LFUCache class Timeout (StandardError): @@ -54,23 +55,19 @@ class UrlQueue (object): self.unfinished_tasks = 0 self.finished_tasks = 0 self.in_progress = {} - self.checked = {} + self.checked = LFUCache(size=10000) self.shutdown = False self.unsorted = 0 def qsize (self): """Return the approximate size of the queue (not reliable!).""" - self.mutex.acquire() - n = len(self.queue) - self.mutex.release() - return n + with self.mutex: + return len(self.queue) def empty (self): """Return True if the queue is empty, False otherwise (not reliable!).""" - self.mutex.acquire() - n = self._empty() - self.mutex.release() - return n + with self.mutex: + return self._empty() def _empty (self): return not self.queue diff --git a/linkcheck/containers.py b/linkcheck/containers.py index 93842db6..bcd06ca1 100644 --- a/linkcheck/containers.py +++ b/linkcheck/containers.py @@ -1,5 +1,5 @@ # -*- coding: iso-8859-1 -*- -# Copyright (C) 2004-2009 Bastian Kleineidam +# Copyright (C) 2004-2010 Bastian Kleineidam # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -110,9 +110,6 @@ class ListDict (dict): class CaselessDict (dict): """A dictionary ignoring the case of keys (which must be strings).""" - def __init__ (self): - dict.__init__(self) - def __getitem__ (self, key): assert isinstance(key, basestring) return dict.__getitem__(self, key.lower()) @@ -166,6 +163,80 @@ class CaselessSortedDict (CaselessDict): return ((x, self[x]) for x in self.keys())
+class LFUCache (dict):
+    """Size-limited cache which purges the least frequently used items."""
+
+    def __init__ (self, size=1000):
+        super(LFUCache, self).__init__()
+        if size < 1:
+            raise ValueError("invalid cache size %d" % size)
+        self.size = size
+
+    def __setitem__ (self, key, val):
+        """Store given key/value as a [number of uses, value] pair."""
+        if key in self:
+            # keep the use count; uses() reads it without counting a use
+            num_used = self.uses(key)
+            super(LFUCache, self).__setitem__(key, [num_used, val])
+        else:
+            super(LFUCache, self).__setitem__(key, [0, val])
+        # check for size limit
+        if len(self) > self.size:
+            self.shrink()
+
+    def shrink (self):
+        """Purge ca. 5% of the entries (at least one), least used first."""
+        # max() keeps caches with fewer than 20 entries bounded as well
+        trim = max(1, int(0.05*len(self)))
+        items = super(LFUCache, self).items()
+        values = sorted(items, key=lambda item: item[1][0])
+        for key, _stored in values[0:trim]:
+            del self[key]
+
+    def __getitem__ (self, key):
+        value = super(LFUCache, self).__getitem__(key)
+        value[0] += 1
+        return value[1]
+
+    def uses (self, key):
+        """Get number of uses for given key (without increasing the number of
+        uses)"""
+        return super(LFUCache, self).__getitem__(key)[0]
+
+    def get (self, key, def_val=None):
+        if key in self:
+            return self[key]
+        return def_val
+
+    def setdefault (self, key, def_val=None):
+        if key in self:
+            return self[key]
+        self[key] = def_val
+        return def_val
+
+    def items (self):
+        return [(key, value[1]) for key, value in super(LFUCache, self).items()]
+
+    def iteritems (self):
+        for key, value in super(LFUCache, self).iteritems():
+            yield (key, value[1])
+
+    def values (self):
+        return [value[1] for value in super(LFUCache, self).values()]
+
+    def itervalues (self):
+        for value in super(LFUCache, self).itervalues():
+            yield value[1]
+
+    def popitem (self):
+        key, value = super(LFUCache, self).popitem()
+        return (key, value[1])
+
+    def pop (self, key):
+        value = super(LFUCache, self).pop(key)
+        return value[1]
+
+
+
try: from collections import namedtuple except ImportError: diff --git a/tests/test_containers.py b/tests/test_containers.py index e2df08b2..ef06ba82 100644 --- a/tests/test_containers.py +++ b/tests/test_containers.py @@ -242,6 +242,48 @@ class TestCaselessSortedDict (unittest.TestCase): prev = key +class TestLFUCache (unittest.TestCase): + """ + Test LFU cache implementation. + """ + + def setUp (self): + """ + Set up self.d as empty LFU cache with default size of 1000.
+ """ + self.size = 1000 + self.d = linkcheck.containers.LFUCache(self.size) + + def test_num_uses (self): + self.assertTrue(not self.d) + self.d["a"] = 1 + self.assertTrue("a" in self.d) + self.assertEqual(self.d.uses("a"), 0) + a = self.d["a"] + self.assertEqual(self.d.uses("a"), 1) + + def test_values (self): + self.assertTrue(not self.d) + self.d["a"] = 1 + self.d["b"] = 2 + self.assertEqual(set([1, 2]), set(self.d.values())) + self.assertEqual(set([1, 2]), set(self.d.itervalues())) + + def test_popitem (self): + self.assertTrue(not self.d) + self.d["a"] = 42 + self.assertEqual(self.d.popitem(), ("a", 42)) + self.assertTrue(not self.d) + self.assertRaises(KeyError, self.d.popitem) + + def test_shrink (self): + self.assertTrue(not self.d) + for i in range(self.size): + self.d[i] = i + self.d[1001] = 1001 + self.assertTrue(len(self.d) <= self.size) + + class TestEnum (unittest.TestCase): def test_enum (self):