limit cache sizes

This commit is contained in:
Bastian Kleineidam 2010-03-10 00:00:12 +01:00
parent 57397e938b
commit 57e3b05c88
7 changed files with 155 additions and 29 deletions

View file

@ -1,10 +1,14 @@
5.3 "" (released xx.xx.2010)
Fixes:
- ftp: fix support for FTP ports other than the default.
- ftp: Fix support for FTP ports other than the default.
Changes:
- checking: Caches are now size-restricted to limit the memory
usage.
Features:
- ftp: detect and support UTF-8 filename encoding capability of FTP
- ftp: Detect and support UTF-8 filename encoding capability of FTP
servers.

View file

@ -20,21 +20,23 @@ Cache for DNS lookups.
import socket
import sys
from ..lock import get_lock
from ..containers import LFUCache
from ..decorators import synchronized
_lock = get_lock("addrinfo")
addrinfos = {}
addrinfos = LFUCache(size=10000)
@synchronized(_lock)
def getaddrinfo (host, port):
key = str(host) + u":" + str(port)
if key not in addrinfos:
key = u"%s:%s" % (unicode(host), unicode(port))
if key in addrinfos:
value = addrinfos[key]
else:
try:
addrinfos[key] = \
socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
value = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
except socket.error:
addrinfos[key] = sys.exc_info()[1]
value = addrinfos[key]
value = sys.exc_info()[1]
addrinfos[key] = value
if isinstance(value, Exception):
raise value
return value

View file

@ -19,6 +19,7 @@ Store and retrieve country names for IPs.
"""
import os
from ..lock import get_lock
from ..containers import LFUCache
from ..decorators import synchronized
# I don't know if the geoip library is already thread-safe, but
@ -37,19 +38,27 @@ except ImportError:
pass
country_cache = LFUCache(size=1000)
@synchronized(_lock)
def get_country (host):
"""
Get translated country name.
"""Get translated country name.
@return: country string or None
"""
if geoip is None:
# no geoip available
return None
c = geoip.country_code_by_name(host)
if c and c in countries:
return "%s, %s" % (c, countries[c])
return None
if host in country_cache:
value = country_cache[host]
else:
c = geoip.country_code_by_name(host)
if c and c in countries:
value = "%s, %s" % (c, countries[c])
else:
value = None
country_cache[host] = value
return value
# GeoIP country map with {short name -> translated full name} entries

View file

@ -18,6 +18,7 @@
Cache robots.txt contents.
"""
from .. import robotparser2, configuration, url as urlutil
from ..containers import LFUCache
from ..decorators import synchronized
from ..lock import get_lock
@ -33,7 +34,7 @@ class RobotsTxt (object):
"""
def __init__ (self):
self.cache = {}
self.cache = LFUCache(size=100)
@synchronized(_lock)
def allows_url (self, roboturl, url, proxy, user, password, callback=None):

View file

@ -22,6 +22,7 @@ import threading
import collections
from time import time as _time
from .. import log, LOG_CACHE
from ..containers import LFUCache
class Timeout (StandardError):
@ -54,23 +55,19 @@ class UrlQueue (object):
self.unfinished_tasks = 0
self.finished_tasks = 0
self.in_progress = {}
self.checked = {}
self.checked = LFUCache(size=10000)
self.shutdown = False
self.unsorted = 0
def qsize (self):
"""Return the approximate size of the queue (not reliable!)."""
self.mutex.acquire()
n = len(self.queue)
self.mutex.release()
return n
with self.mutex:
return len(self.queue)
def empty (self):
"""Return True if the queue is empty, False otherwise (not reliable!)."""
self.mutex.acquire()
n = self._empty()
self.mutex.release()
return n
with self.mutex:
return self._empty()
def _empty (self):
return not self.queue

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2009 Bastian Kleineidam
# Copyright (C) 2004-2010 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -110,9 +110,6 @@ class ListDict (dict):
class CaselessDict (dict):
"""A dictionary ignoring the case of keys (which must be strings)."""
def __init__ (self):
dict.__init__(self)
def __getitem__ (self, key):
assert isinstance(key, basestring)
return dict.__getitem__(self, key.lower())
@ -166,6 +163,80 @@ class CaselessSortedDict (CaselessDict):
return ((x, self[x]) for x in self.keys())
class LFUCache (dict):
    """Size-limited cache which purges the least frequently used items.

    Every entry is stored internally as a two-element list
    ``[num_used, value]``. The mapping methods overridden here hide that
    wrapper and only hand out the plain value.
    """

    def __init__ (self, size=1000):
        """Initialize an empty cache.

        @param size: maximum number of entries to keep, at least 1
        @raise ValueError: if size is smaller than one
        """
        super(LFUCache, self).__init__()
        if size < 1:
            raise ValueError("invalid cache size %d" % size)
        self.size = size

    def __setitem__ (self, key, val):
        """Store given key/value.

        Overwriting an existing key keeps its number of uses, a new key
        starts with zero uses. The cache is shrunk as soon as it holds
        more than self.size entries.
        """
        if key in self:
            # Read the raw entry: going through self[key] would count as
            # a use and return the stored value instead of the counter.
            num_used = super(LFUCache, self).__getitem__(key)[0]
        else:
            num_used = 0
        super(LFUCache, self).__setitem__(key, [num_used, val])
        # check for size limit
        if len(self) > self.size:
            self.shrink()

    def shrink (self):
        """Purge the least frequently used entries.

        Removes ca. 5% of the entries, and always enough of them to get
        back to the configured size limit.
        """
        num_entries = len(self)
        trim = max(num_entries - self.size, int(0.05 * num_entries))
        if trim > 0:
            # Sort by the use counter only; comparing stored values or
            # keys on a tie could fail for types without an ordering.
            entries = sorted(super(LFUCache, self).items(),
                             key=lambda item: item[1][0])
            for key, _entry in entries[:trim]:
                del self[key]

    def __getitem__ (self, key):
        """Return the value stored for key and count the access as a use.

        @raise KeyError: if key is not in the cache
        """
        entry = super(LFUCache, self).__getitem__(key)
        entry[0] += 1
        return entry[1]

    def uses (self, key):
        """Get number of uses for given key (without increasing the number of
        uses).

        @raise KeyError: if key is not in the cache
        """
        return super(LFUCache, self).__getitem__(key)[0]

    def get (self, key, def_val=None):
        """Return the value for key (counting a use), or def_val if the
        key is not cached."""
        if key in self:
            return self[key]
        return def_val

    def setdefault (self, key, def_val=None):
        """Return the value for key (counting a use); if the key is not
        cached, store def_val under it and return def_val."""
        if key in self:
            return self[key]
        self[key] = def_val
        return def_val

    def items (self):
        """Return a list of (key, value) pairs without counting uses."""
        return [(key, entry[1])
                for key, entry in super(LFUCache, self).items()]

    def iteritems (self):
        """Iterate over (key, value) pairs without counting uses."""
        raw_get = super(LFUCache, self).__getitem__
        for key in self:
            yield (key, raw_get(key)[1])

    def values (self):
        """Return a list of the stored values without counting uses."""
        return [entry[1] for entry in super(LFUCache, self).values()]

    def itervalues (self):
        """Iterate over the stored values without counting uses."""
        raw_get = super(LFUCache, self).__getitem__
        for key in self:
            yield raw_get(key)[1]

    def popitem (self):
        """Remove and return an arbitrary (key, value) pair.

        @raise KeyError: if the cache is empty
        """
        key, entry = super(LFUCache, self).popitem()
        return (key, entry[1])

    def pop (self, key, *default):
        """Remove key and return its value.

        @param default: optional single fallback returned when key is
            not cached
        @raise KeyError: if key is not cached and no default was given
        @raise TypeError: if more than one default is given
        """
        if len(default) > 1:
            raise TypeError("pop expected at most 2 arguments, got %d" %
                            (len(default) + 1))
        try:
            entry = super(LFUCache, self).pop(key)
        except KeyError:
            # A caller-supplied default is a plain value, not an entry.
            if default:
                return default[0]
            raise
        return entry[1]
try:
from collections import namedtuple
except ImportError:

View file

@ -242,6 +242,48 @@ class TestCaselessSortedDict (unittest.TestCase):
prev = key
class TestLFUCache (unittest.TestCase):
    """
    Exercise the LFU cache container.
    """

    def setUp (self):
        """
        Create a fresh, empty LFU cache in self.d limited to 1000 entries.
        """
        self.size = 1000
        self.d = linkcheck.containers.LFUCache(self.size)

    def test_num_uses (self):
        """Storing a key does not count as a use, reading it does."""
        cache = self.d
        self.assertFalse(cache)
        cache["a"] = 1
        self.assertTrue("a" in cache)
        self.assertEqual(cache.uses("a"), 0)
        # a plain lookup is what bumps the use counter
        cache["a"]
        self.assertEqual(cache.uses("a"), 1)

    def test_values (self):
        """values() and itervalues() hand out the stored values."""
        cache = self.d
        self.assertFalse(cache)
        cache["a"] = 1
        cache["b"] = 2
        expected = set([1, 2])
        self.assertEqual(expected, set(cache.values()))
        self.assertEqual(expected, set(cache.itervalues()))

    def test_popitem (self):
        """popitem() returns the stored pair and fails on an empty cache."""
        cache = self.d
        self.assertFalse(cache)
        cache["a"] = 42
        self.assertEqual(cache.popitem(), ("a", 42))
        self.assertFalse(cache)
        self.assertRaises(KeyError, cache.popitem)

    def test_shrink (self):
        """Adding one entry beyond the limit keeps the cache within it."""
        cache = self.d
        self.assertFalse(cache)
        for number in range(self.size):
            cache[number] = number
        cache[1001] = 1001
        self.assertTrue(len(cache) <= self.size)
class TestEnum (unittest.TestCase):
def test_enum (self):