mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-18 19:41:06 +00:00
limit cache sizes
This commit is contained in:
parent
57397e938b
commit
57e3b05c88
7 changed files with 155 additions and 29 deletions
|
|
@ -1,10 +1,14 @@
|
|||
5.3 "" (released xx.xx.2010)
|
||||
|
||||
Fixes:
|
||||
- ftp: fix support for FTP ports other than the default.
|
||||
- ftp: Fix support for FTP ports other than the default.
|
||||
|
||||
Changes:
|
||||
- checking: Caches are now size-restricted to limit the memory
|
||||
usage.
|
||||
|
||||
Features:
|
||||
- ftp: detect and support UTF-8 filename encoding capability of FTP
|
||||
- ftp: Detect and support UTF-8 filename encoding capability of FTP
|
||||
servers.
|
||||
|
||||
|
||||
|
|
|
|||
16
linkcheck/cache/addrinfo.py
vendored
16
linkcheck/cache/addrinfo.py
vendored
|
|
@ -20,21 +20,23 @@ Cache for DNS lookups.
|
|||
import socket
|
||||
import sys
|
||||
from ..lock import get_lock
|
||||
from ..containers import LFUCache
|
||||
from ..decorators import synchronized
|
||||
|
||||
_lock = get_lock("addrinfo")
|
||||
addrinfos = {}
|
||||
addrinfos = LFUCache(size=10000)
|
||||
|
||||
@synchronized(_lock)
|
||||
def getaddrinfo (host, port):
|
||||
key = str(host) + u":" + str(port)
|
||||
if key not in addrinfos:
|
||||
key = u"%s:%s" % (unicode(host), unicode(port))
|
||||
if key in addrinfos:
|
||||
value = addrinfos[key]
|
||||
else:
|
||||
try:
|
||||
addrinfos[key] = \
|
||||
socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
|
||||
value = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
|
||||
except socket.error:
|
||||
addrinfos[key] = sys.exc_info()[1]
|
||||
value = addrinfos[key]
|
||||
value = sys.exc_info()[1]
|
||||
addrinfos[key] = value
|
||||
if isinstance(value, Exception):
|
||||
raise value
|
||||
return value
|
||||
|
|
|
|||
21
linkcheck/cache/geoip.py
vendored
21
linkcheck/cache/geoip.py
vendored
|
|
@ -19,6 +19,7 @@ Store and retrieve country names for IPs.
|
|||
"""
|
||||
import os
|
||||
from ..lock import get_lock
|
||||
from ..containers import LFUCache
|
||||
from ..decorators import synchronized
|
||||
|
||||
# I don't know if the geoip library is already thread-safe, but
|
||||
|
|
@ -37,19 +38,27 @@ except ImportError:
|
|||
pass
|
||||
|
||||
|
||||
country_cache = LFUCache(size=1000)
|
||||
|
||||
@synchronized(_lock)
|
||||
def get_country (host):
|
||||
"""
|
||||
Get translated country name.
|
||||
"""Get translated country name.
|
||||
|
||||
@return: country string or None
|
||||
"""
|
||||
if geoip is None:
|
||||
# no geoip available
|
||||
return None
|
||||
c = geoip.country_code_by_name(host)
|
||||
if c and c in countries:
|
||||
return "%s, %s" % (c, countries[c])
|
||||
return None
|
||||
if host in country_cache:
|
||||
value = country_cache[host]
|
||||
else:
|
||||
c = geoip.country_code_by_name(host)
|
||||
if c and c in countries:
|
||||
value = "%s, %s" % (c, countries[c])
|
||||
else:
|
||||
value = None
|
||||
country_cache[host] = value
|
||||
return value
|
||||
|
||||
|
||||
# GeoIP country map with {short name -> translated full name} entries
|
||||
|
|
|
|||
3
linkcheck/cache/robots_txt.py
vendored
3
linkcheck/cache/robots_txt.py
vendored
|
|
@ -18,6 +18,7 @@
|
|||
Cache robots.txt contents.
|
||||
"""
|
||||
from .. import robotparser2, configuration, url as urlutil
|
||||
from ..containers import LFUCache
|
||||
from ..decorators import synchronized
|
||||
from ..lock import get_lock
|
||||
|
||||
|
|
@ -33,7 +34,7 @@ class RobotsTxt (object):
|
|||
"""
|
||||
|
||||
def __init__ (self):
|
||||
self.cache = {}
|
||||
self.cache = LFUCache(size=100)
|
||||
|
||||
@synchronized(_lock)
|
||||
def allows_url (self, roboturl, url, proxy, user, password, callback=None):
|
||||
|
|
|
|||
15
linkcheck/cache/urlqueue.py
vendored
15
linkcheck/cache/urlqueue.py
vendored
|
|
@ -22,6 +22,7 @@ import threading
|
|||
import collections
|
||||
from time import time as _time
|
||||
from .. import log, LOG_CACHE
|
||||
from ..containers import LFUCache
|
||||
|
||||
|
||||
class Timeout (StandardError):
|
||||
|
|
@ -54,23 +55,19 @@ class UrlQueue (object):
|
|||
self.unfinished_tasks = 0
|
||||
self.finished_tasks = 0
|
||||
self.in_progress = {}
|
||||
self.checked = {}
|
||||
self.checked = LFUCache(size=10000)
|
||||
self.shutdown = False
|
||||
self.unsorted = 0
|
||||
|
||||
def qsize (self):
|
||||
"""Return the approximate size of the queue (not reliable!)."""
|
||||
self.mutex.acquire()
|
||||
n = len(self.queue)
|
||||
self.mutex.release()
|
||||
return n
|
||||
with self.mutex:
|
||||
return len(self.queue)
|
||||
|
||||
def empty (self):
|
||||
"""Return True if the queue is empty, False otherwise (not reliable!)."""
|
||||
self.mutex.acquire()
|
||||
n = self._empty()
|
||||
self.mutex.release()
|
||||
return n
|
||||
with self.mutex:
|
||||
return self._empty()
|
||||
|
||||
def _empty (self):
|
||||
return not self.queue
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2004-2009 Bastian Kleineidam
|
||||
# Copyright (C) 2004-2010 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
@ -110,9 +110,6 @@ class ListDict (dict):
|
|||
class CaselessDict (dict):
|
||||
"""A dictionary ignoring the case of keys (which must be strings)."""
|
||||
|
||||
def __init__ (self):
|
||||
dict.__init__(self)
|
||||
|
||||
def __getitem__ (self, key):
|
||||
assert isinstance(key, basestring)
|
||||
return dict.__getitem__(self, key.lower())
|
||||
|
|
@ -166,6 +163,80 @@ class CaselessSortedDict (CaselessDict):
|
|||
return ((x, self[x]) for x in self.keys())
|
||||
|
||||
|
||||
class LFUCache (dict):
    """Size-limited cache which purges the least frequently used items.

    Each entry is stored in the underlying dict as a two-element list
    ``[number_of_uses, value]``. The accessors overridden in this class
    hide that wrapper and hand out the plain value. Dict methods that are
    not overridden here (for example update() or copy()) are inherited
    unchanged from dict and therefore see the raw stored entries.
    """

    def __init__ (self, size=1000):
        """Initialize an empty cache.

        @param size: number of entries above which the cache is shrunk
        @raise ValueError: if size is smaller than one
        """
        super(LFUCache, self).__init__()
        if size < 1:
            raise ValueError("invalid cache size %d" % size)
        self.size = size

    def __setitem__ (self, key, val):
        """Store given key/value.

        Overwriting an existing key keeps its number of uses. Adding a
        new key starts it at zero uses and shrinks the cache if the size
        limit is exceeded.
        """
        if key in self:
            # Use the raw dict lookup: self[key] would return the plain
            # value (not the [uses, value] entry) and count as a use.
            super(LFUCache, self).__getitem__(key)[1] = val
        else:
            super(LFUCache, self).__setitem__(key, [0, val])
            # check for size limit
            if len(self) > self.size:
                self.shrink()

    def shrink (self):
        """Purge ca. 5% of the entries, least frequently used ones first.

        At least as many entries as exceed the size limit are removed, so
        the limit also holds for caches too small for 5% to round up to
        one entry.
        """
        num_entries = len(self)
        trim = max(num_entries - self.size, int(0.05 * num_entries))
        if trim > 0:
            entries = super(LFUCache, self).items()
            # Order by use counter only; cached values and keys need not
            # be comparable with each other.
            by_uses = sorted(entries, key=lambda item: item[1][0])
            for key, _entry in by_uses[:trim]:
                del self[key]

    def __getitem__ (self, key):
        """Return the value stored for key and count the access as a use.

        @raise KeyError: if key is not in the cache
        """
        entry = super(LFUCache, self).__getitem__(key)
        entry[0] += 1
        return entry[1]

    def uses (self, key):
        """Get number of uses for given key (without increasing the number of
        uses).

        @raise KeyError: if key is not in the cache
        """
        return super(LFUCache, self).__getitem__(key)[0]

    def get (self, key, def_val=None):
        """Return the value for key (counted as a use), or def_val if the
        key is not in the cache."""
        if key in self:
            return self[key]
        return def_val

    def setdefault (self, key, def_val=None):
        """Return the value for key (counted as a use). If the key is
        missing, store def_val under it and return def_val."""
        if key in self:
            return self[key]
        self[key] = def_val
        return def_val

    def items (self):
        """Return a list of (key, value) pairs without counting uses."""
        return [(key, entry[1])
                for key, entry in super(LFUCache, self).items()]

    def iteritems (self):
        """Iterate over (key, value) pairs without counting uses."""
        # Walk the keys and look entries up directly instead of relying on
        # dict.iteritems(), which only exists on Python 2.
        raw_get = super(LFUCache, self).__getitem__
        for key in super(LFUCache, self).__iter__():
            yield (key, raw_get(key)[1])

    def values (self):
        """Return a list of the cached values without counting uses."""
        return [entry[1] for entry in super(LFUCache, self).values()]

    def itervalues (self):
        """Iterate over the cached values without counting uses."""
        raw_get = super(LFUCache, self).__getitem__
        for key in super(LFUCache, self).__iter__():
            yield raw_get(key)[1]

    def popitem (self):
        """Remove and return an arbitrary (key, value) pair.

        @raise KeyError: if the cache is empty
        """
        key, entry = super(LFUCache, self).popitem()
        return (key, entry[1])

    def pop (self, key, *default):
        """Remove key and return its value.

        @param key: the key to remove
        @param default: optional single fallback returned when key is
            missing
        @raise KeyError: if key is missing and no default is given
        @raise TypeError: if more than one default is given
        """
        if len(default) > 1:
            raise TypeError("pop expected at most 2 arguments, got %d" %
                            (len(default) + 1))
        try:
            entry = super(LFUCache, self).pop(key)
        except KeyError:
            if default:
                # The fallback is a plain value, not a [uses, value] entry.
                return default[0]
            raise
        return entry[1]
|
||||
|
||||
|
||||
try:
|
||||
from collections import namedtuple
|
||||
except ImportError:
|
||||
|
|
|
|||
|
|
@ -242,6 +242,48 @@ class TestCaselessSortedDict (unittest.TestCase):
|
|||
prev = key
|
||||
|
||||
|
||||
class TestLFUCache (unittest.TestCase):
    """Exercise the LFU cache container."""

    def setUp (self):
        """Create self.d as an empty LFU cache limited to self.size
        (1000) entries."""
        self.size = 1000
        self.d = linkcheck.containers.LFUCache(self.size)

    def test_num_uses (self):
        """Storing a key does not count as a use; reading it does."""
        cache = self.d
        self.assertFalse(cache)
        cache["a"] = 1
        self.assertTrue("a" in cache)
        self.assertEqual(cache.uses("a"), 0)
        # The lookup result is irrelevant, only its effect on the counter.
        cache["a"]
        self.assertEqual(cache.uses("a"), 1)

    def test_values (self):
        """values() and itervalues() yield the plain stored values."""
        cache = self.d
        self.assertFalse(cache)
        cache["a"] = 1
        cache["b"] = 2
        expected = set([1, 2])
        self.assertEqual(expected, set(cache.values()))
        self.assertEqual(expected, set(cache.itervalues()))

    def test_popitem (self):
        """popitem() returns the plain pair and fails on an empty cache."""
        cache = self.d
        self.assertFalse(cache)
        cache["a"] = 42
        self.assertEqual(cache.popitem(), ("a", 42))
        self.assertFalse(cache)
        self.assertRaises(KeyError, cache.popitem)

    def test_shrink (self):
        """Adding one entry beyond the limit keeps the cache within it."""
        cache = self.d
        self.assertFalse(cache)
        for number in range(self.size):
            cache[number] = number
        cache[1001] = 1001
        self.assertTrue(len(cache) <= self.size)
|
||||
|
||||
|
||||
class TestEnum (unittest.TestCase):
|
||||
|
||||
def test_enum (self):
|
||||
|
|
|
|||
Loading…
Reference in a new issue