Use finer-grained robots.txt locks to reduce lock contention.

This commit is contained in:
Bastian Kleineidam 2012-10-01 13:29:29 +02:00
parent 677feab81f
commit b56c054932

View file

@@ -23,8 +23,9 @@ from ..decorators import synchronized
from ..lock import get_lock
# lock for caching
_lock = get_lock("robots.txt")
# lock objects
cache_lock = get_lock("robots.txt_cache_lock")
robot_lock = get_lock("robots.txt_robot_lock")
class RobotsTxt (object):
    """
    Thread-safe cache of downloaded robots.txt files.
    format: {cache key (string) -> robots.txt content (RobotFileParser)}
    """

    # User agent string presented when checking robots.txt allowance.
    useragent = str(configuration.UserAgent)

    def __init__ (self):
        """Initialize per-URL robots.txt cache."""
        # mapping {URL -> parsed robots.txt}
        self.cache = LFUCache(size=100)
        # cache statistics
        self.hits = self.misses = 0
        # mapping {robots.txt URL -> lock object}; access is guarded by
        # robot_lock (see get_lock)
        self.roboturl_locks = {}

    def allows_url (self, roboturl, url, proxy, user, password, callback=None):
        """Ask robots.txt allowance.

        Serializes on a per-robots.txt-URL lock so that only one thread
        at a time downloads or consults a given robots.txt, while requests
        for different robots.txt URLs proceed concurrently.
        """
        with self.get_lock(roboturl):
            return self._allows_url(roboturl, url, proxy, user, password, callback)

    def _allows_url (self, roboturl, url, proxy, user, password, callback):
        """Ask robots.txt allowance; assumes the per-URL lock is held."""
        with cache_lock:
            if roboturl in self.cache:
                self.hits += 1
                rp = self.cache[roboturl]
                return rp.can_fetch(self.useragent, url)
            self.misses += 1
        # Download and parse outside of cache_lock so a slow fetch does
        # not block threads working on other robots.txt URLs.
        rp = robotparser2.RobotFileParser(proxy=proxy, user=user,
          password=password)
        rp.set_url(roboturl)
        rp.read()
        if hasattr(callback, '__call__'):
            # report the crawl delay for this host to the callback
            parts = urlutil.url_split(rp.url)
            host = "%s:%d" % (parts[1], parts[2])
            wait = rp.get_crawldelay(self.useragent)
            callback(host, wait)
        with cache_lock:
            self.cache[roboturl] = rp
        return rp.can_fetch(self.useragent, url)

    @synchronized(robot_lock)
    def get_lock(self, roboturl):
        """Return the lock dedicated to the given robots.txt URL,
        creating it on first use."""
        return self.roboturl_locks.setdefault(roboturl, get_lock(roboturl))