mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-19 22:01:00 +00:00
Use finer-grained robots.txt locks to improve lock contention.
This commit is contained in:
parent
677feab81f
commit
b56c054932
1 changed file with 29 additions and 18 deletions
47
linkcheck/cache/robots_txt.py
vendored
47
linkcheck/cache/robots_txt.py
vendored
|
|
@ -23,8 +23,9 @@ from ..decorators import synchronized
|
|||
from ..lock import get_lock
|
||||
|
||||
|
||||
# Lock objects guarding the robots.txt machinery.
# NOTE(review): the old single coarse lock (`_lock = get_lock("robots.txt")`)
# was replaced by two finer-grained locks to reduce lock contention:
#   cache_lock - guards the shared {roboturl -> RobotFileParser} cache
#   robot_lock - guards the per-URL lock table (RobotsTxt.roboturl_locks)
cache_lock = get_lock("robots.txt_cache_lock")
robot_lock = get_lock("robots.txt_robot_lock")
class RobotsTxt (object):
    """
    Thread-safe cache of downloaded robots.txt files.
    format: {cache key (string) -> robots.txt content (RobotFileParser)}

    Concurrency scheme: each robots.txt URL gets its own lock (see
    get_lock()), so two threads asking about different sites never block
    each other; the shared cache dict itself is protected by the
    module-level cache_lock only for the short lookup/store sections.
    """
    # User agent string used for can_fetch() / crawl-delay queries.
    useragent = str(configuration.UserAgent)

    def __init__ (self):
        """Initialize per-URL robots.txt cache."""
        # mapping {URL -> parsed robots.txt}
        self.cache = LFUCache(size=100)
        # cache statistics
        self.hits = self.misses = 0
        # mapping {robots.txt URL -> lock object}; guarded by robot_lock
        self.roboturl_locks = {}

    def allows_url (self, roboturl, url, proxy, user, password, callback=None):
        """Ask robots.txt allowance.

        Serializes concurrent requests for the same robots.txt URL with a
        per-URL lock, so each robots.txt file is downloaded at most once
        while requests for other hosts proceed in parallel.
        """
        with self.get_lock(roboturl):
            return self._allows_url(roboturl, url, proxy, user, password, callback)

    def _allows_url (self, roboturl, url, proxy, user, password, callback):
        """Check the cache, downloading and caching robots.txt on a miss.

        Must be called with the per-URL lock for roboturl held
        (see allows_url()).
        """
        with cache_lock:
            if roboturl in self.cache:
                self.hits += 1
                rp = self.cache[roboturl]
                return rp.can_fetch(self.useragent, url)
            self.misses += 1
        # Download outside of cache_lock; the per-URL lock held by the
        # caller prevents duplicate downloads of the same robots.txt.
        rp = robotparser2.RobotFileParser(proxy=proxy, user=user,
                                          password=password)
        rp.set_url(roboturl)
        rp.read()
        if hasattr(callback, '__call__'):
            # Report this host's crawl delay to the caller.
            parts = urlutil.url_split(rp.url)
            host = "%s:%d" % (parts[1], parts[2])
            wait = rp.get_crawldelay(self.useragent)
            callback(host, wait)
        with cache_lock:
            self.cache[roboturl] = rp
        return rp.can_fetch(self.useragent, url)

    @synchronized(robot_lock)
    def get_lock(self, roboturl):
        """Return the lock dedicated to this robots.txt URL, creating and
        registering it on first use."""
        return self.roboturl_locks.setdefault(roboturl, get_lock(roboturl))
|
||||
|
|
|
|||
Loading…
Reference in a new issue