mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-19 22:01:00 +00:00
Use finer-grained robots.txt locks to improve lock contention.
This commit is contained in:
parent
677feab81f
commit
b56c054932
1 changed file with 29 additions and 18 deletions
47
linkcheck/cache/robots_txt.py
vendored
47
linkcheck/cache/robots_txt.py
vendored
|
|
@ -23,8 +23,9 @@ from ..decorators import synchronized
|
|||
from ..lock import get_lock
|
||||
|
||||
|
||||
# Lock objects guarding the robots.txt machinery.
# NOTE(review): the old single coarse lock (`_lock = get_lock("robots.txt")`)
# was replaced by two finer-grained locks to reduce lock contention:
#   cache_lock - guards the shared {roboturl -> RobotFileParser} cache
#   robot_lock - guards the per-URL lock table (RobotsTxt.roboturl_locks)
cache_lock = get_lock("robots.txt_cache_lock")
robot_lock = get_lock("robots.txt_robot_lock")
class RobotsTxt (object):
    """
    Thread-safe cache of downloaded robots.txt files.
    format: {cache key (string) -> robots.txt content (RobotFileParser)}

    Concurrency scheme: each robots.txt URL gets its own lock (see
    get_lock()), so two threads asking about different sites never block
    each other; the shared cache dict itself is protected by the
    module-level cache_lock only for the short lookup/store sections.
    """
    # User agent string used for can_fetch() / crawl-delay queries.
    useragent = str(configuration.UserAgent)

    def __init__ (self):
        """Initialize per-URL robots.txt cache."""
        # mapping {URL -> parsed robots.txt}
        self.cache = LFUCache(size=100)
        # cache statistics
        self.hits = self.misses = 0
        # mapping {robots.txt URL -> lock object}; guarded by robot_lock
        self.roboturl_locks = {}

    def allows_url (self, roboturl, url, proxy, user, password, callback=None):
        """Ask robots.txt allowance.

        Serializes concurrent requests for the same robots.txt URL with a
        per-URL lock, so each robots.txt file is downloaded at most once
        while requests for other hosts proceed in parallel.
        """
        with self.get_lock(roboturl):
            return self._allows_url(roboturl, url, proxy, user, password, callback)

    def _allows_url (self, roboturl, url, proxy, user, password, callback):
        """Check the cache, downloading and caching robots.txt on a miss.

        Must be called with the per-URL lock for roboturl held
        (see allows_url()).
        """
        with cache_lock:
            if roboturl in self.cache:
                self.hits += 1
                rp = self.cache[roboturl]
                return rp.can_fetch(self.useragent, url)
            self.misses += 1
        # Download outside of cache_lock; the per-URL lock held by the
        # caller prevents duplicate downloads of the same robots.txt.
        rp = robotparser2.RobotFileParser(proxy=proxy, user=user,
                                          password=password)
        rp.set_url(roboturl)
        rp.read()
        if hasattr(callback, '__call__'):
            # Report this host's crawl delay to the caller.
            parts = urlutil.url_split(rp.url)
            host = "%s:%d" % (parts[1], parts[2])
            wait = rp.get_crawldelay(self.useragent)
            callback(host, wait)
        with cache_lock:
            self.cache[roboturl] = rp
        return rp.can_fetch(self.useragent, url)

    @synchronized(robot_lock)
    def get_lock(self, roboturl):
        """Return the lock dedicated to this robots.txt URL, creating and
        registering it on first use."""
        return self.roboturl_locks.setdefault(roboturl, get_lock(roboturl))
|
||||
|
|
|
|||
Loading…
Reference in a new issue