From 811f5492c48ee91d0df192e3e57d25c15d0105e5 Mon Sep 17 00:00:00 2001
From: calvin
Date: Tue, 16 May 2006 22:56:13 +0000
Subject: [PATCH] fix --pause to delay requests to the same host

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3222 e7d03fd6-7b0d-0410-9947-9c21f3af8025
---
 TODO                           |  3 ---
 linkcheck/cache/connection.py  | 20 ++++++++++++++++++--
 linkcheck/checker/urlbase.py   | 10 +++-------
 linkcheck/director/__init__.py |  2 +-
 4 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/TODO b/TODO
index db5bdf0d..ca4dc497 100644
--- a/TODO
+++ b/TODO
@@ -1,6 +1,3 @@
-- [BUGFIX] The --pause option really meant to pause between two
-  subsequent requests to the _same_ host.
-
 - [FEATURE] Add robots.txt crawl-delay support in linkchecker
 
 - [TEST] Add test for cookie file parsing
diff --git a/linkcheck/cache/connection.py b/linkcheck/cache/connection.py
index 382e1907..83c4ef0b 100644
--- a/linkcheck/cache/connection.py
+++ b/linkcheck/cache/connection.py
@@ -19,7 +19,9 @@
 Store and retrieve open connections.
 """
 import time
+import linkcheck
 import linkcheck.lock
+import linkcheck.log
 from linkcheck.decorators import synchronized
 
 _lock = linkcheck.lock.get_lock("connection")
@@ -30,7 +32,7 @@ class ConnectionPool (object):
     Thread-safe cache, storing a set of connections for URL retrieval.
     """
 
-    def __init__ (self):
+    def __init__ (self, wait=0):
         """
         Initialize an empty connection dictionary which will have
         entries of the form::
@@ -47,6 +49,9 @@ class ConnectionPool (object):
         # open connections
         # {(type, host, user, pass) -> [connection, status, expiration time]}
         self.connections = {}
+        # {host -> due time}
+        self.times = {}
+        self.wait = wait
 
     @synchronized(_lock)
     def add (self, key, conn, timeout):
@@ -61,14 +66,25 @@ class ConnectionPool (object):
         """
         Get open connection if available, for at most 30 seconds.
 
+        @param key: tuple (type, host, user, pass)
         @return: Open connection object or None if no connection is available.
         @rtype: None or FTPConnection or HTTP(S)Connection
         """
+        host = key[1]
+        t = time.time()
+        if host in self.times:
+            due_time = self.times[host]
+            if due_time > t:
+                wait = due_time - t
+                assert None == linkcheck.log.debug(linkcheck.LOG_CACHE,
+                    "waiting for %.01f seconds on connection to %s", wait, host)
+                time.sleep(wait)
+                t = time.time()
+        self.times[host] = t + self.wait
         if key not in self.connections:
             # not found
             return None
         conn_data = self.connections[key]
-        t = time.time()
         if t > conn_data[2]:
             # timed out
             try:
diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py
index cb82b243..645b0eca 100644
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@@ -381,12 +381,8 @@ class UrlBase (object):
         """
         assert None == linkcheck.log.debug(linkcheck.LOG_CHECK,
             "Checking %s", self)
-        wait = self.aggregate.config['wait']
-        if self.recursion_level and wait:
-            assert None == linkcheck.log.debug(linkcheck.LOG_CHECK,
-                "sleeping for %d seconds", wait)
-            time.sleep(wait)
-        t = time.time()
+        # start time for check
+        check_start = time.time()
         self.set_extern(self.url)
         if self.extern[0] and self.extern[1]:
             self.add_info(_("Outside of domain filter, checked only syntax."))
@@ -423,7 +419,7 @@ class UrlBase (object):
             self.set_result(linkcheck.strformat.unicode_safe(value),
                             valid=False)
 
-        self.checktime = time.time() - t
+        self.checktime = time.time() - check_start
         # check recursion
         try:
             if self.allows_recursion():
diff --git a/linkcheck/director/__init__.py b/linkcheck/director/__init__.py
index 7e4acedb..1ba32549 100644
--- a/linkcheck/director/__init__.py
+++ b/linkcheck/director/__init__.py
@@ -59,7 +59,7 @@ def check_urls (aggregate):
 
 def get_aggregate (config):
     urlqueue = linkcheck.cache.urlqueue.UrlQueue()
-    connections = linkcheck.cache.connection.ConnectionPool()
+    connections = linkcheck.cache.connection.ConnectionPool(wait=config["wait"])
     cookies = linkcheck.cache.cookie.CookieJar()
     robots_txt = linkcheck.cache.robots_txt.RobotsTxt()
     return aggregator.Aggregate(config, urlqueue, connections,
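
Note on the technique: the patch keeps a {host -> due time} map and makes
ConnectionPool.get() sleep until the host's due time has passed, so --pause
throttles per host instead of per request. Below is a minimal standalone
sketch of that same idea, written for modern Python with an explicit lock in
place of the project's @synchronized decorator. The class name
HostRateLimiter, its throttle() method, and the example hosts are invented
for illustration and are not part of linkchecker.

import threading
import time


class HostRateLimiter (object):
    """Delay successive requests to the same host by at least `wait`
    seconds, using a {host -> due time} map as in the patch above.
    Illustrative sketch only, not linkchecker code."""

    def __init__ (self, wait=0):
        # minimum number of seconds between two requests to one host
        self.wait = wait
        # {host -> earliest allowed time of the next request}
        self.times = {}
        self.lock = threading.Lock()

    def throttle (self, host):
        """Sleep until a request to `host` is allowed, then reserve
        the next time slot for it."""
        with self.lock:
            t = time.time()
            due_time = self.times.get(host)
            if due_time is not None and due_time > t:
                # as in ConnectionPool.get(), the sleep happens while
                # the lock is held
                time.sleep(due_time - t)
                t = time.time()
            self.times[host] = t + self.wait


if __name__ == "__main__":
    limiter = HostRateLimiter(wait=2)
    limiter.throttle("example.com")   # first request: no delay
    limiter.throttle("example.org")   # different host: no delay
    limiter.throttle("example.com")   # same host again: sleeps ~2 seconds

As in the patch, where get() runs under @synchronized(_lock), the sleep
happens while the lock is held, so a pause for one host also blocks
connection handout for every other host during that interval.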