fix --pause to delay requests to the same host

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3222 e7d03fd6-7b0d-0410-9947-9c21f3af8025
calvin 2006-05-16 22:56:13 +00:00
parent 2383915730
commit 811f5492c4
4 changed files with 22 additions and 13 deletions

TODO

@@ -1,6 +1,3 @@
-- [BUGFIX] The --pause option really meant to pause between two
-  subsequent requests to the _same_ host.
-- [FEATURE] Add robots.txt crawl-delay support in linkchecker
 - [TEST] Add test for cookie file parsing

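The patch below implements the bugfix by keeping a per-host "due time" table in the connection pool. A minimal standalone sketch of that pattern (class and method names here are illustrative, not taken from the codebase):

    import time

    class HostThrottle (object):
        """Space out requests so one host is contacted at most once per wait seconds."""

        def __init__ (self, wait=0):
            self.wait = wait  # minimum pause between two requests to the same host
            self.times = {}   # {host -> earliest time the next request may start}

        def delay (self, host):
            """Sleep until the given host may be contacted again."""
            t = time.time()
            due_time = self.times.get(host, t)
            if due_time > t:
                time.sleep(due_time - t)
                t = time.time()
            # schedule the earliest start for the next request to this host
            self.times[host] = t + self.wait

Unlike a global pause, requests to different hosts never delay each other; only back-to-back requests to the same host sleep. The real pool additionally serializes access with a lock, as the @synchronized decorator in the diff below shows.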
linkcheck/cache/connection.py

@@ -19,7 +19,9 @@ Store and retrieve open connections.
 """
 import time
+import linkcheck
 import linkcheck.lock
+import linkcheck.log
 from linkcheck.decorators import synchronized


 _lock = linkcheck.lock.get_lock("connection")
@@ -30,7 +32,7 @@ class ConnectionPool (object):
     Thread-safe cache, storing a set of connections for URL retrieval.
     """

-    def __init__ (self):
+    def __init__ (self, wait=0):
         """
         Initialize an empty connection dictionary which will have entries
         of the form::
@@ -47,6 +49,9 @@ class ConnectionPool (object):
         # open connections
         # {(type, host, user, pass) -> [connection, status, expiration time]}
         self.connections = {}
+        # {host -> due time}
+        self.times = {}
+        self.wait = wait

     @synchronized(_lock)
     def add (self, key, conn, timeout):
@@ -61,14 +66,25 @@ class ConnectionPool (object):
         """
         Get open connection if available, for at most 30 seconds.
         @param key: tuple (type, host, user, pass)
         @return: Open connection object or None if no connection is available.
         @rtype: None or FTPConnection or HTTP(S)Connection
         """
+        host = key[1]
+        t = time.time()
+        if host in self.times:
+            due_time = self.times[host]
+            if due_time > t:
+                wait = due_time - t
+                assert None == linkcheck.log.debug(linkcheck.LOG_CACHE,
+                    "waiting for %.01f seconds on connection to %s", wait, host)
+                time.sleep(wait)
+                t = time.time()
+        self.times[host] = t + self.wait
         if key not in self.connections:
             # not found
             return None
         conn_data = self.connections[key]
         t = time.time()
         if t > conn_data[2]:
             # timed out
             try:

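A hypothetical usage sketch of the new get() behavior (the key values here are invented for illustration):

    pool = ConnectionPool(wait=2)
    key = ("http", "example.com", None, None)  # (type, host, user, pass)
    conn = pool.get(key)  # first call: returns a cached connection or None,
                          # and records a due time for example.com
    conn = pool.get(key)  # called again within 2 seconds: sleeps out the
                          # remaining delay before consulting the cache

Note that if get() is synchronized on the module-wide _lock like add() is, the sleep runs while that lock is held, so other threads calling get() during the pause will block even for unrelated hosts.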
linkcheck/checker/urlbase.py

@@ -381,12 +381,8 @@
         """
         assert None == linkcheck.log.debug(linkcheck.LOG_CHECK,
             "Checking %s", self)
-        wait = self.aggregate.config['wait']
-        if self.recursion_level and wait:
-            assert None == linkcheck.log.debug(linkcheck.LOG_CHECK,
-                "sleeping for %d seconds", wait)
-            time.sleep(wait)
-        t = time.time()
+        # start time for check
+        check_start = time.time()
         self.set_extern(self.url)
         if self.extern[0] and self.extern[1]:
             self.add_info(_("Outside of domain filter, checked only syntax."))
@@ -423,7 +419,7 @@
             self.set_result(linkcheck.strformat.unicode_safe(value),
                 valid=False)
-        self.checktime = time.time() - t
+        self.checktime = time.time() - check_start
         # check recursion
         try:
             if self.allows_recursion():

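A side effect of moving the pause into the pool: the old code slept before taking the start timestamp, so checktime excluded the pause, whereas any per-host wait now happens inside ConnectionPool.get(), between check_start and the checktime calculation, and is therefore included in the reported check time.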
linkcheck/director/__init__.py

@@ -59,7 +59,7 @@ def check_urls (aggregate):
 def get_aggregate (config):
     urlqueue = linkcheck.cache.urlqueue.UrlQueue()
-    connections = linkcheck.cache.connection.ConnectionPool()
+    connections = linkcheck.cache.connection.ConnectionPool(wait=config["wait"])
     cookies = linkcheck.cache.cookie.CookieJar()
     robots_txt = linkcheck.cache.robots_txt.RobotsTxt()
     return aggregator.Aggregate(config, urlqueue, connections,
         cookies, robots_txt)
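
End to end, the delay flows from the command line into the one pool shared by all checker threads. A hedged sketch, using a plain dict where linkchecker has its configuration object and assuming --pause=5 stores its value under "wait" (the key the old urlbase.py code read):

    config = {"wait": 5}  # e.g. set from --pause=5
    aggregate = get_aggregate(config)
    # All checker threads share this aggregate's ConnectionPool, so requests
    # to any single host are spaced at least 5 seconds apart no matter which
    # thread issues them.

Because the pool is created once per aggregate, the per-host timing state is global to the entire run.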