Mirror of https://github.com/Hopiu/linkchecker.git (synced 2026-05-13 00:53:11 +00:00)
fix --pause to delay requests to the same host
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3222 e7d03fd6-7b0d-0410-9947-9c21f3af8025
parent 2383915730
commit 811f5492c4
4 changed files with 22 additions and 13 deletions
TODO (3 changes)

@@ -1,6 +1,3 @@
-- [BUGFIX] The --pause option really meant to pause between two
-  subsequent requests to the _same_ host.
-
 - [FEATURE] Add robots.txt crawl-delay support in linkchecker
 
 - [TEST] Add test for cookie file parsing
linkcheck/cache/connection.py (20 changes; vendored)
@@ -19,7 +19,9 @@ Store and retrieve open connections.
 """
 
+import time
 import linkcheck
 import linkcheck.lock
+import linkcheck.log
 from linkcheck.decorators import synchronized
 
 _lock = linkcheck.lock.get_lock("connection")
@@ -30,7 +32,7 @@ class ConnectionPool (object):
     Thread-safe cache, storing a set of connections for URL retrieval.
     """
 
-    def __init__ (self):
+    def __init__ (self, wait=0):
         """
         Initialize an empty connection dictionary which will have entries
         of the form::
@@ -47,6 +49,9 @@ class ConnectionPool (object):
         # open connections
         # {(type, host, user, pass) -> [connection, status, expiration time]}
         self.connections = {}
+        # {host -> due time}
+        self.times = {}
+        self.wait = wait
 
     @synchronized(_lock)
     def add (self, key, conn, timeout):
@@ -61,14 +66,25 @@ class ConnectionPool (object):
         """
         Get open connection if available, for at most 30 seconds.
 
         @param key - tuple (type, host, user, pass)
         @return: Open connection object or None if no connection is available.
         @rtype None or FTPConnection or HTTP(S)Connection
         """
+        host = key[1]
+        t = time.time()
+        if host in self.times:
+            due_time = self.times[host]
+            if due_time > t:
+                wait = due_time - t
+                assert None == linkcheck.log.debug(linkcheck.LOG_CACHE,
+                    "waiting for %.01f seconds on connection to %s", wait, host)
+                time.sleep(wait)
+                t = time.time()
+        self.times[host] = t + self.wait
         if key not in self.connections:
             # not found
             return None
         conn_data = self.connections[key]
         t = time.time()
         if t > conn_data[2]:
             # timed out
             try:
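The bookkeeping added in the hunk above can be read in isolation as follows. This is a minimal, standalone sketch under the assumption that wait is the number of seconds --pause asks for between two requests to the same host; the HostThrottle name and the demo at the bottom are illustrative only and not part of linkchecker.

import time


class HostThrottle (object):
    """Sketch of the per-host due-time logic added to ConnectionPool."""

    def __init__ (self, wait=0):
        # seconds to keep between two requests to the same host
        self.wait = wait
        # {host -> earliest time the next request may start}
        self.times = {}

    def acquire (self, host):
        """Block until the host's due time has passed, then push the
        due time another wait seconds into the future."""
        t = time.time()
        if host in self.times and self.times[host] > t:
            # the previous request to this host was less than wait seconds ago
            time.sleep(self.times[host] - t)
            t = time.time()
        self.times[host] = t + self.wait


if __name__ == "__main__":
    throttle = HostThrottle(wait=2)
    for i in range(3):
        throttle.acquire("example.com")  # second and third calls sleep ~2s
        print("request %d done" % i)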
linkcheck/checker/urlbase.py
@@ -381,12 +381,8 @@ class UrlBase (object):
         """
         assert None == linkcheck.log.debug(linkcheck.LOG_CHECK,
             "Checking %s", self)
-        wait = self.aggregate.config['wait']
-        if self.recursion_level and wait:
-            assert None == linkcheck.log.debug(linkcheck.LOG_CHECK,
-                "sleeping for %d seconds", wait)
-            time.sleep(wait)
-        t = time.time()
+        # start time for check
+        check_start = time.time()
         self.set_extern(self.url)
         if self.extern[0] and self.extern[1]:
             self.add_info(_("Outside of domain filter, checked only syntax."))
@@ -423,7 +419,7 @@ class UrlBase (object):
             self.set_result(linkcheck.strformat.unicode_safe(value),
                             valid=False)
 
-        self.checktime = time.time() - t
+        self.checktime = time.time() - check_start
         # check recursion
         try:
             if self.allows_recursion():
linkcheck/director/__init__.py
@@ -59,7 +59,7 @@ def check_urls (aggregate):
 
 def get_aggregate (config):
     urlqueue = linkcheck.cache.urlqueue.UrlQueue()
-    connections = linkcheck.cache.connection.ConnectionPool()
+    connections = linkcheck.cache.connection.ConnectionPool(wait=config["wait"])
     cookies = linkcheck.cache.cookie.CookieJar()
     robots_txt = linkcheck.cache.robots_txt.RobotsTxt()
     return aggregator.Aggregate(config, urlqueue, connections,
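Taken together, the change moves the --pause delay out of the per-URL check (the removed sleep in UrlBase) and into the shared connection pool, so only requests that actually revisit a host are throttled. Below is a rough, self-contained comparison of the two policies; the one-second wait, the host names and both helper functions are made up for illustration and are not linkchecker code.

import time

WAIT = 1.0  # stands in for the --pause value


def old_policy (urls):
    # old behaviour: sleep before every non-root request, whatever the host
    for host, path in urls:
        time.sleep(WAIT)


def new_policy (urls):
    # new behaviour: sleep only when the same host was contacted
    # less than WAIT seconds ago
    due = {}
    for host, path in urls:
        now = time.time()
        if due.get(host, now) > now:
            time.sleep(due[host] - now)
            now = time.time()
        due[host] = now + WAIT


urls = [("a.example", "/1"), ("b.example", "/1"),
        ("a.example", "/2"), ("b.example", "/2")]
for policy in (old_policy, new_policy):
    start = time.time()
    policy(urls)
    print("%s: %.1f seconds" % (policy.__name__, time.time() - start))

With the hosts alternating as above, the old policy always takes about four seconds, while the new one only waits when a.example or b.example is hit again inside the delay window.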