mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-16 22:10:26 +00:00
Merge pull request #677 from cjmayo/maxrate
Enable average HTTP request rate to be above 4 per second
This commit is contained in:
commit
96c3336013
6 changed files with 46 additions and 4 deletions
|
|
@ -88,7 +88,11 @@ checking
|
|||
The default is to queue and check all URLs.
|
||||
Command line option: none
|
||||
**maxrequestspersecond=**\ *NUMBER*
|
||||
Limit the maximum number of requests per second to one host.
|
||||
Limit the maximum number of HTTP requests per second to one host.
|
||||
The average number of requests per second is approximately one third of the
|
||||
maximum. Values less than 1 and at least 0.001 can be used.
|
||||
To use values greater than 10, the HTTP server must return a
|
||||
"LinkChecker" response header.
|
||||
The default is 10.
|
||||
Command line option: none
|
||||
**robotstxt=**\ [**0**\ \|\ **1**]
|
||||
|
|
|
|||
|
|
@ -176,6 +176,8 @@ class HttpUrl(internpaturl.InternPatternUrl):
|
|||
log.debug(LOG_CHECK, "Response headers %s", self.headers)
|
||||
self.set_encoding(self.url_connection.encoding)
|
||||
log.debug(LOG_CHECK, "Response encoding %s", self.content_encoding)
|
||||
if "LinkChecker" in self.headers:
|
||||
self.aggregate.set_maxrated_for_host(self.urlparts[1])
|
||||
self._add_ssl_info()
|
||||
|
||||
def _add_response_info(self):
|
||||
|
|
|
|||
|
|
@ -86,6 +86,24 @@ class LCConfigParser(RawConfigParser):
|
|||
if self.has_option(section, option):
|
||||
self.config[option] = self.getboolean(section, option)
|
||||
|
||||
def read_float_option(self, section, option, key=None, min=None, max=None):
    """Read a float option and store it in the configuration.

    Nothing is read or stored when the option is absent from the section.

    @param section: name of the configuration section to read from
    @param option: name of the option inside that section
    @param key: key under which the value is stored in self.config;
        the option name is used when this is None
    @param min: smallest accepted value, or None for no lower bound
    @param max: largest accepted value, or None for no upper bound
    @raises LinkCheckerError: if the value is below min or above max
    """
    # NOTE: min and max shadow the builtins, but callers pass them by
    # keyword, so the parameter names are part of the interface.
    if not self.has_option(section, option):
        return
    num = self.getfloat(section, option)
    # The values are floats: format them with %s, since %d would truncate
    # e.g. 0.0005 to 0 and produce a misleading message.
    if min is not None and num < min:
        raise LinkCheckerError(
            _("invalid value for %s: %s must not be less than %s")
            % (option, num, min)
        )
    # Reject only values that exceed the upper bound.
    if max is not None and num > max:
        raise LinkCheckerError(
            _("invalid value for %s: %s must not be greater than %s")
            % (option, num, max)
        )
    if key is None:
        key = option
    self.config[key] = num
|
||||
|
||||
def read_int_option(self, section, option, key=None, min=None, max=None):
|
||||
"""Read an integer option."""
|
||||
if self.has_option(section, option):
|
||||
|
|
@ -178,7 +196,7 @@ class LCConfigParser(RawConfigParser):
|
|||
self.read_int_option(section, "recursionlevel", min=-1)
|
||||
self.read_string_option(section, "nntpserver")
|
||||
self.read_string_option(section, "useragent")
|
||||
self.read_int_option(section, "maxrequestspersecond", min=1)
|
||||
self.read_float_option(section, "maxrequestspersecond", min=0.001)
|
||||
self.read_int_option(section, "maxnumurls", min=0)
|
||||
self.read_int_option(section, "maxfilesizeparse", min=1)
|
||||
self.read_int_option(section, "maxfilesizedownload", min=1)
|
||||
|
|
|
|||
|
|
@ -52,6 +52,8 @@ def new_request_session(config, cookies):
|
|||
|
||||
class Aggregate:
|
||||
"""Store thread-safe data collections for checker threads."""
|
||||
wait_time_min_default = 0.1
|
||||
wait_time_max_default = 0.6
|
||||
|
||||
def __init__(self, config, urlqueue, robots_txt, plugin_manager, result_cache):
|
||||
"""Store given link checking objects."""
|
||||
|
|
@ -64,10 +66,11 @@ class Aggregate:
|
|||
self.plugin_manager = plugin_manager
|
||||
self.result_cache = result_cache
|
||||
self.times = {}
|
||||
self.maxrated = {}
|
||||
self.cookies = None
|
||||
requests_per_second = config["maxrequestspersecond"]
|
||||
self.wait_time_min = 1.0 / requests_per_second
|
||||
self.wait_time_max = max(self.wait_time_min + 0.5, 0.5)
|
||||
self.wait_time_max = 6 * self.wait_time_min
|
||||
self.downloaded_bytes = 0
|
||||
|
||||
def visit_loginurl(self):
|
||||
|
|
@ -152,9 +155,22 @@ class Aggregate:
|
|||
wait = due_time - t
|
||||
time.sleep(wait)
|
||||
t = time.time()
|
||||
wait_time = random.uniform(self.wait_time_min, self.wait_time_max)
|
||||
if host in self.maxrated:
|
||||
wait_time_min, wait_time_max = self.wait_time_min, self.wait_time_max
|
||||
else:
|
||||
wait_time_min = max(self.wait_time_min, self.wait_time_min_default)
|
||||
wait_time_max = max(self.wait_time_max, self.wait_time_max_default)
|
||||
log.debug(LOG_CHECK,
|
||||
"Min wait time: %s Max wait time: %s for host: %s",
|
||||
wait_time_min, wait_time_max, host)
|
||||
wait_time = random.uniform(wait_time_min, wait_time_max)
|
||||
self.times[host] = t + wait_time
|
||||
|
||||
@synchronized(_hosts_lock)
def set_maxrated_for_host(self, host):
    """Record that the configured request rate applies unclamped to host.

    Hosts recorded here are throttled with the configured minimum and
    maximum wait times as-is; for all other hosts the wait times are
    raised to at least the class default values.
    """
    unclamped_hosts = self.maxrated
    unclamped_hosts[host] = True
|
||||
|
||||
@synchronized(_threads_lock)
|
||||
def print_active_threads(self):
|
||||
"""Log all currently active threads."""
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ debugmemory=1
|
|||
localwebroot=foo
|
||||
sslverify=/path/to/cacerts.crt
|
||||
maxnumurls=1000
|
||||
maxrequestspersecond=0.1
|
||||
maxrunseconds=1
|
||||
maxfilesizeparse=100
|
||||
maxfilesizedownload=100
|
||||
|
|
|
|||
|
|
@ -55,6 +55,7 @@ class TestConfig(unittest.TestCase):
|
|||
self.assertEqual(config["localwebroot"], "foo")
|
||||
self.assertEqual(config["sslverify"], "/path/to/cacerts.crt")
|
||||
self.assertEqual(config["maxnumurls"], 1000)
|
||||
self.assertEqual(config["maxrequestspersecond"], 0.1)
|
||||
self.assertEqual(config["maxrunseconds"], 1)
|
||||
self.assertEqual(config["maxfilesizeparse"], 100)
|
||||
self.assertEqual(config["maxfilesizedownload"], 100)
|
||||
|
|
|
|||
Loading…
Reference in a new issue