Merge pull request #677 from cjmayo/maxrate

Enable average HTTP request rate to be above 4 per second
This commit is contained in:
Chris Mayo 2022-10-17 19:24:49 +01:00 committed by GitHub
commit 96c3336013
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 46 additions and 4 deletions

View file

@ -88,7 +88,11 @@ checking
The default is to queue and check all URLs.
Command line option: none
**maxrequestspersecond=**\ *NUMBER*
Limit the maximum number of requests per second to one host.
Limit the maximum number of HTTP requests per second to one host.
The average number of requests per second is approximately one third of the
maximum. Fractional values below 1 are accepted, down to a minimum of 0.001.
To use values greater than 10, the HTTP server must return a
"LinkChecker" response header.
The default is 10.
Command line option: none
**robotstxt=**\ [**0**\ \|\ **1**]

View file

@ -176,6 +176,8 @@ class HttpUrl(internpaturl.InternPatternUrl):
log.debug(LOG_CHECK, "Response headers %s", self.headers)
self.set_encoding(self.url_connection.encoding)
log.debug(LOG_CHECK, "Response encoding %s", self.content_encoding)
if "LinkChecker" in self.headers:
self.aggregate.set_maxrated_for_host(self.urlparts[1])
self._add_ssl_info()
def _add_response_info(self):

View file

@ -86,6 +86,24 @@ class LCConfigParser(RawConfigParser):
if self.has_option(section, option):
self.config[option] = self.getboolean(section, option)
def read_float_option(self, section, option, key=None, min=None, max=None):
    """Read a float option and store it in self.config.

    Nothing is stored when the option is not present in the section.

    section: name of the configuration section to look in.
    option: name of the option inside that section.
    key: key under which the value is stored in self.config; the option
        name is used when this is None.
    min: smallest accepted value, or None for no lower bound.
    max: largest accepted value, or None for no upper bound.

    Raises LinkCheckerError when the value is below min or above max.
    """
    if not self.has_option(section, option):
        return
    num = self.getfloat(section, option)
    # The values are floats, so format them with %s: %d would truncate
    # them (a bound of 0.001 would be reported as 0).
    if min is not None and num < min:
        raise LinkCheckerError(
            _("invalid value for %s: %s must not be less than %s")
            % (option, num, min)
        )
    # Reject values *above* the maximum; values below it are valid.
    if max is not None and num > max:
        raise LinkCheckerError(
            _("invalid value for %s: %s must not be greater than %s")
            % (option, num, max)
        )
    if key is None:
        key = option
    self.config[key] = num
def read_int_option(self, section, option, key=None, min=None, max=None):
"""Read an integer option."""
if self.has_option(section, option):
@ -178,7 +196,7 @@ class LCConfigParser(RawConfigParser):
self.read_int_option(section, "recursionlevel", min=-1)
self.read_string_option(section, "nntpserver")
self.read_string_option(section, "useragent")
self.read_int_option(section, "maxrequestspersecond", min=1)
self.read_float_option(section, "maxrequestspersecond", min=0.001)
self.read_int_option(section, "maxnumurls", min=0)
self.read_int_option(section, "maxfilesizeparse", min=1)
self.read_int_option(section, "maxfilesizedownload", min=1)

View file

@ -52,6 +52,8 @@ def new_request_session(config, cookies):
class Aggregate:
"""Store thread-safe data collections for checker threads."""
wait_time_min_default = 0.1
wait_time_max_default = 0.6
def __init__(self, config, urlqueue, robots_txt, plugin_manager, result_cache):
"""Store given link checking objects."""
@ -64,10 +66,11 @@ class Aggregate:
self.plugin_manager = plugin_manager
self.result_cache = result_cache
self.times = {}
self.maxrated = {}
self.cookies = None
requests_per_second = config["maxrequestspersecond"]
self.wait_time_min = 1.0 / requests_per_second
self.wait_time_max = max(self.wait_time_min + 0.5, 0.5)
self.wait_time_max = 6 * self.wait_time_min
self.downloaded_bytes = 0
def visit_loginurl(self):
@ -152,9 +155,22 @@ class Aggregate:
wait = due_time - t
time.sleep(wait)
t = time.time()
wait_time = random.uniform(self.wait_time_min, self.wait_time_max)
if host in self.maxrated:
wait_time_min, wait_time_max = self.wait_time_min, self.wait_time_max
else:
wait_time_min = max(self.wait_time_min, self.wait_time_min_default)
wait_time_max = max(self.wait_time_max, self.wait_time_max_default)
log.debug(LOG_CHECK,
"Min wait time: %s Max wait time: %s for host: %s",
wait_time_min, wait_time_max, host)
wait_time = random.uniform(wait_time_min, wait_time_max)
self.times[host] = t + wait_time
@synchronized(_hosts_lock)
def set_maxrated_for_host(self, host):
    """Record the given host as max-rated.

    Hosts recorded in self.maxrated are throttled with the configured
    wait times only, without the default minimum wait times applied.
    """
    self.maxrated.update({host: True})
@synchronized(_threads_lock)
def print_active_threads(self):
"""Log all currently active threads."""

View file

@ -12,6 +12,7 @@ debugmemory=1
localwebroot=foo
sslverify=/path/to/cacerts.crt
maxnumurls=1000
maxrequestspersecond=0.1
maxrunseconds=1
maxfilesizeparse=100
maxfilesizedownload=100

View file

@ -55,6 +55,7 @@ class TestConfig(unittest.TestCase):
self.assertEqual(config["localwebroot"], "foo")
self.assertEqual(config["sslverify"], "/path/to/cacerts.crt")
self.assertEqual(config["maxnumurls"], 1000)
self.assertEqual(config["maxrequestspersecond"], 0.1)
self.assertEqual(config["maxrunseconds"], 1)
self.assertEqual(config["maxfilesizeparse"], 100)
self.assertEqual(config["maxfilesizedownload"], 100)