From e88cf49c8f8c1fd2e3f93ec66c0930dd5beda9ba Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Wed, 5 Oct 2022 19:28:01 +0100 Subject: [PATCH 1/2] Enable average HTTP request rate to be above 4 per second --- doc/src/man/linkcheckerrc.rst | 5 ++++- linkcheck/checker/httpurl.py | 2 ++ linkcheck/director/aggregator.py | 20 ++++++++++++++++++-- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/doc/src/man/linkcheckerrc.rst b/doc/src/man/linkcheckerrc.rst index 197499bf..838d69f6 100644 --- a/doc/src/man/linkcheckerrc.rst +++ b/doc/src/man/linkcheckerrc.rst @@ -88,7 +88,10 @@ checking The default is to queue and check all URLs. Command line option: none **maxrequestspersecond=**\ *NUMBER* - Limit the maximum number of requests per second to one host. + Limit the maximum number of HTTP requests per second to one host. + The average number of requests per second is approximately one third of the + maximum. To use values greater than 10, the HTTP server must return a + "LinkChecker" response header. The default is 10. 
Command line option: none
 **robotstxt=**\ [**0**\ \|\ **1**]
diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py
index 4260672a..edfeebdd 100644
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -176,6 +176,8 @@ class HttpUrl(internpaturl.InternPatternUrl):
         log.debug(LOG_CHECK, "Response headers %s", self.headers)
         self.set_encoding(self.url_connection.encoding)
         log.debug(LOG_CHECK, "Response encoding %s", self.content_encoding)
+        if "LinkChecker" in self.headers:
+            self.aggregate.set_maxrated_for_host(self.urlparts[1])
         self._add_ssl_info()
 
     def _add_response_info(self):
diff --git a/linkcheck/director/aggregator.py b/linkcheck/director/aggregator.py
index 63d980d4..cace9edf 100644
--- a/linkcheck/director/aggregator.py
+++ b/linkcheck/director/aggregator.py
@@ -52,6 +52,8 @@ def new_request_session(config, cookies):
 
 class Aggregate:
     """Store thread-safe data collections for checker threads."""
+    wait_time_min_default = 0.1  # seconds; applied unless host is in maxrated
+    wait_time_max_default = 0.6  # seconds; applied unless host is in maxrated
 
     def __init__(self, config, urlqueue, robots_txt, plugin_manager, result_cache):
         """Store given link checking objects."""
@@ -64,10 +66,11 @@ class Aggregate:
         self.plugin_manager = plugin_manager
         self.result_cache = result_cache
         self.times = {}
+        self.maxrated = set()  # hosts exempt from the default wait times
         self.cookies = None
         requests_per_second = config["maxrequestspersecond"]
         self.wait_time_min = 1.0 / requests_per_second
-        self.wait_time_max = max(self.wait_time_min + 0.5, 0.5)
+        self.wait_time_max = 6 * self.wait_time_min
         self.downloaded_bytes = 0
 
     def visit_loginurl(self):
@@ -152,9 +155,22 @@ class Aggregate:
                 wait = due_time - t
                 time.sleep(wait)
                 t = time.time()
-        wait_time = random.uniform(self.wait_time_min, self.wait_time_max)
+        if host in self.maxrated:
+            wait_time_min, wait_time_max = self.wait_time_min, self.wait_time_max
+        else:
+            wait_time_min = max(self.wait_time_min, self.wait_time_min_default)
+            wait_time_max = max(self.wait_time_max, self.wait_time_max_default)
+        log.debug(LOG_CHECK,
+                  "Min wait time: %s Max wait time: %s for host: %s",
+                  wait_time_min, wait_time_max, host)
+        wait_time = random.uniform(wait_time_min, wait_time_max)
         self.times[host] = t + wait_time
 
+    @synchronized(_hosts_lock)
+    def set_maxrated_for_host(self, host):
+        """Remove the limit on the maximum request rate for a host."""
+        self.maxrated.add(host)
+
     @synchronized(_threads_lock)
     def print_active_threads(self):
         """Log all currently active threads."""
From 0c5db040c8c3fdf915442d67c47c74d7c83212e6 Mon Sep 17 00:00:00 2001
From: Chris Mayo
Date: Wed, 5 Oct 2022 19:28:01 +0100
Subject: [PATCH 2/2] Support maxrequestspersecond less than one

---
 doc/src/man/linkcheckerrc.rst        |  3 ++-
 linkcheck/configuration/confparse.py | 20 +++++++++++++++++++-
 tests/configuration/data/config0.ini |  1 +
 tests/configuration/test_config.py   |  1 +
 4 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/doc/src/man/linkcheckerrc.rst b/doc/src/man/linkcheckerrc.rst
index 838d69f6..cab02d28 100644
--- a/doc/src/man/linkcheckerrc.rst
+++ b/doc/src/man/linkcheckerrc.rst
@@ -90,7 +90,8 @@ checking
 **maxrequestspersecond=**\ *NUMBER*
     Limit the maximum number of HTTP requests per second to one host.
     The average number of requests per second is approximately one third of the
-    maximum. To use values greater than 10, the HTTP server must return a
+    maximum. Values less than 1 and at least 0.001 can be used.
+    To use values greater than 10, the HTTP server must return a
     "LinkChecker" response header.
     The default is 10.
Command line option: none
diff --git a/linkcheck/configuration/confparse.py b/linkcheck/configuration/confparse.py
index da772326..2d64c75a 100644
--- a/linkcheck/configuration/confparse.py
+++ b/linkcheck/configuration/confparse.py
@@ -86,6 +86,24 @@ class LCConfigParser(RawConfigParser):
         if self.has_option(section, option):
             self.config[option] = self.getboolean(section, option)
 
+    def read_float_option(self, section, option, key=None, min=None, max=None):
+        """Read a float option, checking it against optional min/max bounds."""
+        if self.has_option(section, option):
+            num = self.getfloat(section, option)
+            if min is not None and num < min:
+                raise LinkCheckerError(
+                    _("invalid value for %s: %s must not be less than %s")
+                    % (option, num, min)
+                )
+            if max is not None and num > max:
+                raise LinkCheckerError(
+                    _("invalid value for %s: %s must not be greater than %s")
+                    % (option, num, max)
+                )
+            if key is None:
+                key = option
+            self.config[key] = num
+
     def read_int_option(self, section, option, key=None, min=None, max=None):
         """Read an integer option."""
         if self.has_option(section, option):
@@ -178,7 +196,7 @@ class LCConfigParser(RawConfigParser):
         self.read_int_option(section, "recursionlevel", min=-1)
         self.read_string_option(section, "nntpserver")
         self.read_string_option(section, "useragent")
-        self.read_int_option(section, "maxrequestspersecond", min=1)
+        self.read_float_option(section, "maxrequestspersecond", min=0.001)
         self.read_int_option(section, "maxnumurls", min=0)
         self.read_int_option(section, "maxfilesizeparse", min=1)
         self.read_int_option(section, "maxfilesizedownload", min=1)
diff --git a/tests/configuration/data/config0.ini b/tests/configuration/data/config0.ini
index 7116a0c6..293e10ab 100644
--- a/tests/configuration/data/config0.ini
+++ b/tests/configuration/data/config0.ini
@@ -12,6 +12,7 @@ debugmemory=1
 localwebroot=foo
 sslverify=/path/to/cacerts.crt
 maxnumurls=1000
+maxrequestspersecond=0.1
 maxrunseconds=1
 maxfilesizeparse=100
 maxfilesizedownload=100
diff --git a/tests/configuration/test_config.py b/tests/configuration/test_config.py
index 75fcc24f..a4da4dcd 100644
--- a/tests/configuration/test_config.py
+++ b/tests/configuration/test_config.py
@@ -55,6 +55,7 @@ class TestConfig(unittest.TestCase):
         self.assertEqual(config["localwebroot"], "foo")
         self.assertEqual(config["sslverify"], "/path/to/cacerts.crt")
         self.assertEqual(config["maxnumurls"], 1000)
+        self.assertEqual(config["maxrequestspersecond"], 0.1)
         self.assertEqual(config["maxrunseconds"], 1)
         self.assertEqual(config["maxfilesizeparse"], 100)
         self.assertEqual(config["maxfilesizedownload"], 100)