From f395c74aac13417f5676c53a91da79e6695bb8c1 Mon Sep 17 00:00:00 2001 From: Paul Haerle Date: Mon, 21 Jun 2021 20:45:19 +0200 Subject: [PATCH] Make ResultCache max_size configurable (#544) * Make ResultCache max_size configurable fixes #463 * Add tests and docs. * fix documentation... ...adapt the source, not the auto-generated man pages themselves as requested in #544. * fix typo. --- config/linkcheckerrc | 2 ++ doc/man/de/linkchecker.1 | 2 +- doc/man/de/linkcheckerrc.5 | 7 ++++++- doc/man/en/linkchecker.1 | 2 +- doc/man/en/linkcheckerrc.5 | 7 ++++++- doc/src/man/linkcheckerrc.rst | 4 ++++ linkcheck/cache/results.py | 4 ++-- linkcheck/configuration/__init__.py | 1 + linkcheck/director/__init__.py | 2 +- tests/cache/test_urlqueue.py | 4 +++- tests/configuration/data/config0.ini | 1 + tests/configuration/test_config.py | 1 + 12 files changed, 29 insertions(+), 8 deletions(-) diff --git a/config/linkcheckerrc b/config/linkcheckerrc index 88735e84..c2470950 100644 --- a/config/linkcheckerrc +++ b/config/linkcheckerrc @@ -173,6 +173,8 @@ #robotstxt=1 # Allowed URL schemes as a comma-separated list. #allowedschemes=http,https +# Size of the result cache (number of URLs). Caching more URLs might increase memory usage during runtime. +#resultcachesize=100000 ##################### filtering options ########################## [filtering] diff --git a/doc/man/de/linkchecker.1 b/doc/man/de/linkchecker.1 index ce4b158d..a4e88e33 100644 --- a/doc/man/de/linkchecker.1 +++ b/doc/man/de/linkchecker.1 @@ -1,6 +1,6 @@ .\" Man page generated from reStructuredText. . -.TH "LINKCHECKER" "1" "Januar 28, 2021" "2021-01-28" "LinkChecker" +.TH "LINKCHECKER" "1" "Juni 20, 2021" "2021-06-20" "LinkChecker" .SH NAME linkchecker \- Kommandozeilenprogramm zum Prüfen von HTML Dokumenten und Webseiten auf ungültige Verknüpfungen .
diff --git a/doc/man/de/linkcheckerrc.5 b/doc/man/de/linkcheckerrc.5 index 9dd4828d..d1561bf0 100644 --- a/doc/man/de/linkcheckerrc.5 +++ b/doc/man/de/linkcheckerrc.5 @@ -1,6 +1,6 @@ .\" Man page generated from reStructuredText. . -.TH "LINKCHECKERRC" "5" "Januar 28, 2021" "2021-01-28" "LinkChecker" +.TH "LINKCHECKERRC" "5" "Juni 20, 2021" "2021-06-20" "LinkChecker" .SH NAME linkcheckerrc \- Konfigurationsdatei für LinkChecker . @@ -104,6 +104,11 @@ Command line option: \fB\-\-no\-robots\fP \fBallowedschemes=\fP\fINAME\fP[\fB,\fP\fINAME\fP\&...] Allowed URL schemes as comma\-separated list. Command line option: none +.TP +\fBresultcachesize=\fP\fINUMBER\fP +Set the result cache size. +The default is 100 000 URLs. +Command line option: none .UNINDENT .SS filtering .INDENT 0.0 diff --git a/doc/man/en/linkchecker.1 b/doc/man/en/linkchecker.1 index 1835672c..3152ead7 100644 --- a/doc/man/en/linkchecker.1 +++ b/doc/man/en/linkchecker.1 @@ -1,6 +1,6 @@ .\" Man page generated from reStructuredText. . -.TH "LINKCHECKER" "1" "January 28, 2021" "2021-01-28" "LinkChecker" +.TH "LINKCHECKER" "1" "June 20, 2021" "2021-06-20" "LinkChecker" .SH NAME linkchecker \- command line client to check HTML documents and websites for broken links . diff --git a/doc/man/en/linkcheckerrc.5 b/doc/man/en/linkcheckerrc.5 index 009cc55e..6f95d786 100644 --- a/doc/man/en/linkcheckerrc.5 +++ b/doc/man/en/linkcheckerrc.5 @@ -1,6 +1,6 @@ .\" Man page generated from reStructuredText. . -.TH "LINKCHECKERRC" "5" "January 28, 2021" "2021-01-28" "LinkChecker" +.TH "LINKCHECKERRC" "5" "June 20, 2021" "2021-06-20" "LinkChecker" .SH NAME linkcheckerrc \- configuration file for LinkChecker . @@ -137,6 +137,11 @@ Command line option: \fB\-\-no\-robots\fP \fBallowedschemes=\fP\fINAME\fP[\fB,\fP\fINAME\fP\&...] Allowed URL schemes as comma\-separated list. Command line option: none +.TP +\fBresultcachesize=\fP\fINUMBER\fP +Set the result cache size. +The default is 100 000 URLs. 
+Command line option: none .UNINDENT .SS filtering .INDENT 0.0 diff --git a/doc/src/man/linkcheckerrc.rst b/doc/src/man/linkcheckerrc.rst index c8abe9f8..99240891 100644 --- a/doc/src/man/linkcheckerrc.rst +++ b/doc/src/man/linkcheckerrc.rst @@ -98,6 +98,10 @@ checking **allowedschemes=**\ *NAME*\ [**,**\ *NAME*...] Allowed URL schemes as comma-separated list. Command line option: none +**resultcachesize=**\ *NUMBER* + Set the result cache size. + The default is 100 000 URLs. + Command line option: none filtering ^^^^^^^^^ diff --git a/linkcheck/cache/results.py b/linkcheck/cache/results.py index 90b9462b..253c6198 100644 --- a/linkcheck/cache/results.py +++ b/linkcheck/cache/results.py @@ -32,11 +32,11 @@ class ResultCache: format: {cache key (string) -> result (UrlData.towire())} """ - def __init__(self, max_size=100000): + def __init__(self, result_cache_size): """Initialize result cache.""" # mapping {URL -> cached result} self.cache = {} - self.max_size = max_size + self.max_size = result_cache_size @synchronized(cache_lock) def get_result(self, key): diff --git a/linkcheck/configuration/__init__.py b/linkcheck/configuration/__init__.py index 3a3018ee..4bb8175a 100644 --- a/linkcheck/configuration/__init__.py +++ b/linkcheck/configuration/__init__.py @@ -184,6 +184,7 @@ class Configuration(dict): self["aborttimeout"] = 300 self["recursionlevel"] = -1 self["useragent"] = UserAgent + self["resultcachesize"] = 100000 # authentication self["authentication"] = [] self["loginurl"] = None diff --git a/linkcheck/director/__init__.py b/linkcheck/director/__init__.py index 5f560007..6bed0216 100644 --- a/linkcheck/director/__init__.py +++ b/linkcheck/director/__init__.py @@ -134,7 +134,7 @@ def get_aggregate(config): _urlqueue = urlqueue.UrlQueue(max_allowed_urls=config["maxnumurls"]) _robots_txt = robots_txt.RobotsTxt(config["useragent"]) plugin_manager = plugins.PluginManager(config) - result_cache = results.ResultCache() + result_cache = 
results.ResultCache(config["resultcachesize"]) return aggregator.Aggregate( config, _urlqueue, _robots_txt, plugin_manager, result_cache ) diff --git a/tests/cache/test_urlqueue.py b/tests/cache/test_urlqueue.py index cf5d69d9..705e547b 100644 --- a/tests/cache/test_urlqueue.py +++ b/tests/cache/test_urlqueue.py @@ -17,6 +17,7 @@ import unittest from collections import namedtuple +import linkcheck.configuration from linkcheck.cache.results import ResultCache from linkcheck.cache.urlqueue import Empty, NUM_PUTS_CLEANUP, UrlQueue @@ -26,7 +27,8 @@ Aggregate = namedtuple("Aggregate", "result_cache") class TestUrlQueue(unittest.TestCase): def setUp(self): - self.result_cache = ResultCache() + config = linkcheck.configuration.Configuration() + self.result_cache = ResultCache(config["resultcachesize"]) self.urlqueue = UrlQueue() self.urldata1 = UrlData( url="Foo", diff --git a/tests/configuration/data/config0.ini b/tests/configuration/data/config0.ini index b8aef02a..b39239d6 100644 --- a/tests/configuration/data/config0.ini +++ b/tests/configuration/data/config0.ini @@ -15,6 +15,7 @@ maxnumurls=1000 maxrunseconds=1 maxfilesizeparse=100 maxfilesizedownload=100 +resultcachesize=100000 [filtering] ignore= diff --git a/tests/configuration/test_config.py b/tests/configuration/test_config.py index b01653aa..d65d3586 100644 --- a/tests/configuration/test_config.py +++ b/tests/configuration/test_config.py @@ -55,6 +55,7 @@ class TestConfig(unittest.TestCase): self.assertEqual(config["maxrunseconds"], 1) self.assertEqual(config["maxfilesizeparse"], 100) self.assertEqual(config["maxfilesizedownload"], 100) + self.assertEqual(config["resultcachesize"], 100000) # filtering section patterns = [x["pattern"].pattern for x in config["externlinks"]] for prefix in ("ignore_", "nofollow_"):