Make ResultCache max_size configurable (#544)

* Make ResultCache max_size configurable

fixes #463

* Add tests and docs.

* fix documentation...

...adapt the source, not the auto-generated man pages themselves as
requested in #544.

* fix typo.
This commit is contained in:
Paul Haerle 2021-06-21 20:45:19 +02:00 committed by GitHub
parent c94e953dbd
commit f395c74aac
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 29 additions and 8 deletions

View file

@ -173,6 +173,8 @@
#robotstxt=1
# Allowed URL schemes as a comma-separated list.
#allowedschemes=http,https
# Size of the result cache. Checking more URLs might increase memory usage during runtime.
#resultcachesize=100000
##################### filtering options ##########################
[filtering]

View file

@ -1,6 +1,6 @@
.\" Man page generated from reStructuredText.
.
.TH "LINKCHECKER" "1" "Januar 28, 2021" "2021-01-28" "LinkChecker"
.TH "LINKCHECKER" "1" "Juni 20, 2021" "2021-06-20" "LinkChecker"
.SH NAME
linkchecker \- Kommandozeilenprogramm zum Prüfen von HTML Dokumenten und Webseiten auf ungültige Verknüpfungen
.

View file

@ -1,6 +1,6 @@
.\" Man page generated from reStructuredText.
.
.TH "LINKCHECKERRC" "5" "Januar 28, 2021" "2021-01-28" "LinkChecker"
.TH "LINKCHECKERRC" "5" "Juni 20, 2021" "2021-06-20" "LinkChecker"
.SH NAME
linkcheckerrc \- Konfigurationsdatei für LinkChecker
.
@ -104,6 +104,11 @@ Command line option: \fB\-\-no\-robots\fP
\fBallowedschemes=\fP\fINAME\fP[\fB,\fP\fINAME\fP\&...]
Allowed URL schemes as comma\-separated list.
Command line option: none
.TP
\fBresultcachesize=\fP\fINUMBER\fP
Set the result cache size.
The default is 100 000 URLs.
Command line option: none
.UNINDENT
.SS filtering
.INDENT 0.0

View file

@ -1,6 +1,6 @@
.\" Man page generated from reStructuredText.
.
.TH "LINKCHECKER" "1" "January 28, 2021" "2021-01-28" "LinkChecker"
.TH "LINKCHECKER" "1" "June 20, 2021" "2021-06-20" "LinkChecker"
.SH NAME
linkchecker \- command line client to check HTML documents and websites for broken links
.

View file

@ -1,6 +1,6 @@
.\" Man page generated from reStructuredText.
.
.TH "LINKCHECKERRC" "5" "January 28, 2021" "2021-01-28" "LinkChecker"
.TH "LINKCHECKERRC" "5" "June 20, 2021" "2021-06-20" "LinkChecker"
.SH NAME
linkcheckerrc \- configuration file for LinkChecker
.
@ -137,6 +137,11 @@ Command line option: \fB\-\-no\-robots\fP
\fBallowedschemes=\fP\fINAME\fP[\fB,\fP\fINAME\fP\&...]
Allowed URL schemes as comma\-separated list.
Command line option: none
.TP
\fBresultcachesize=\fP\fINUMBER\fP
Set the result cache size.
The default is 100 000 URLs.
Command line option: none
.UNINDENT
.SS filtering
.INDENT 0.0

View file

@ -98,6 +98,10 @@ checking
**allowedschemes=**\ *NAME*\ [**,**\ *NAME*...]
Allowed URL schemes as comma-separated list.
Command line option: none
**resultcachesize=**\ *NUMBER*
Set the result cache size.
The default is 100 000 URLs.
Command line option: none
filtering
^^^^^^^^^

View file

@ -32,11 +32,11 @@ class ResultCache:
format: {cache key (string) -> result (UrlData.towire())}
"""
def __init__(self, max_size=100000):
def __init__(self, result_cache_size):
"""Initialize result cache."""
# mapping {URL -> cached result}
self.cache = {}
self.max_size = max_size
self.max_size = result_cache_size
@synchronized(cache_lock)
def get_result(self, key):

View file

@ -184,6 +184,7 @@ class Configuration(dict):
self["aborttimeout"] = 300
self["recursionlevel"] = -1
self["useragent"] = UserAgent
self["resultcachesize"] = 100000
# authentication
self["authentication"] = []
self["loginurl"] = None

View file

@ -134,7 +134,7 @@ def get_aggregate(config):
_urlqueue = urlqueue.UrlQueue(max_allowed_urls=config["maxnumurls"])
_robots_txt = robots_txt.RobotsTxt(config["useragent"])
plugin_manager = plugins.PluginManager(config)
result_cache = results.ResultCache()
result_cache = results.ResultCache(config["resultcachesize"])
return aggregator.Aggregate(
config, _urlqueue, _robots_txt, plugin_manager, result_cache
)

View file

@ -17,6 +17,7 @@
import unittest
from collections import namedtuple
import linkcheck.configuration
from linkcheck.cache.results import ResultCache
from linkcheck.cache.urlqueue import Empty, NUM_PUTS_CLEANUP, UrlQueue
@ -26,7 +27,8 @@ Aggregate = namedtuple("Aggregate", "result_cache")
class TestUrlQueue(unittest.TestCase):
def setUp(self):
self.result_cache = ResultCache()
config = linkcheck.configuration.Configuration()
self.result_cache = ResultCache(config["resultcachesize"])
self.urlqueue = UrlQueue()
self.urldata1 = UrlData(
url="Foo",

View file

@ -15,6 +15,7 @@ maxnumurls=1000
maxrunseconds=1
maxfilesizeparse=100
maxfilesizedownload=100
resultcachesize=100000
[filtering]
ignore=

View file

@ -55,6 +55,7 @@ class TestConfig(unittest.TestCase):
self.assertEqual(config["maxrunseconds"], 1)
self.assertEqual(config["maxfilesizeparse"], 100)
self.assertEqual(config["maxfilesizedownload"], 100)
self.assertEqual(config["resultcachesize"], 100000)
# filtering section
patterns = [x["pattern"].pattern for x in config["externlinks"]]
for prefix in ("ignore_", "nofollow_"):