linkchecker/linkcheck/configuration/confparse.py
nodet 28f6743778
Add ignorewarningsforurls to ignore specific warnings (#794)
We want to allow specifying a warning to ignore for
each URL. If no regex is specified for the warning to ignore,
we'll ignore all warnings.

The tests still pass as they are, which means that unknown
values in the configuration file are simply ignored.

* [#782] Add values to configuration file

* [#782] Parse new configuration values

* [#782] Actually ignore a warning

* [#782] Confirm side cases work as expected

* [#782] Add logging when deciding to ignore warnings

* [#782] Documentation for ignorewarningsforurls

* [#782] Update (generated) man pages

* [#782] These tests pass without network, actually

* [#782] Fix copy/paste error in symbol naming

* [#782] The regex matches the name of the warning, not the message

* [#782] Better wording

* [#782] Update (generated) man pages

* [#782] We match the type, not the message
2024-02-13 19:43:29 +00:00

333 lines
14 KiB
Python

# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""Parse configuration files"""
from configparser import RawConfigParser
from re import compile as re_compile
import os
from .. import (
LinkCheckerError,
get_link_pat,
LOG_CHECK,
log,
fileutil,
plugins,
logconf,
)
def read_multiline(value):
    """Yield the meaningful lines of a multiline option value.

    Every line of ``value`` is stripped of surrounding whitespace; lines
    that are empty afterwards or that start with ``#`` are skipped.
    """
    stripped_lines = (raw.strip() for raw in value.splitlines())
    for entry in stripped_lines:
        if entry and not entry.startswith('#'):
            yield entry
class LCConfigParser(RawConfigParser):
    """
    Parse a LinkChecker configuration file.

    Option values read from the configuration files are stored in the
    configuration mapping passed to the constructor.
    """

    def __init__(self, config):
        """Initialize configuration.

        @param config: mapping that receives the parsed option values
        """
        super().__init__()
        self.config = config

    def read(self, files):
        """Read settings from given config files.

        @param files: list of configuration file names
        @raises: LinkCheckerError on syntax errors in the config file(s)
        """
        assert isinstance(files, list), "Invalid file list %r" % files
        try:
            self.read_ok = super().read(files)
            if not self.sections():
                raise LinkCheckerError(
                    _("configuration files %s contain no sections.") % files)
            if len(self.read_ok) < len(files):
                failed_files = set(files) - set(self.read_ok)
                log.warn(
                    LOG_CHECK, "Could not read configuration files %s.", failed_files
                )
            # Read all the configuration parameters from the given files.
            self.read_checking_config()
            self.read_authentication_config()
            self.read_filtering_config()
            self.read_output_config()
            self.read_plugin_config()
        except Exception as msg:
            # Every failure is reported as LinkCheckerError; the original
            # exception is chained so its traceback is not lost.
            raise LinkCheckerError(
                _("Error parsing configuration: %s") % str(msg)
            ) from msg

    def read_string_option(self, section, option, allowempty=False):
        """Read a string option and store it in the config under its name.

        Nothing happens when the option is not present.

        @param section: name of the configuration section
        @param option: name of the option, also used as the config key
        @param allowempty: if False, an empty value is an error
        @raises: LinkCheckerError on an empty value unless allowempty is set
        """
        if self.has_option(section, option):
            value = self.get(section, option)
            if not allowempty and not value:
                raise LinkCheckerError(
                    _("invalid empty value for %s: %s\n") % (option, value)
                )
            self.config[option] = value

    def read_boolean_option(self, section, option):
        """Read a boolean option and store it in the config under its name.

        Nothing happens when the option is not present.
        """
        if self.has_option(section, option):
            self.config[option] = self.getboolean(section, option)

    def read_float_option(self, section, option, key=None, min=None, max=None):
        """Read a float option.

        Nothing happens when the option is not present.

        @param section: name of the configuration section
        @param option: name of the option
        @param key: config key to store the value under (default: option)
        @param min: smallest allowed value, or None for no lower bound
        @param max: largest allowed value, or None for no upper bound
        @raises: LinkCheckerError if the value is outside [min, max]
        """
        if self.has_option(section, option):
            num = self.getfloat(section, option)
            # %s instead of %d: %d would truncate the fractional part and
            # produce messages like "0 must not be less than 0".
            if min is not None and num < min:
                raise LinkCheckerError(
                    _("invalid value for %s: %s must not be less than %s")
                    % (option, num, min)
                )
            if max is not None and num > max:
                raise LinkCheckerError(
                    _("invalid value for %s: %s must not be greater than %s")
                    % (option, num, max)
                )
            if key is None:
                key = option
            self.config[key] = num

    def read_int_option(self, section, option, key=None, min=None, max=None):
        """Read an integer option.

        Nothing happens when the option is not present.

        @param section: name of the configuration section
        @param option: name of the option
        @param key: config key to store the value under (default: option)
        @param min: smallest allowed value, or None for no lower bound
        @param max: largest allowed value, or None for no upper bound
        @raises: LinkCheckerError if the value is outside [min, max]
        """
        if self.has_option(section, option):
            num = self.getint(section, option)
            if min is not None and num < min:
                raise LinkCheckerError(
                    _("invalid value for %s: %d must not be less than %d")
                    % (option, num, min)
                )
            if max is not None and num > max:
                raise LinkCheckerError(
                    _("invalid value for %s: %d must not be greater than %d")
                    % (option, num, max)
                )
            if key is None:
                key = option
            self.config[key] = num

    def _read_regex_pairs(self, section, option):
        """Read a multiline option holding one or two regexes per line.

        Each line is split at the first run of whitespace. Both parts are
        compiled as regular expressions and appended as a tuple to
        self.config[option]. When a line has only one part, an empty
        pattern is used for the second one.
        """
        if not self.has_option(section, option):
            return
        for line in read_multiline(self.get(section, option)):
            parts = line.split(maxsplit=1)
            if len(parts) == 1:
                parts.append('')
            self.config[option].append(tuple(
                re_compile(part) for part in parts
            ))

    def read_output_config(self):
        """Read configuration options in section "output"."""
        section = "output"
        from ..logger import LoggerClasses

        if self.has_section("blacklist"):
            log.warn(
                LOG_CHECK,
                _("The blacklist section in linkcheckerrc is deprecated, "
                  "please rename to failures")
            )
            for opt in self.options("blacklist"):
                self.config["failures"][opt] = self.get("blacklist", opt)
        # Each logger may have its own section named after the logger.
        for c in LoggerClasses:
            key = c.LoggerName
            if self.has_section(key):
                for opt in self.options(key):
                    self.config[key][opt] = self.get(key, opt)
                if self.has_option(key, 'parts'):
                    # "parts" is a comma-separated list, stored as a list
                    val = self.get(key, 'parts')
                    parts = [f.strip().lower() for f in val.split(',')]
                    self.config[key]['parts'] = parts
        self.read_boolean_option(section, "warnings")
        if self.has_option(section, "verbose"):
            if self.getboolean(section, "verbose"):
                # verbose also switches warnings on
                self.config["verbose"] = True
                self.config["warnings"] = True
        if self.has_option(section, "quiet"):
            if self.getboolean(section, "quiet"):
                self.config['output'] = 'none'
                self.config['quiet'] = True
        logconf.reset_loglevel()  # if debug will be overwritten next
        if self.has_option(section, "debug"):
            val = self.get(section, "debug")
            parts = [f.strip().lower() for f in val.split(',')]
            logconf.set_debug(parts)
        self.read_boolean_option(section, "status")
        if self.has_option(section, "log"):
            val = self.get(section, "log").strip().lower()
            self.config['output'] = val
        if self.has_option(section, "fileoutput"):
            loggers = self.get(section, "fileoutput").split(",")
            # strip names from whitespace
            loggers = (x.strip().lower() for x in loggers)
            # no file output for the failures and none Logger
            from ..logger import LoggerNames

            loggers = (
                x
                for x in loggers
                if x in LoggerNames and x not in ("failures", "none")
            )
            for val in loggers:
                output = self.config.logger_new(val, fileoutput=1)
                self.config['fileoutput'].append(output)
        self._read_regex_pairs(section, "ignoreerrors")

    def read_checking_config(self):
        """Read configuration options in section "checking"."""
        section = "checking"
        self.read_int_option(section, "threads", min=-1)
        # -1 is accepted in the file but stored as 0
        self.config['threads'] = max(0, self.config['threads'])
        self.read_int_option(section, "timeout", min=1)
        self.read_int_option(section, "aborttimeout", min=1)
        self.read_int_option(section, "recursionlevel", min=-1)
        self.read_string_option(section, "useragent")
        self.read_float_option(section, "maxrequestspersecond", min=0.001)
        self.read_int_option(section, "maxnumurls", min=0)
        self.read_int_option(section, "maxfilesizeparse", min=1)
        self.read_int_option(section, "maxfilesizedownload", min=1)
        if self.has_option(section, "allowedschemes"):
            self.config['allowedschemes'] = [
                x.strip().lower()
                for x in self.get(section, 'allowedschemes').split(',')
            ]
        self.read_boolean_option(section, "debugmemory")
        self.read_string_option(section, "cookiefile")
        self.read_boolean_option(section, "robotstxt")
        self.read_string_option(section, "localwebroot")
        # sslverify is either a boolean or, if it does not parse as one,
        # kept as a plain string.
        try:
            self.read_boolean_option(section, "sslverify")
        except ValueError:
            self.read_string_option(section, "sslverify")
        self.read_int_option(section, "maxrunseconds", min=0)
        self.read_int_option(section, "resultcachesize", min=0)

    def read_authentication_config(self):
        """Read configuration options in section "authentication".

        @raises: LinkCheckerError on a malformed entry or an unsupported
            login URL scheme
        """
        section = "authentication"
        # names of the entries that carry a password, for the warning below
        password_fields = []
        if self.has_option(section, "entry"):
            for val in read_multiline(self.get(section, "entry")):
                # an entry is "pattern user [password]"
                auth = val.split()
                if len(auth) == 3:
                    self.config.add_auth(
                        pattern=auth[0], user=auth[1], password=auth[2]
                    )
                    password_fields.append(f"entry/{auth[0]}/{auth[1]}")
                elif len(auth) == 2:
                    self.config.add_auth(pattern=auth[0], user=auth[1])
                else:
                    raise LinkCheckerError(
                        _("missing auth part in entry %(val)r") % {"val": val}
                    )
        # read login URL and field names
        if self.has_option(section, "loginurl"):
            val = self.get(section, "loginurl").strip()
            if not val.lower().startswith(("http:", "https:")):
                raise LinkCheckerError(
                    _(
                        "invalid login URL `%s'. Only "
                        "HTTP and HTTPS URLs are supported."
                    )
                    % val
                )
            self.config["loginurl"] = val
        self.read_string_option(section, "loginuserfield")
        self.read_string_option(section, "loginpasswordfield")
        # read login extra fields
        if self.has_option(section, "loginextrafields"):
            for val in read_multiline(self.get(section, "loginextrafields")):
                # each line is "name:value"; only the first colon separates
                name, value = val.split(":", 1)
                self.config["loginextrafields"][name] = value
        self.check_password_readable(section, password_fields)

    def check_password_readable(self, section, fields):
        """Check if there is a readable configuration file and print a warning.

        @param section: name of the section holding the passwords
        @param fields: names of the options that contain a password;
            nothing is checked when this is empty
        """
        if not fields:
            return
        # The information which of the configuration files
        # included which option is not available. To avoid false positives,
        # a warning is only printed if exactly one file has been read.
        if len(self.read_ok) != 1:
            return
        fn = self.read_ok[0]
        if fileutil.is_accessable_by_others(fn):
            log.warn(
                LOG_CHECK,
                _(
                    "The configuration file %s contains password information (in"
                    " section [%s] and options %s) and the file is readable by"
                    " others. Please make the file only readable by you."
                ),
                fn,
                section,
                fields,
            )
            # add a platform specific hint on how to fix the permissions
            if os.name == 'posix':
                log.warn(LOG_CHECK, _("For example execute 'chmod go-rw %s'.") % fn)
            elif os.name == 'nt':
                log.warn(
                    LOG_CHECK,
                    _(
                        "See %(url)s for more info on setting file permissions."
                    ) % {"url": "https://support.microsoft.com/kb/308419"}
                )

    def read_filtering_config(self):
        """
        Read configuration options in section "filtering".
        """
        section = "filtering"
        if self.has_option(section, "ignorewarnings"):
            self.config['ignorewarnings'] = [
                f.strip().lower()
                for f in self.get(section, 'ignorewarnings').split(',')
            ]
        self._read_regex_pairs(section, "ignorewarningsforurls")
        # "ignore" and "nofollow" patterns both end up in "externlinks",
        # they only differ in the strict flag of the pattern.
        if self.has_option(section, "ignore"):
            for line in read_multiline(self.get(section, "ignore")):
                pat = get_link_pat(line, strict=1)
                self.config["externlinks"].append(pat)
        if self.has_option(section, "nofollow"):
            for line in read_multiline(self.get(section, "nofollow")):
                pat = get_link_pat(line, strict=0)
                self.config["externlinks"].append(pat)
        if self.has_option(section, "internlinks"):
            pat = get_link_pat(self.get(section, "internlinks"))
            self.config["internlinks"].append(pat)
        self.read_boolean_option(section, "checkextern")

    def read_plugin_config(self):
        """Read plugin-specific configuration values.

        A plugin is enabled when the configuration has a section named
        after the plugin class; the plugin class parses that section.
        """
        folders = self.config["pluginfolders"]
        modules = plugins.get_plugin_modules(folders)
        for pluginclass in plugins.get_plugin_classes(modules):
            section = pluginclass.__name__
            if self.has_section(section):
                self.config["enabledplugins"].append(section)
                self.config[section] = pluginclass.read_config(self)