# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
2006-05-03 18:24:46 +00:00
|
|
|
"""Parse configuration files"""
|
|
|
|
|
|
2020-05-15 18:37:04 +00:00
|
|
|
from configparser import RawConfigParser
|
2022-07-21 15:32:27 +00:00
|
|
|
from re import compile as re_compile
|
2012-10-15 12:36:10 +00:00
|
|
|
import os
|
2020-05-15 18:37:04 +00:00
|
|
|
|
2020-05-30 16:01:36 +00:00
|
|
|
from .. import (
|
|
|
|
|
LinkCheckerError,
|
|
|
|
|
get_link_pat,
|
|
|
|
|
LOG_CHECK,
|
|
|
|
|
log,
|
|
|
|
|
fileutil,
|
|
|
|
|
plugins,
|
|
|
|
|
logconf,
|
|
|
|
|
)
|
2006-05-03 18:24:46 +00:00
|
|
|
|
2007-12-01 15:50:33 +00:00
|
|
|
|
2020-05-16 19:19:42 +00:00
|
|
|
def read_multiline(value):
    """Yield the meaningful lines of a multiline option value.

    Every line of ``value`` is stripped of surrounding whitespace.
    Lines that are empty after stripping, and lines starting with
    ``#``, are skipped; all remaining lines are yielded in order.
    """
    stripped_lines = (raw.strip() for raw in value.splitlines())
    for entry in stripped_lines:
        if entry and not entry.startswith('#'):
            yield entry
|
|
|
|
|
|
|
|
|
|
|
2020-05-16 19:19:42 +00:00
|
|
|
class LCConfigParser(RawConfigParser):
    """
    Parse a LinkChecker configuration file.

    The parsed option values are stored into the ``config`` mapping that
    is handed to the constructor.
    """

    def __init__(self, config):
        """Initialize configuration.

        @param config: mapping that receives the parsed option values
        """
        super().__init__()
        self.config = config

    def read(self, files):
        """Read settings from given config files.

        The list of files that could be read is stored in
        ``self.read_ok``; a warning is logged for files that could not
        be read.

        @param files: list of configuration file names
        @raises: LinkCheckerError on syntax errors in the config file(s)
        """
        assert isinstance(files, list), "Invalid file list %r" % files
        try:
            self.read_ok = super().read(files)
            if len(self.read_ok) < len(files):
                failed_files = set(files) - set(self.read_ok)
                log.warn(
                    LOG_CHECK, "Could not read configuration files %s.", failed_files
                )
            # Read all the configuration parameters from the given files.
            self.read_checking_config()
            self.read_authentication_config()
            self.read_filtering_config()
            self.read_output_config()
            self.read_plugin_config()
        except Exception as msg:
            # Chain the original exception so its traceback is not lost.
            raise LinkCheckerError(
                _("Error parsing configuration: %s") % str(msg)
            ) from msg

    def read_string_option(self, section, option, allowempty=False):
        """Read a string option.

        The value is stored in the configuration under the option name.
        Nothing happens when the option is not set.

        @param section: name of the configuration section
        @param option: name of the option
        @param allowempty: if False, an empty value is rejected
        @raises: LinkCheckerError on a disallowed empty value
        """
        if self.has_option(section, option):
            value = self.get(section, option)
            if not allowempty and not value:
                raise LinkCheckerError(
                    _("invalid empty value for %s: %s\n") % (option, value)
                )
            self.config[option] = value

    def read_boolean_option(self, section, option):
        """Read a boolean option.

        The value is stored in the configuration under the option name.
        Nothing happens when the option is not set.
        """
        if self.has_option(section, option):
            self.config[option] = self.getboolean(section, option)

    def read_int_option(self, section, option, key=None, min=None, max=None):
        """Read an integer option.

        Nothing happens when the option is not set.

        @param section: name of the configuration section
        @param option: name of the option
        @param key: configuration key to store the value under;
            defaults to the option name
        @param min: smallest allowed value, or None for no lower bound
        @param max: largest allowed value, or None for no upper bound
        @raises: LinkCheckerError if the value is out of bounds
        """
        if self.has_option(section, option):
            num = self.getint(section, option)
            if min is not None and num < min:
                raise LinkCheckerError(
                    _("invalid value for %s: %d must not be less than %d")
                    % (option, num, min)
                )
            # The upper bound is violated only by values above max.
            if max is not None and num > max:
                raise LinkCheckerError(
                    _("invalid value for %s: %d must not be greater than %d")
                    % (option, num, max)
                )
            if key is None:
                key = option
            self.config[key] = num

    def read_output_config(self):
        """Read configuration options in section "output".

        Also reads the per-logger sections and the deprecated
        "blacklist" section, whose options are stored under "failures".
        """
        section = "output"
        from ..logger import LoggerClasses

        if self.has_section("blacklist"):
            log.warn(
                LOG_CHECK,
                _("The blacklist section in linkcheckerrc is deprecated, "
                  "please rename to failures")
            )
            for opt in self.options("blacklist"):
                self.config["failures"][opt] = self.get("blacklist", opt)
        for c in LoggerClasses:
            key = c.LoggerName
            if self.has_section(key):
                for opt in self.options(key):
                    self.config[key][opt] = self.get(key, opt)
                if self.has_option(key, 'parts'):
                    # "parts" is a comma-separated list; store it as a
                    # list of lowercased names.
                    val = self.get(key, 'parts')
                    parts = [f.strip().lower() for f in val.split(',')]
                    self.config[key]['parts'] = parts
        self.read_boolean_option(section, "warnings")
        if self.has_option(section, "verbose"):
            if self.getboolean(section, "verbose"):
                # verbose also switches warnings on
                self.config["verbose"] = True
                self.config["warnings"] = True
        if self.has_option(section, "quiet"):
            if self.getboolean(section, "quiet"):
                self.config['output'] = 'none'
                self.config['quiet'] = True
                logconf.reset_loglevel()  # if debug will be overwritten next
        if self.has_option(section, "debug"):
            val = self.get(section, "debug")
            parts = [f.strip().lower() for f in val.split(',')]
            logconf.set_debug(parts)
        self.read_boolean_option(section, "status")
        if self.has_option(section, "log"):
            val = self.get(section, "log").strip().lower()
            self.config['output'] = val
        if self.has_option(section, "fileoutput"):
            loggers = self.get(section, "fileoutput").split(",")
            # strip names from whitespace
            loggers = (x.strip().lower() for x in loggers)
            # no file output for the failures and none Logger
            from ..logger import LoggerNames

            loggers = (
                x
                for x in loggers
                if x in LoggerNames and x not in ("failures", "none")
            )
            for val in loggers:
                output = self.config.logger_new(val, fileoutput=1)
                self.config['fileoutput'].append(output)
        if self.has_option(section, "ignoreerrors"):
            for line in read_multiline(self.get(section, "ignoreerrors")):
                # Each line holds one or two whitespace-separated regular
                # expressions; a missing second one becomes the empty
                # pattern.
                parts = line.split(maxsplit=1)
                if len(parts) == 1:
                    parts.append('')
                self.config["ignoreerrors"].append(tuple(
                    re_compile(part) for part in parts
                ))

    def read_checking_config(self):
        """Read configuration options in section "checking"."""
        section = "checking"
        self.read_int_option(section, "threads", min=-1)
        # negative thread counts are stored as zero
        self.config['threads'] = max(0, self.config['threads'])
        self.read_int_option(section, "timeout", min=1)
        self.read_int_option(section, "aborttimeout", min=1)
        self.read_int_option(section, "recursionlevel", min=-1)
        self.read_string_option(section, "nntpserver")
        self.read_string_option(section, "useragent")
        self.read_int_option(section, "maxrequestspersecond", min=1)
        self.read_int_option(section, "maxnumurls", min=0)
        self.read_int_option(section, "maxfilesizeparse", min=1)
        self.read_int_option(section, "maxfilesizedownload", min=1)
        if self.has_option(section, "allowedschemes"):
            self.config['allowedschemes'] = [
                x.strip().lower()
                for x in self.get(section, 'allowedschemes').split(',')
            ]
        self.read_boolean_option(section, "debugmemory")
        self.read_string_option(section, "cookiefile")
        self.read_boolean_option(section, "robotstxt")
        self.read_string_option(section, "localwebroot")
        try:
            self.read_boolean_option(section, "sslverify")
        except ValueError:
            # not a boolean value; store the raw string instead
            self.read_string_option(section, "sslverify")
        self.read_int_option(section, "maxrunseconds", min=0)
        self.read_int_option(section, "resultcachesize", min=0)

    def read_authentication_config(self):
        """Read configuration options in section "authentication".

        @raises: LinkCheckerError on an incomplete "entry" line or a
            login URL that is neither HTTP nor HTTPS
        """
        section = "authentication"
        password_fields = []
        if self.has_option(section, "entry"):
            for val in read_multiline(self.get(section, "entry")):
                # An entry is "pattern user [password]".
                auth = val.split()
                if len(auth) == 3:
                    self.config.add_auth(
                        pattern=auth[0], user=auth[1], password=auth[2]
                    )
                    password_fields.append("entry/%s/%s" % (auth[0], auth[1]))
                elif len(auth) == 2:
                    self.config.add_auth(pattern=auth[0], user=auth[1])
                else:
                    raise LinkCheckerError(
                        _("missing auth part in entry %(val)r") % {"val": val}
                    )
        # read login URL and field names
        if self.has_option(section, "loginurl"):
            val = self.get(section, "loginurl").strip()
            if not val.lower().startswith(("http:", "https:")):
                raise LinkCheckerError(
                    _(
                        "invalid login URL `%s'. Only "
                        "HTTP and HTTPS URLs are supported."
                    )
                    % val
                )
            self.config["loginurl"] = val
        self.read_string_option(section, "loginuserfield")
        self.read_string_option(section, "loginpasswordfield")
        # read login extra fields
        if self.has_option(section, "loginextrafields"):
            for val in read_multiline(self.get(section, "loginextrafields")):
                # each line is "name:value", split at the first colon
                name, value = val.split(":", 1)
                self.config["loginextrafields"][name] = value
        self.check_password_readable(section, password_fields)

    def check_password_readable(self, section, fields):
        """Warn if a configuration file holding passwords is readable by others.

        @param section: name of the section containing the passwords
        @param fields: names of the options holding password
            information; nothing is checked when this is empty
        """
        if not fields:
            return
        # The information which of the configuration files
        # included which option is not available. To avoid false positives,
        # a warning is only printed if exactly one file has been read.
        if len(self.read_ok) != 1:
            return
        fn = self.read_ok[0]
        if fileutil.is_accessable_by_others(fn):
            log.warn(
                LOG_CHECK,
                _(
                    "The configuration file %s contains password information (in"
                    " section [%s] and options %s) and the file is readable by"
                    " others. Please make the file only readable by you."
                ),
                fn,
                section,
                fields,
            )
            if os.name == 'posix':
                log.warn(LOG_CHECK, _("For example execute 'chmod go-rw %s'.") % fn)
            elif os.name == 'nt':
                log.warn(
                    LOG_CHECK,
                    _(
                        "See %(url)s for more info on setting file permissions."
                    ) % {"url": "https://support.microsoft.com/kb/308419"}
                )

    def read_filtering_config(self):
        """
        Read configuration options in section "filtering".
        """
        section = "filtering"
        if self.has_option(section, "ignorewarnings"):
            self.config['ignorewarnings'] = [
                f.strip().lower()
                for f in self.get(section, 'ignorewarnings').split(',')
            ]
        if self.has_option(section, "ignore"):
            for line in read_multiline(self.get(section, "ignore")):
                pat = get_link_pat(line, strict=1)
                self.config["externlinks"].append(pat)
        if self.has_option(section, "nofollow"):
            for line in read_multiline(self.get(section, "nofollow")):
                pat = get_link_pat(line, strict=0)
                self.config["externlinks"].append(pat)
        if self.has_option(section, "internlinks"):
            pat = get_link_pat(self.get(section, "internlinks"))
            self.config["internlinks"].append(pat)
        self.read_boolean_option(section, "checkextern")

    def read_plugin_config(self):
        """Read plugin-specific configuration values.

        A plugin is enabled when the configuration has a section named
        after the plugin class; the result of the class's read_config()
        is stored in the configuration under that name.
        """
        folders = self.config["pluginfolders"]
        modules = plugins.get_plugin_modules(folders)
        for pluginclass in plugins.get_plugin_classes(modules):
            section = pluginclass.__name__
            if self.has_section(section):
                self.config["enabledplugins"].append(section)
                self.config[section] = pluginclass.read_config(self)
|