mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-03 22:50:30 +00:00
While this flag can be abused, it seems to me like a legitimate use case that you want to check a fairly small document for mistakes, which includes references to a website which has a robots.txt that denies all robots. It turns out that most websites do *not* add a permission for LinkChecker to use their site, and some sites, like the Debian BTS for example, are very hostile to bots in general. Between me using linkchecker and me using my web browser to check those links one by one, there is not a big difference. In fact, using linkchecker may be *better* for the website because it will use HEAD requests instead of GET requests, and will not fetch all page elements (javascript, images, etc.), which can often be fairly big. Besides, hostile users will patch the software themselves: it took me only a few minutes to disable the check, and a few more to make that into a proper patch. By forcing robots.txt without any other option, we are hurting our good users and not keeping hostile users from doing harm. The patch is still incomplete, but works. It lacks documentation and unit tests. Closes: #508
239 lines
11 KiB
Python
239 lines
11 KiB
Python
# -*- coding: iso-8859-1 -*-
|
|
# Copyright (C) 2000-2014 Bastian Kleineidam
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License along
|
|
# with this program; if not, write to the Free Software Foundation, Inc.,
|
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
"""Parse configuration files"""
|
|
|
|
import ConfigParser
|
|
import os
|
|
from .. import LinkCheckerError, get_link_pat, LOG_CHECK, log, fileutil, plugins, logconf
|
|
|
|
|
|
def read_multiline (value):
    """Yield the meaningful lines of a multiline option value.

    The value is split into lines and each line is stripped of
    surrounding whitespace. Empty lines and lines starting with '#'
    (comments) are skipped; every other line is yielded in order.
    """
    stripped_lines = (raw.strip() for raw in value.splitlines())
    for entry in stripped_lines:
        if entry and not entry.startswith('#'):
            yield entry
|
|
|
|
|
|
class LCConfigParser (ConfigParser.RawConfigParser, object):
    """
    Parse a LinkChecker configuration file.

    The parser reads INI-style files through RawConfigParser and copies
    the recognized options into the configuration object handed to the
    constructor.
    """

    def __init__ (self, config):
        """Initialize configuration.

        @param config: mapping-like configuration object that receives
            the parsed option values
        """
        super(LCConfigParser, self).__init__()
        self.config = config

    def read (self, files):
        """Read settings from given config files.

        The list of files that could be parsed is stored in
        self.read_ok; a warning is logged for files that could not be
        read. Afterwards all known sections are evaluated.

        @param files: list of configuration file names
        @raises: LinkCheckerError on syntax errors in the config file(s)
        """
        assert isinstance(files, list), "Invalid file list %r" % files
        try:
            self.read_ok = super(LCConfigParser, self).read(files)
            if len(self.read_ok) < len(files):
                failed_files = set(files) - set(self.read_ok)
                log.warn(LOG_CHECK, "Could not read configuration files %s.", failed_files)
            # Read all the configuration parameters from the given files.
            self.read_checking_config()
            self.read_authentication_config()
            self.read_filtering_config()
            self.read_output_config()
            self.read_plugin_config()
        except Exception as msg:
            # Any failure while parsing or evaluating options is reported
            # uniformly as a configuration error.
            raise LinkCheckerError(
                _("Error parsing configuration: %s") % unicode(msg))

    def read_string_option (self, section, option, allowempty=False):
        """Read a string option and store it under the option name.

        Nothing happens when the option is not set.

        @param allowempty: when false, an empty value is rejected
        @raises: LinkCheckerError if the value is empty and allowempty
            is false
        """
        if self.has_option(section, option):
            value = self.get(section, option)
            if not allowempty and not value:
                raise LinkCheckerError(_("invalid empty value for %s: %s\n") % (option, value))
            self.config[option] = value

    def read_boolean_option(self, section, option):
        """Read a boolean option and store it under the option name.

        Nothing happens when the option is not set.
        """
        if self.has_option(section, option):
            self.config[option] = self.getboolean(section, option)

    def read_int_option (self, section, option, key=None, min=None, max=None):
        """Read an integer option.

        Nothing happens when the option is not set.

        @param key: configuration key to store the value under; defaults
            to the option name
        @param min: smallest allowed value (inclusive), or None for no
            lower bound
        @param max: largest allowed value (inclusive), or None for no
            upper bound
        @raises: LinkCheckerError if the value lies outside [min, max]
        """
        if self.has_option(section, option):
            num = self.getint(section, option)
            if min is not None and num < min:
                raise LinkCheckerError(
                    _("invalid value for %s: %d must not be less than %d") % (option, num, min))
            # The upper bound is violated only when the value exceeds it.
            if max is not None and num > max:
                raise LinkCheckerError(
                    _("invalid value for %s: %d must not be greater than %d") % (option, num, max))
            if key is None:
                key = option
            self.config[key] = num

    def read_output_config (self):
        """Read configuration options in section "output".

        Additionally, every section named after a logger class is copied
        into the configuration entry of that logger.
        """
        section = "output"
        from ..logger import LoggerClasses
        for c in LoggerClasses:
            key = c.LoggerName
            if self.has_section(key):
                for opt in self.options(key):
                    self.config[key][opt] = self.get(key, opt)
                if self.has_option(key, 'parts'):
                    # 'parts' is a comma-separated list; store it as a
                    # list of normalized names instead of the raw string.
                    val = self.get(key, 'parts')
                    parts = [f.strip().lower() for f in val.split(',')]
                    self.config[key]['parts'] = parts
        self.read_boolean_option(section, "warnings")
        if self.has_option(section, "verbose"):
            if self.getboolean(section, "verbose"):
                # verbose also switches warnings on
                self.config["verbose"] = True
                self.config["warnings"] = True
        if self.has_option(section, "quiet"):
            if self.getboolean(section, "quiet"):
                # quiet selects the 'none' output
                self.config['output'] = 'none'
                self.config['quiet'] = True
        if self.has_option(section, "debug"):
            val = self.get(section, "debug")
            parts = [f.strip().lower() for f in val.split(',')]
            logconf.set_debug(parts)
        self.read_boolean_option(section, "status")
        if self.has_option(section, "log"):
            val = self.get(section, "log").strip().lower()
            self.config['output'] = val
        if self.has_option(section, "fileoutput"):
            loggers = self.get(section, "fileoutput").split(",")
            # strip names from whitespace
            loggers = (x.strip().lower() for x in loggers)
            # no file output for the blacklist and none Logger
            from ..logger import LoggerNames
            loggers = (x for x in loggers if x in LoggerNames and
                       x not in ("blacklist", "none"))
            for val in loggers:
                output = self.config.logger_new(val, fileoutput=1)
                self.config['fileoutput'].append(output)

    def read_checking_config (self):
        """Read configuration options in section "checking"."""
        section = "checking"
        self.read_int_option(section, "threads", min=-1)
        # a negative thread count is stored as zero
        self.config['threads'] = max(0, self.config['threads'])
        self.read_int_option(section, "timeout", min=1)
        self.read_int_option(section, "aborttimeout", min=1)
        self.read_int_option(section, "recursionlevel", min=-1)
        self.read_string_option(section, "nntpserver")
        self.read_string_option(section, "useragent")
        self.read_int_option(section, "maxrequestspersecond", min=1)
        self.read_int_option(section, "maxnumurls", min=0)
        self.read_int_option(section, "maxfilesizeparse", min=1)
        self.read_int_option(section, "maxfilesizedownload", min=1)
        if self.has_option(section, "allowedschemes"):
            self.config['allowedschemes'] = [
                x.strip().lower()
                for x in self.get(section, 'allowedschemes').split(',')]
        self.read_boolean_option(section, "debugmemory")
        self.read_string_option(section, "cookiefile")
        self.read_boolean_option(section, "robotstxt")
        self.read_string_option(section, "localwebroot")
        try:
            self.read_boolean_option(section, "sslverify")
        except ValueError:
            # not a boolean literal: keep the value as a plain string
            self.read_string_option(section, "sslverify")
        self.read_int_option(section, "maxrunseconds", min=0)

    def read_authentication_config (self):
        """Read configuration options in section "authentication".

        @raises: LinkCheckerError if an "entry" line does not have two
            or three whitespace-separated parts, or if the login URL is
            not an HTTP or HTTPS URL
        """
        section = "authentication"
        # names of the entries that carry a password, used for the
        # file permission warning at the end
        password_fields = []
        if self.has_option(section, "entry"):
            for val in read_multiline(self.get(section, "entry")):
                auth = val.split()
                if len(auth) == 3:
                    self.config.add_auth(pattern=auth[0], user=auth[1],
                                         password=auth[2])
                    password_fields.append("entry/%s/%s" % (auth[0], auth[1]))
                elif len(auth) == 2:
                    self.config.add_auth(pattern=auth[0], user=auth[1])
                else:
                    raise LinkCheckerError(
                        _("missing auth part in entry %(val)r") % {"val": val})
        # read login URL and field names
        if self.has_option(section, "loginurl"):
            val = self.get(section, "loginurl").strip()
            if not val.lower().startswith(("http:", "https:")):
                raise LinkCheckerError(_("invalid login URL `%s'. Only "
                    "HTTP and HTTPS URLs are supported.") % val)
            self.config["loginurl"] = val
        self.read_string_option(section, "loginuserfield")
        self.read_string_option(section, "loginpasswordfield")
        # read login extra fields
        if self.has_option(section, "loginextrafields"):
            for val in read_multiline(self.get(section, "loginextrafields")):
                # each line has the form name:value; only the first
                # colon separates, so values may contain colons
                name, value = val.split(":", 1)
                self.config["loginextrafields"][name] = value
        self.check_password_readable(section, password_fields)

    def check_password_readable(self, section, fields):
        """Check if there is a readable configuration file and print a warning.

        @param section: name of the section holding the passwords
        @param fields: names of the options that contain passwords; no
            check is made when this is empty
        """
        if not fields:
            return
        # The information which of the configuration files
        # included which option is not available. To avoid false positives,
        # a warning is only printed if exactly one file has been read.
        if len(self.read_ok) != 1:
            return
        fn = self.read_ok[0]
        if fileutil.is_accessable_by_others(fn):
            log.warn(LOG_CHECK, "The configuration file %s contains password information (in section [%s] and options %s) and the file is readable by others. Please make the file only readable by you.", fn, section, fields)
            if os.name == 'posix':
                log.warn(LOG_CHECK, _("For example execute 'chmod go-rw %s'.") % fn)
            elif os.name == 'nt':
                log.warn(LOG_CHECK, _("See http://support.microsoft.com/kb/308419 for more info on setting file permissions."))

    def read_filtering_config (self):
        """
        Read configuration options in section "filtering".

        Patterns from both "ignore" (built with strict=1) and "nofollow"
        (built with strict=0) are appended to the "externlinks" list.
        """
        section = "filtering"
        if self.has_option(section, "ignorewarnings"):
            self.config['ignorewarnings'] = [
                f.strip().lower()
                for f in self.get(section, 'ignorewarnings').split(',')]
        if self.has_option(section, "ignore"):
            for line in read_multiline(self.get(section, "ignore")):
                pat = get_link_pat(line, strict=1)
                self.config["externlinks"].append(pat)
        if self.has_option(section, "nofollow"):
            for line in read_multiline(self.get(section, "nofollow")):
                pat = get_link_pat(line, strict=0)
                self.config["externlinks"].append(pat)
        if self.has_option(section, "internlinks"):
            pat = get_link_pat(self.get(section, "internlinks"))
            self.config["internlinks"].append(pat)
        self.read_boolean_option(section, "checkextern")

    def read_plugin_config(self):
        """Read plugin-specific configuration values.

        A plugin is enabled when the configuration contains a section
        named after the plugin class; the plugin class then reads its
        own options from this parser.
        """
        folders = self.config["pluginfolders"]
        modules = plugins.get_plugin_modules(folders)
        for pluginclass in plugins.get_plugin_classes(modules):
            section = pluginclass.__name__
            if self.has_section(section):
                self.config["enabledplugins"].append(section)
                self.config[section] = pluginclass.read_config(self)
|