mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-18 23:10:32 +00:00
388 lines
14 KiB
Python
388 lines
14 KiB
Python
# -*- coding: iso-8859-1 -*-
|
|
# Copyright (C) 2000-2010 Bastian Kleineidam
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License along
|
|
# with this program; if not, write to the Free Software Foundation, Inc.,
|
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
"""
|
|
Store metadata and options.
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
import re
|
|
import logging.config
|
|
import urllib
|
|
import urlparse
|
|
import shutil
|
|
import _LinkChecker_configdata as configdata
|
|
from .. import (log, LOG_CHECK, LOG_ROOT, ansicolor, lognames, clamav,
|
|
get_config_dir)
|
|
from . import confparse
|
|
|
|
# Application metadata, read from the configdata module imported above.
Version = configdata.version
AppName = configdata.appname
App = AppName+u" "+Version
Author = configdata.author
# The Html* variants are meant for HTML output.  They use ASCII character
# entities instead of raw non-breaking-space / copyright characters, so the
# values do not depend on the source file encoding and the replacement
# below cannot silently degrade into a space-for-space no-op.
HtmlAuthor = Author.replace(u' ', u'&nbsp;')
Copyright = u"Copyright (C) 2000-2010 "+Author
HtmlCopyright = u"Copyright &copy; 2000-2010 "+HtmlAuthor
AppInfo = App+u" "+Copyright
HtmlAppInfo = App+u", "+HtmlCopyright
Url = configdata.url
SupportUrl = u"http://sourceforge.net/projects/linkchecker/support"
Email = configdata.author_email
# Identification string built from name, version and project URL.
UserAgent = u"%s/%s (+%s)" % (AppName, Version, Url)
Freeware = AppName+u""" comes with ABSOLUTELY NO WARRANTY!
This is free software, and you are welcome to redistribute it
under certain conditions. Look at the file `LICENSE' within this
distribution."""
|
|
|
|
|
|
def normpath (path):
    """
    Norm given system path with all available norm funcs in os.path.

    Applied in this order: expand a leading "~" to the user's home
    directory, collapse redundant separators and ".." components, then
    normalize the case for the current platform.

    @param path: file system path to normalize
    @return: the normalized path
    """
    expanded = os.path.expanduser(path)
    return os.path.normcase(os.path.normpath(expanded))
|
|
|
|
|
|
# dynamic options
|
|
class Configuration (dict):
    """
    Storage for configuration options. Options can both be given from
    the command line as well as from configuration files.

    The object is a dictionary mapping option names to values:
    __init__() fills in the defaults, read() merges configuration
    files on top of them and sanitize() repairs inconsistent
    combinations of options.
    """

    def __init__ (self):
        """
        Initialize the default options.
        """
        super(Configuration, self).__init__()
        self['trace'] = False
        self["verbose"] = False
        self["complete"] = False
        self["warnings"] = True
        self["ignorewarnings"] = []
        self['quiet'] = False
        self["anchors"] = False
        self["externlinks"] = []
        self["internlinks"] = []
        self["interactive"] = False
        # on ftp, password is set by Pythons ftplib
        self["authentication"] = []
        self["loginurl"] = None
        self["loginuserfield"] = "login"
        self["loginpasswordfield"] = "password"
        self["loginextrafields"] = {}
        self["proxy"] = urllib.getproxies()
        self["recursionlevel"] = -1
        self["wait"] = 0
        self['sendcookies'] = False
        self['storecookies'] = False
        self["status"] = False
        self["status_wait_seconds"] = 5
        self["fileoutput"] = []
        # Logger configurations
        self["text"] = {
            "filename": "linkchecker-out.txt",
            'colorparent': "default",
            'colorurl': "default",
            'colorname': "default",
            'colorreal': "cyan",
            'colorbase': "purple",
            'colorvalid': "bold;green",
            'colorinvalid': "bold;red",
            'colorinfo': "default",
            'colorwarning': "bold;yellow",
            'colordltime': "default",
            'colordlsize': "default",
            'colorreset': "default",
        }
        self['html'] = {
            "filename": "linkchecker-out.html",
            'colorbackground': '#fff7e5',
            'colorurl': '#dcd5cf',
            'colorborder': '#000000',
            'colorlink': '#191c83',
            'colorwarning': '#e0954e',
            'colorerror': '#db4930',
            'colorok': '#3ba557',
        }
        self['gml'] = {
            "filename": "linkchecker-out.gml",
        }
        self['sql'] = {
            "filename": "linkchecker-out.sql",
            'separator': ';',
            'dbname': 'linksdb',
        }
        self['csv'] = {
            "filename": "linkchecker-out.csv",
            'separator': ';',
            "quotechar": '"',
        }
        self['blacklist'] = {
            "filename": "~/.linkchecker/blacklist",
        }
        self['xml'] = {
            "filename": "linkchecker-out.xml",
        }
        self['gxml'] = {
            "filename": "linkchecker-out.gxml",
        }
        self['dot'] = {
            "filename": "linkchecker-out.dot",
            "encoding": "ascii",
        }
        self['none'] = {}
        self['output'] = 'text'
        self['logger'] = None
        self["warningregex"] = None
        self["warnsizebytes"] = None
        self["nntpserver"] = os.environ.get("NNTP_SERVER", None)
        self["threads"] = 10
        # socket timeout in seconds
        self["timeout"] = 60
        self["checkhtml"] = False
        self["checkcss"] = False
        self["checkhtmlw3"] = False
        self["checkcssw3"] = False
        self["scanvirus"] = False
        self["clamavconf"] = clamav.canonical_clamav_conf()

    def init_logging (self, status_logger, debug=None, handler=None):
        """
        Load logging.conf file settings to set up the
        application logging (not to be confused with check loggers).
        When debug is not None it is expected to be a list of
        logger names for which debugging will be enabled.

        If no thread debugging is enabled, threading will be disabled.

        @param status_logger: object stored as self.status_logger
        @param debug: list of logger names to debug, or None
        @param handler: log handler to install; when None a colored
          stream handler writing to sys.stderr is created
        """
        filename = normpath(os.path.join(get_config_dir(), "logging.conf"))
        # the logging configuration file is optional
        if os.path.isfile(filename):
            logging.config.fileConfig(filename)
        if handler is None:
            handler = ansicolor.ColoredStreamHandler(strm=sys.stderr)
        self.add_loghandler(handler)
        self.set_debug(debug)
        self.status_logger = status_logger

    def set_debug (self, debug):
        """Set debugging levels for configured loggers. The argument
        is a list of logger names to enable debug for."""
        self.set_loglevel(debug, logging.DEBUG)

    def add_loghandler (self, handler):
        """Add log handler to root logger LOG_ROOT and set formatting.

        The thread name is part of the record format only while
        threading is enabled, i.e. self['threads'] > 0."""
        logging.getLogger(LOG_ROOT).addHandler(handler)
        # named "fmt" so the builtin format() is not shadowed
        if self['threads'] > 0:
            fmt = "%(levelname)s %(threadName)s %(message)s"
        else:
            fmt = "%(levelname)s %(message)s"
        handler.setFormatter(logging.Formatter(fmt))

    def remove_loghandler (self, handler):
        """Remove log handler from root logger LOG_ROOT."""
        logging.getLogger(LOG_ROOT).removeHandler(handler)

    def reset_loglevel (self):
        """Reset log level to display only warnings and errors."""
        self.set_loglevel(['all'], logging.WARN)

    def set_loglevel (self, loggers, level):
        """Set logging levels for given loggers.

        @param loggers: list of keys of the lognames mapping; the
          special entry 'all' selects every known logger. An empty
          list or None leaves everything untouched.
        @param level: level constant of the logging module
        """
        if not loggers:
            return
        if 'all' in loggers:
            loggers = lognames.keys()
        # disable threading if no thread debugging
        if "thread" not in loggers and level == logging.DEBUG:
            self['threads'] = 0
        for key in loggers:
            logging.getLogger(lognames[key]).setLevel(level)

    def logger_new (self, loggertype, **kwargs):
        """
        Instantiate new logger and return it.

        The constructor arguments are the options stored under the
        loggertype key, overridden by the given keyword arguments.
        """
        args = {}
        args.update(self[loggertype])
        args.update(kwargs)
        # imported here, not at module level, as in the original code
        from ..logger import Loggers
        return Loggers[loggertype](**args)

    def logger_add (self, loggertype, loggerclass, loggerargs=None):
        """
        Add a new logger type to the known loggers.

        @param loggertype: name under which the logger is registered
        @param loggerclass: class registered in the Loggers mapping
        @param loggerargs: default options for this logger type; an
          empty dictionary is used when None
        """
        if loggerargs is None:
            loggerargs = {}
        from ..logger import Loggers
        Loggers[loggertype] = loggerclass
        self[loggertype] = loggerargs

    def read (self, files=None):
        """
        Read settings from given config files.

        When no files are given the standard configuration files
        are used. Names that are not existing files are skipped.

        @raises: LinkCheckerError on syntax errors in the config file(s)
        """
        if files is None:
            cfiles = []
        else:
            # work on a copy, the list of the caller is not modified
            cfiles = files[:]
        if not cfiles:
            cfiles.extend(get_standard_config_files())
        # weed out invalid files
        cfiles = [f for f in cfiles if os.path.isfile(f)]
        log.debug(LOG_CHECK, "reading configuration from %s", cfiles)
        confparse.LCConfigParser(self).read(cfiles)
        self.sanitize()

    def add_auth (self, user=None, password=None, pattern=None):
        """Add authentication data for URLs matching a pattern.

        An entry without user or without pattern is not stored; a
        warning is logged instead.

        @param user: user name
        @param password: password, can be None
        @param pattern: regular expression string, compiled with
          re.compile() and matched against URLs
        """
        if not user or not pattern:
            log.warn(LOG_CHECK,
                _("warning: missing user or URL pattern in authentication data."))
            return
        entry = dict(
            user=user,
            password=password,
            pattern=re.compile(pattern),
        )
        self["authentication"].append(entry)

    def get_user_password (self, url):
        """Get tuple (user, password) from configured authentication
        that matches the given URL.
        Both user and password can be None if not specified, or no
        authentication matches the given URL.

        Entries are tried in the order they were added and the first
        one whose pattern matches at the start of the URL wins.
        """
        for auth in self["authentication"]:
            if auth['pattern'].match(url):
                return (auth['user'], auth['password'])
        return (None, None)

    def sanitize (self):
        "Make sure the configuration is consistent."
        if self["anchors"]:
            self.sanitize_anchors()
        if self['logger'] is None:
            self.sanitize_logger()
        if self['checkhtml']:
            self.sanitize_checkhtml()
        if self['checkcss']:
            self.sanitize_checkcss()
        if self['scanvirus']:
            self.sanitize_scanvirus()
        if self['storecookies']:
            self.sanitize_cookies()
        if self['loginurl']:
            self.sanitize_loginurl()

    def sanitize_anchors (self):
        """Make sure the anchor warning is not suppressed.

        If warnings are disabled they get enabled again with all
        known warnings ignored; in any case 'url-anchor-not-found'
        is removed from the ignored warnings."""
        if not self["warnings"]:
            self["warnings"] = True
            from ..checker import Warnings
            # store a real list: the value is modified with remove()
            # below, which a non-list keys() result would not support
            self["ignorewarnings"] = list(Warnings.keys())
        if 'url-anchor-not-found' in self["ignorewarnings"]:
            self["ignorewarnings"].remove('url-anchor-not-found')

    def sanitize_logger (self):
        """Create the check logger, falling back to the text logger
        when no output type is configured."""
        if not self['output']:
            log.warn(LOG_CHECK, _("warning: activating text logger output."))
            self['output'] = 'text'
        self['logger'] = self.logger_new(self['output'])

    def sanitize_checkhtml (self):
        """Disable HTML checking when the tidy module is missing."""
        try:
            # imported only to find out whether the module is available
            import tidy
        except ImportError:
            log.warn(LOG_CHECK,
                _("warning: tidy module is not available; "
                  "download from http://utidylib.berlios.de/"))
            self['checkhtml'] = False

    def sanitize_checkcss (self):
        """Disable CSS checking when the cssutils module is missing."""
        try:
            # imported only to find out whether the module is available
            import cssutils
        except ImportError:
            log.warn(LOG_CHECK,
                _("warning: cssutils module is not available; "
                  "download from http://cthedot.de/cssutils/"))
            self['checkcss'] = False

    def sanitize_scanvirus (self):
        """Disable virus scanning when Clamav can not be initialized."""
        try:
            clamav.init_clamav_conf(self['clamavconf'])
        except clamav.ClamavError:
            log.warn(LOG_CHECK,
                _("warning: Clamav could not be initialized"))
            self['scanvirus'] = False

    def sanitize_cookies (self):
        """Storing cookies implies sending them; activate sendcookies
        with a warning if it is not already set."""
        if not self['sendcookies']:
            log.warn(LOG_CHECK, _("warning: activating sendcookies "
                                  "because storecookies is active."))
            self['sendcookies'] = True

    def sanitize_loginurl (self):
        """Check that the login URL can be used and disable it
        otherwise.

        All checks are run, so every problem gets its own warning
        before the login URL is reset to None. A usable login URL
        activates storing and sending of cookies."""
        url = self["loginurl"]
        disable = False
        if not self["loginpasswordfield"]:
            log.warn(LOG_CHECK,
                _("warning: no CGI password fieldname given for login URL."))
            disable = True
        if not self["loginuserfield"]:
            log.warn(LOG_CHECK,
                _("warning: no CGI user fieldname given for login URL."))
            disable = True
        if self.get_user_password(url) == (None, None):
            log.warn(LOG_CHECK,
                _("warning: no user/password authentication data found for login URL."))
            disable = True
        if not url.lower().startswith(("http:", "https:")):
            log.warn(LOG_CHECK, _("warning: login URL is not a HTTP URL."))
            disable = True
        # scheme, host and path must all be given
        urlparts = urlparse.urlsplit(url)
        if not urlparts[0] or not urlparts[1] or not urlparts[2]:
            log.warn(LOG_CHECK, _("warning: login URL is incomplete."))
            disable = True
        if disable:
            log.warn(LOG_CHECK,
                _("warning: disabling login URL %(url)s.") % {"url": url})
            self["loginurl"] = None
        elif not self['storecookies']:
            # login URL implies storing and sending cookies
            self['storecookies'] = self['sendcookies'] = True
|
|
|
|
|
|
def get_standard_config_files ():
    """Try to generate user configuration file from the system wide
    configuration.
    Returns tuple (system config file, user config file).

    The copy is only attempted when the system file exists and the
    user file does not. It is best effort: a failure is logged as a
    warning and both paths are returned regardless, whether or not
    the files exist."""
    # system wide config settings
    syspath = normpath(os.path.join(get_config_dir(), "linkcheckerrc"))
    # per user config settings
    userpath = normpath("~/.linkchecker/linkcheckerrc")
    if os.path.isfile(syspath) and not os.path.exists(userpath):
        # copy the system configuration to the user configuration
        try:
            userdir = os.path.dirname(userpath)
            if not os.path.exists(userdir):
                os.makedirs(userdir)
            shutil.copy(syspath, userpath)
        except StandardError:
            # Report the reason of the failure as well. sys.exc_info()
            # is used instead of binding the exception in the except
            # clause so this works with every Python 2 version.
            log.warn(LOG_CHECK,
                "could not copy system config from %r to %r: %s",
                syspath, userpath, sys.exc_info()[1])
    return (syspath, userpath)
|