linkchecker/linkchecker
2005-05-09 22:05:21 +00:00

679 lines
26 KiB
Python
Executable file

#!/usr/bin/python2.4
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Check HTML pages for broken links.
"""
import sys
import getopt
import codecs
import re
import os
import pprint
import socket
import optparse
# set default 60 seconds socket timeout before importing anything else
default_timeout = 60
socket.setdefaulttimeout(default_timeout)
import linkcheck
# initialize i18n, puts _() function into global namespace
linkcheck.init_i18n()
# override optparse gettext method with the one from init_i18n()
optparse._ = _
# now import the rest of the linkchecker gang
import linkcheck.log
import linkcheck.checker
import linkcheck.checker.cache
import linkcheck.checker.consumer
import linkcheck.configuration
import linkcheck.strformat
# optional modules
try:
import optcomplete
has_optcomplete = True
except ImportError:
has_optcomplete = False
try:
import profile
has_profile = True
except ImportError:
has_profile = False
try:
import pstats
has_pstats = True
except ImportError:
has_pstats = False
# Name of the profiling data file: --profile writes it and --viewprof
# reads it (see the help text of both options).
_profile = "linkchecker.prof"
# Credentials for the authentication entry built from -u/--user and
# -p/--password; both stay None unless the option is given.
_username = None
_password = None
# Translated usage line, returned by LCOptionParser.get_usage().
Usage = _("""USAGE\tlinkchecker [options] file-or-url...
""")
# General remarks, appended to the option help by
# LCOptionParser.print_help().
Notes = _("""NOTES
o A ! before a regular expression negates it. So '!^mailto:' matches
everything but a mailto link.
o URLs on the command line starting with "ftp." are treated like
"ftp://ftp.", URLs starting with "www." are treated like "http://www.".
You can also give local files as arguments.
o If you have your system configured to automatically establish a
connection to the internet (e.g. with diald), it will connect when
checking links not pointing to your local system.
See the --ignore-url option on how to prevent this.
o Javascript links are currently ignored.
o If your platform does not support threading, LinkChecker disables it
automatically.
o You can supply multiple user/password pairs in a configuration file.
o To use proxies set $http_proxy, $https_proxy, $ftp_proxy, $gopher_proxy
on Unix or Windows.
On a Mac use the Internet Config.
o When checking 'news:' links the given NNTP host doesn't need to be the
same as the host of the user browsing your pages!
""")
# Meaning of the process exit status, appended to the option help by
# LCOptionParser.print_help().
Retval = _(r"""RETURN VALUE
The return value is non-zero when
o invalid links were found or
o link warnings were found and --warnings option was given
o a program error occurred
""")
# Example invocations, appended to the option help by
# LCOptionParser.print_help().  Raw string: the Windows path below
# contains backslashes that must not be read as escapes.
Examples = _(r"""EXAMPLES
The most common use checks the given domain recursively, plus any
single URL pointing outside of the domain:
linkchecker http://treasure.calvinsplayground.de/
Beware that this checks the whole site which can have several hundred
thousands URLs. Use the -r option to restrict the recursion depth.
Don't connect to mailto: hosts, only check their URL syntax. All other
links are checked as usual:
linkchecker --ignore-url=^mailto: www.mysite.org
Checking a local HTML file on Unix:
linkchecker ../bla.html
Checking a local HTML file on Windows:
linkchecker c:\temp\test.html
You can skip the "http://" url part if the domain starts with "www.":
linkchecker www.myhomepage.de
You can skip the "ftp://" url part if the domain starts with "ftp.":
linkchecker -r0 ftp.linux.org
""")
# Description of the available logger output types.
# NOTE(review): the name is misspelled ("Logertypes") and nothing in the
# visible part of this file references it; print_help() only prints
# Notes, Retval and Examples -- confirm whether it should be included.
Logertypes = _(r"""OUTPUT TYPES
Note that by default only errors are logged.
text Standard text output, logging URLs in keyword: argument fashion.
html Log URLs in keyword: argument fashion, formatted as HTML.
Additionally has links to the referenced pages. Invalid URLs have
HTML and CSS syntax check links appended.
csv Log check result in CSV format with one URL per line.
gml Log parent-child relations between linked URLs as a GML graph.
You should use the --verbose option to get a complete graph.
dot Log parent-child relations between linked URLs as a DOT graph.
You should use the --verbose option to get a complete graph.
xml Log check result as machine-readable XML file.
sql Log check result as SQL script with INSERT commands. An example
script to create the initial SQL table is included as create.sql.
blacklist
Suitable for cron jobs. Logs the check result into a file
~/.linkchecker/blacklist which only contains entries with invalid
URLs and the number of times they have failed.
none Logs nothing. Suitable for scripts.
""")
def encode (s, codec="iso8859-15"):
    """
    Turn a text string into its encoded form for printing on screen.

    Characters the codec cannot represent are dropped ("ignore" error
    handling) instead of raising an error.
    """
    encoded = s.encode(codec, "ignore")
    return encoded
def print_version ():
"""
Print the program version and exit.
"""
print encode(linkcheck.configuration.AppInfo)
sys.exit(0)
def print_usage (msg):
    """
    Report a command line error on stderr and terminate.

    Two lines are written: the translated error text containing msg and
    a hint how to display the help. The program then exits with
    status 1.
    """
    lines = (
        _("Error: %s") % msg,
        _("Execute 'linkchecker -h' for help"),
    )
    for line in lines:
        sys.stderr.write(encode(line))
        sys.stderr.write(os.linesep)
    sys.exit(1)
def viewprof ():
    """
    Print previously collected profiling data and exit.

    Reads the profile file named by _profile and prints the first 100
    entries sorted by cumulative time, then exits with status 0.
    Exits with status 1 when the pstats module is not available or the
    profile file does not exist.
    """
    if not has_pstats:
        linkcheck.log.error(linkcheck.LOG_CMDLINE,
            _("The `pstats' Python module is not installed,"
              " therefore the --viewprof option is disabled."))
        sys.exit(1)
    if not os.path.exists(_profile):
        linkcheck.log.warn(linkcheck.LOG_CMDLINE,
            _("Could not find profiling file %r.") % _profile)
        # Encode the translated hint the same way print_usage() does;
        # writing a raw translated string can fail on stderr when the
        # translation contains non-ASCII characters.
        sys.stderr.write(encode(
            _("Please run linkchecker with --profile to generate it.")))
        sys.stderr.write(os.linesep)
        sys.exit(1)
    stats = pstats.Stats(_profile)
    stats.strip_dirs().sort_stats("cumulative").print_stats(100)
    sys.exit(0)
def try_compile_re (arg):
    """
    Compile the regular expression arg and return the pattern object.

    If arg is not a valid regular expression, an error naming the
    expression and the reason is logged and the program exits with
    status 1.
    """
    try:
        return re.compile(arg)
    except re.error:
        # sys.exc_info() yields the active exception in a form that is
        # valid on old and new Python versions alike.
        msg = sys.exc_info()[1]
        # Format the *translated* message; the arguments must not be
        # handed to _() itself, which accepts only the message text.
        linkcheck.log.error(linkcheck.LOG_CMDLINE,
            _("Syntax error in %r: %s") % (arg, msg))
        sys.exit(1)
def has_encoding (encoding):
    """
    Tell whether a codec with the given name can be looked up.

    Returns True if codecs.lookup() finds the encoding, False if the
    lookup fails with a LookupError.
    """
    try:
        codecs.lookup(encoding)
    except LookupError:
        return False
    return True
class LCHelpFormatter (optparse.IndentedHelpFormatter):
    """
    Help formatter that keeps the paragraph layout of the option help:
    the help text is wrapped with linkcheck.strformat.wrap() and every
    resulting line is emitted on its own, indented to the help column.
    """

    def format_option (self, option):
        """
        Return the formatted help entry for one option.

        An entry has two parts: the option strings with metavars
        (e.g. "-x" or "-fFILENAME, --file=FILENAME") and the help text
        supplied for the option. When the option strings fit into the
        column left of the help position, the help starts on the same
        line; otherwise the option strings get a line of their own and
        the help starts on the next line at the help position.
        """
        option_text = self.option_strings[option]
        column_width = self.help_position - self.current_indent - 2
        if len(option_text) <= column_width:
            # help starts on the same line as the option strings
            head = "%*s%-*s " % (self.current_indent, "", column_width,
                                 option_text)
            first_indent = 0
        else:
            # option strings too long: they get their own line
            head = "%*s%s\n" % (self.current_indent, "", option_text)
            first_indent = self.help_position
        pieces = [head]
        if option.help:
            wrapped = linkcheck.strformat.wrap(option.help, self.help_width)
            wrapped_lines = wrapped.splitlines()
            pieces.append("%*s%s\n" % (first_indent, "", wrapped_lines[0]))
            for continuation in wrapped_lines[1:]:
                pieces.append("%*s%s\n" %
                              (self.help_position, "", continuation))
        elif not head.endswith("\n"):
            # no help text: just terminate the option line
            pieces.append("\n")
        return "".join(pieces)
class LCOptionParser (optparse.OptionParser, object):
"""
Option parser with custom help text layout.
"""
def __init__ (self):
"""
Initializing using our own help formatter class.
"""
super(LCOptionParser, self).__init__(formatter=LCHelpFormatter())
def error (self, msg):
"""
Print usage info and given message.
"""
print_usage(msg)
def get_usage (self):
"""
Return translated usage text.
"""
return Usage
def print_help (self, file=None):
"""
Print translated help text.
"""
s = u"%s\n%s\n%s\n%s" % (self.format_help(), Notes, Retval, Examples)
s = s.encode("iso-8859-1", "replace")
if os.name != 'posix':
linkcheck.strformat.paginate(s)
else:
print s
sys.exit(0)
# Create the option parser; the option groups defined below are attached
# to it one after the other.
optparser = LCOptionParser()

################# general options ##################
group = optparse.OptionGroup(optparser, _("General options"))
# -f: read the configuration from the given file
group.add_option(
    "-f", "--config",
    type="string",
    dest="configfile",
    help=_(
"""Use CONFIGFILE as configuration file. As default LinkChecker first
searches /etc/linkchecker/linkcheckerrc and then ~/.linkchecker/linkcheckerrc
(under Windows <path-to-program>\\linkcheckerrc)."""))
# -I: prompt for URLs when the command line has none
group.add_option(
    "-I", "--interactive",
    action="store_true",
    dest="interactive",
    help=_(
"""Ask for url if none are given on the commandline."""))
# -t: limit for the number of threads
group.add_option(
    "-t", "--threads",
    type="int",
    dest="threads",
    help=_(
"""Generate no more than num threads. Default number of threads is 10."""))
# -V: version output
group.add_option(
    "-V", "--version",
    action="store_true",
    dest="version",
    help=_(
"""Print version and exit."""))
# --priority: keep the normal thread scheduling priority
group.add_option(
    "--priority",
    action="store_true",
    dest="priority",
    help=_(
"""Run with normal thread scheduling priority. Per default LinkChecker
runs with low thread priority to be suitable as a background job."""))
# --disable-psyco: options.psyco defaults to True and is switched off here
group.add_option(
    "--disable-psyco",
    action="store_false",
    dest="psyco",
    default=True,
    help=_(
"""Do not use psyco runtime compilation even if it is installed."""))
optparser.add_option_group(group)
################# output options ##################
# Options controlling what is logged, in which format and where to.
group = optparse.OptionGroup(optparser, _("Output options"))
group.add_option("-v", "--verbose", action="store_true", dest="verbose",
    help=_(
"""Log all checked URLs (implies -w). Default is to log only invalid
URLs."""))
group.add_option("--no-warnings", action="store_false", dest="warnings",
    help=_("""Don't log warnings. Default is to log warnings."""))
group.add_option("-W", "--warning-regex", type="string", dest="warningregex",
    help=_(
"""Define a regular expression which prints a warning if it matches
any content of the checked link. This applies only to valid pages,
so we can get their content.
Use this to check for pages that contain some form of error
message, for example 'This page has moved' or 'Oracle
Application Server error'. This option implies -w."""))
# The value is a number of bytes: parse it as an integer so that
# non-numeric input is rejected by the option parser and the stored
# limit is a number instead of a string.
group.add_option("--warning-size-bytes", type="int", dest="warningsizebytes",
    help=_(
"""Print a warning if content size is available and exceeds the given
number of bytes. This option implies -w."""))
group.add_option("-q", "--quiet", action="store_true", dest="quiet",
    help=_(
"""Quiet operation. This is only useful with -F."""))
# The list of valid logger types is substituted into the help text
# after translation.
group.add_option("-o", "--output", type="string", dest="output",
    metavar="TYPE[/ENCODING]",
    help=_(
"""Specify output as %(loggertypes)s. Default output type is text.
ENCODING specifies the output encoding, the default is "iso-8859-15".
Valid encodings are listed at http://docs.python.org/lib/node127.html.""") % \
    {'loggertypes': linkcheck.LoggerKeys})
# -F may be given several times; the values are collected in a list.
group.add_option("-F", "--file-output", type="string", action="append",
    dest="fileoutput", metavar="TYPE[/ENCODING][/FILENAME]",
    help=_(
"""Output to a file linkchecker-out.TYPE, $HOME/.linkchecker/blacklist for
'blacklist' output, or FILENAME if specified.
ENCODING specifies the output encoding, the default is "iso-8859-15".
Valid encodings are listed at http://docs.python.org/lib/node127.html.
The FILENAME and ENCODING parts of the 'none' output type will be ignored,
else if the file already exists, it will be overwritten.
You can specify this option more than once. Valid file output TYPEs
are %(loggertypes)s. You can specify this option multiple times to output
to more than one file. Default is no file output. Note that you can
suppress all console output with the option '-o none'.""") % \
    {'loggertypes': linkcheck.LoggerKeys})
group.add_option("--no-status", action="store_false", dest="status",
    default=True, help=_(
"""Do not print check status messages."""))
# No explicit dest: optparse derives "debug" from the long option name.
group.add_option("-D", "--debug", type="string", action="append",
    metavar="LOGGER",
    help=_("""Print debugging output for given logger.
Available loggers are %(lognamelist)s.
Specifying 'all' is an alias for specifying all available loggers.
The option can be given multiple times to debug with more
than one logger.
For accurate results, threading and the psyco optimization module will
be disabled during debug runs.""") % \
    {"lognamelist": linkcheck.lognamelist})
group.add_option("--profile", action="store_true", dest="profile",
    help=_(
"""Write profiling data into a file named %s in the
current working directory. See also --viewprof.""") % _profile)
group.add_option("--viewprof", action="store_true", dest="viewprof",
    help=_(
"""Print out previously generated profiling data. See also --profile."""))
optparser.add_option_group(group)
################# checking options ##################
# Options controlling which links are checked and how.
group = optparse.OptionGroup(optparser, _("Checking options"))
group.add_option(
    "-r", "--recursion-level",
    type="int",
    dest="recursionlevel",
    help=_(
"""Check recursively all links up to given depth. A negative depth
will enable inifinite recursion. Default depth is infinite."""))
# repeatable: values are collected in options.extern
group.add_option(
    "--no-follow-url",
    type="string",
    action="append",
    dest="extern",
    help=_(
"""Check but do not recurse into URLs matching the given regex."""))
# repeatable: values are collected in options.externstrict
group.add_option(
    "--ignore-url",
    type="string",
    action="append",
    dest="externstrict",
    help=_(
"""Only check syntax of URLs matching the given regex."""))
group.add_option(
    "-C", "--cookies",
    action="store_true",
    dest="cookies",
    help=_(
"""Accept and send HTTP cookies according to RFC 2109. Only cookies
which are sent back to the originating server are accepted.
Sent and accepted cookies are provided as additional logging
information."""))
group.add_option(
    "-a", "--anchors",
    action="store_true",
    dest="anchors",
    help=_(
"""Check HTTP anchor references. Default is don't check anchors.
This option implies -w because anchor errors are always warnings."""))
group.add_option(
    "--no-anchor-caching",
    action="store_false",
    dest="anchorcaching",
    help=_(
"""Treat url#anchora and url#anchorb as equal on caching. This
is the default browser behaviour, but it's not specified in
the URI specification. Use with care."""))
group.add_option(
    "-u", "--user",
    type="string",
    dest="username",
    help=_(
"""Try given username for HTTP and FTP authorization.
For FTP the default username is 'anonymous'. See also -p."""))
group.add_option(
    "-p", "--password",
    type="string",
    dest="password",
    help=_(
"""Try given password for HTTP and FTP authorization.
For FTP the default password is 'anonymous@'. See also -u."""))
# the default timeout value is substituted into the translated help
group.add_option(
    "--timeout",
    type="int",
    dest="timeout",
    help=_(
"""Set the timeout for TCP connection attempts in seconds. The default
timeout is %d seconds.""") % default_timeout)
group.add_option(
    "-P", "--pause",
    type="int",
    dest="pause",
    help=_(
"""Pause PAUSE seconds between each url check. This option implies -t0.
Default is no pause between requests."""))
group.add_option(
    "-N", "--nntp-server",
    type="string",
    dest="nntpserver",
    help=_(
"""Specify an NNTP server for 'news:...' links. Default is the
environment variable NNTP_SERVER. If no host is given,
only the syntax of the link is checked."""))
# repeatable: values are collected in options.noproxyfor
group.add_option(
    "--no-proxy-for",
    type="string",
    action="append",
    dest="noproxyfor",
    help=_(
"""Contact hosts that match the given expression directly instead of
going through a proxy."""))
optparser.add_option_group(group)
################# auto completion #####################
if has_optcomplete:
optcomplete.autocomplete(optparser)
# read and parse command line options and arguments
(options, args) = optparser.parse_args()
# build a config object for this check session
config = linkcheck.configuration.Configuration()
# initialize logging
if options.debug:
allowed_debugs = linkcheck.lognames.keys()
for _name in options.debug:
if _name not in allowed_debugs:
print_usage(_("Invalid debug level %(level)r") % {'level': _name})
# disable psyco if debugging is enabled to prevent that stack lists
# have PsycoFrame objects instead of types.FrameType
options.psyco = False
config.init_logging(debug=options.debug)
linkcheck.log.debug(linkcheck.LOG_CMDLINE, "Python %s on %s",
sys.version, sys.platform)
# read configuration files
try:
if options.configfile:
config.read(files=[options.configfile])
else:
config.read()
except linkcheck.LinkCheckerError, msg:
# config error
print_usage(str(msg))
# apply commandline options and arguments
# constructauth is switched on further down when -u or -p is given;
# do_profile when --profile is given
constructauth = False
do_profile = False
if not options.priority:
    # without --priority the threads are set to low priority
    import linkcheck.threader
    linkcheck.threader.set_thread_priority(linkcheck.threader.PRIO_LOW)
if options.warnings is not None:
    config["warnings"] = options.warnings
if options.anchors is not None:
    # anchor checking also turns warnings on
    config["anchors"] = options.anchors
    config["warnings"] = True
if options.extern:
    # --no-follow-url patterns
    config["externlinks"].extend(
        [linkcheck.get_link_pat(arg) for arg in options.extern])
if options.externstrict:
    # --ignore-url patterns, created with strict=True
    config["externlinks"].extend(
        [linkcheck.get_link_pat(arg, strict=True)
         for arg in options.externstrict])
if options.noproxyfor:
    # try_compile_re() exits the program on an invalid expression
    config["noproxyfor"].extend(
        [try_compile_re(arg) for arg in options.noproxyfor])
# -o TYPE[/ENCODING]: select the console logger
if options.output:
    if "/" in options.output:
        logtype, encoding = options.output.split("/", 1)
    else:
        logtype, encoding = options.output, "iso-8859-15"
    # Normalize the type once so that the lookup, the 'none' test and
    # the logger creation all treat e.g. "NONE" and "none" alike.
    logtype = logtype.lower()
    if not linkcheck.Loggers.has_key(logtype):
        print_usage(_("Unknown logger type %r in %r for option %s") % \
            (logtype, options.output, "'-o, --output'"))
    # the encoding is only validated for logger types other than 'none'
    if logtype != 'none' and not has_encoding(encoding):
        print_usage(_("Unknown encoding %r in %r for option %s") % \
            (encoding, options.output, "'-o, --output'"))
    config['logger'] = config.logger_new(logtype, encoding=encoding)
if options.fileoutput:
ns = {'fileoutput': 1}
for arg in options.fileoutput:
ftype = arg
# look for (optional) filename and encoding
if '/' in ftype:
ftype, suffix = ftype.split('/', 1)
if suffix:
if has_encoding(suffix):
# it was an encoding
ns['encoding'] = suffix
elif '/' in suffix:
# look for (optional) encoding
encoding, filename = suffix.split('/', 1)
if has_encoding(encoding):
ns['encoding'] = encoding
ns['filename'] = filename
else:
ns['filename'] = suffix
else:
ns['filename'] = suffix
if not linkcheck.Loggers.has_key(ftype):
print_usage(_("Unknown logger type %r in %r for option %s") % \
(ftype, options.output, "'-F, --file-output'"))
if ftype != 'none' and 'encoding' in ns and \
not has_encoding(ns['encoding']):
print_usage(_("Unknown encoding %r in %r for option %s") % \
ns['encoding'], options.output, "'-F, --file-output'")
# generating loggers with fileoutput can throw
# an exception when opening the file
try:
logger = config.logger_new(ftype, **ns)
except OSError, msg:
print_usage(_("Illegal argument %r for option %s: %s") % \
(arg, "'-F, --file-output'", str(msg)))
config['fileoutput'].append(logger)
# Remaining command line options. Each one overrides the configuration
# only when it was given; invalid values are reported through
# print_usage(), which exits the program.
if options.interactive is not None:
    config['interactive'] = options.interactive
if options.nntpserver:
    config["nntpserver"] = options.nntpserver
if options.anchorcaching is not None:
    config["anchorcaching"] = options.anchorcaching
if options.password is not None:
    _password = options.password
    constructauth = True
if options.pause is not None:
    if options.pause >= 0:
        config["wait"] = options.pause
    else:
        print_usage(_("Illegal argument %d for option %s") % \
            (options.pause, "'-P, --pause'"))
if options.profile is not None:
    do_profile = options.profile
if options.quiet is not None:
    # -q replaces the console logger with the 'none' logger
    config['logger'] = config.logger_new('none')
if options.recursionlevel is not None:
    config["recursionlevel"] = options.recursionlevel
if options.status is not None:
    config['status'] = options.status
if options.threads is not None:
    if options.threads < 1:
        print_usage(_("Illegal argument %d for option %s") % \
            (options.threads, "'-t, --threads'"))
    config["threads"] = options.threads
if options.timeout is not None:
    if options.timeout > 0:
        socket.setdefaulttimeout(options.timeout)
    else:
        print_usage(_("Illegal argument %r for option %s") % \
            (options.timeout, "'--timeout'"))
if options.username is not None:
    _username = options.username
    constructauth = True
if options.version is not None:
    # prints the version and exits
    print_version()
if options.verbose is not None:
    if options.verbose:
        config["verbose"] = True
        config["warnings"] = True
if options.viewprof:
    # prints the profiling data and exits
    viewprof()
if options.warningregex is not None:
    config["warningregex"] = try_compile_re(options.warningregex)
    config["warnings"] = True
if options.warningsizebytes is not None:
    config["warnsizebytes"] = options.warningsizebytes
    # The option help states that this option implies -w; enable
    # warnings here as -a, -v and -W do.
    config["warnings"] = True
if options.cookies is not None:
    config['cookies'] = options.cookies
if constructauth:
    # -u and/or -p were given: add an authentication entry whose
    # pattern ".+" matches every non-empty string
    config["authentication"].append({'pattern': try_compile_re(".+"),
                                     'user': _username,
                                     'password': _password})
linkcheck.log.debug(linkcheck.LOG_CMDLINE, "configuration: %s",
    pprint.pformat(config.items()))
# warn about sitemap loggers and verbose output
klasses = [c.__class__ for c in [config['logger']] + config['fileoutput']]
if not config['verbose']:
    if linkcheck.logger.gml.GMLLogger in klasses or \
       linkcheck.logger.dot.DOTLogger in klasses:
        linkcheck.log.warn(linkcheck.LOG_CMDLINE,
            _("Using DOT or GML loggers without verbose output"
              " gives an incomplete sitemap graph."))

# interactive input: without URL arguments either ask for them or warn
if not args:
    if config['interactive']:
        urls = raw_input(
            _("enter one or more URLs, separated by white-space\n--> "))
        args = urls.split()
    else:
        linkcheck.log.warn(linkcheck.LOG_CMDLINE, _("no files or URLs given"))

# initialize the cache and the consumer model
cache = linkcheck.checker.cache.Cache()
consumer = linkcheck.checker.consumer.Consumer(config, cache)

for url in args:
    # syntactic sugar: complete the scheme for "www." and "ftp." prefixes
    if url.lower().startswith("www."):
        url = "http://%s" % url
    else:
        if url.lower().startswith("ftp."):
            url = "ftp://%s" % url
    url_data = linkcheck.checker.get_url_from(url, 0, consumer, cmdline=True)
    # add to consumer queue
    consumer.append_url(url_data)
############################# check the URLs ################################
# The queued URLs are checked exactly once: under the profiler when
# --profile is usable, otherwise directly (with psyco if enabled and
# installed in a sufficient version).
if do_profile and not has_profile:
    linkcheck.log.warn(linkcheck.LOG_CMDLINE,
        _("The `profile' Python module is not installed,"
          " therefore the --profile option is disabled."))
    # fall back to a normal, unprofiled run
    do_profile = False
if do_profile:
    run = True
    if os.path.exists(_profile):
        question = _("Overwrite profiling file %r?\n"
            "Press Ctrl-C to cancel, RETURN to continue.") % _profile
        try:
            raw_input(question)
        except KeyboardInterrupt:
            # encode the translated text like print_usage() does
            sys.stderr.write(os.linesep)
            sys.stderr.write(encode(_("Canceled.")))
            sys.stderr.write(os.linesep)
            run = False
    if run:
        profile.run("linkcheck.checker.check_urls(consumer)", _profile)
else:
    if options.psyco:
        try:
            import psyco
            # psyco >= 1.4.0 final is needed
            if psyco.__version__ >= 0x10400f0:
                psyco.profile()
            else:
                # warn about old psyco version
                linkcheck.log.warn(linkcheck.LOG_CMDLINE,
                    _("Psyco is installed but not used since the version is too old.\n"
                      "Psyco >= 1.4 is needed."))
        except ImportError:
            # no psyco available, just ignore
            pass
    # the check itself must run regardless of psyco being used or not
    linkcheck.checker.check_urls(consumer)
#############################################################################
# interactive input end
if config['interactive']:
    raw_input(_("Hit RETURN to finish"))
# if errors or warnings are encountered, exit with non-zero status
if consumer.logger.errors or \
   (consumer.logger.warnings and config['warnings']):
    sys.exit(1)