linkchecker/linkchecker
2004-08-31 21:38:21 +00:00

509 lines
20 KiB
Python
Executable file

#!/usr/bin/python -O
# -*- coding: iso-8859-1 -*-
"""check HTML pages for broken links"""
# Copyright (C) 2000-2004 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# imports and checks
import sys
# Abort early on interpreters older than Python 2.3. The hasattr() test
# covers interpreters that do not provide sys.version_info at all.
if not hasattr(sys, 'version_info') or \
   sys.version_info < (2, 3, 0, 'final', 0):
    raise SystemExit, "This program requires Python 2.3 or later."
import getopt
import re
import os
import pprint
import socket
import optparse
# set default 60 seconds timeout
# NOTE(review): this runs before the linkcheck imports below; presumably
# intentional so the timeout is already active while they load - confirm
# before reordering.
default_timeout = 60
socket.setdefaulttimeout(default_timeout)
import linkcheck
import linkcheck.log
import linkcheck.optcomplete
import linkcheck.checker
import linkcheck.checker.cache
import linkcheck.checker.consumer
import linkcheck.configuration
# gettext-style translation function used for all user-visible strings
from linkcheck.i18n import _
# default profiling filename (written by --profile, read by --viewprof)
_profile = "linkchecker.prof"
# credentials given with -u/--user and -p/--password; None until supplied
_username = None
_password = None
# main usage text
# Usage is what LCOptionParser.get_usage() returns; Notes and Examples are
# appended after the generated option help in LCOptionParser.print_help().
# All three are passed through _() and are therefore translation keys:
# do not reformat the text without updating the message catalogs.
Usage = _("""USAGE\tlinkchecker [options] file-or-url...
""")
# General remarks shown below the option list.
Notes = _("""NOTES
o A ! before a regular expression negates it. So '!^mailto:' matches
everything but a mailto link.
o URLs on the command line starting with "ftp." are treated like
"ftp://ftp.", URLs starting with "www." are treated like "http://www.".
You can also give local files as arguments.
o If you have your system configured to automatically establish a
connection to the internet (e.g. with diald), it will connect when
checking links not pointing to your local system.
See the --extern-strict-all option on how to prevent this.
o Javascript links are currently ignored.
o If your platform does not support threading, LinkChecker disables it
automatically.
o You can supply multiple user/password pairs in a configuration file.
o To use proxies set $http_proxy, $https_proxy, $ftp_proxy, $gopher_proxy
on Unix or Windows.
On a Mac use the Internet Config.
o When checking 'news:' links the given NNTP host doesn't need to be the
same as the host of the user browsing your pages!
""")
# Example invocations. This is a raw string so the backslashes of the
# Windows path example are kept literally.
Examples = _(r"""EXAMPLES
The most common use checks the given domain recursively, plus any
single URL pointing outside of the domain:
linkchecker http://treasure.calvinsplayground.de/
Beware that this checks the whole site which can have several hundred
thousands URLs. Use the -r option to restrict the recursion depth.
Don't connect to mailto: hosts, only check their URL syntax. All other
links are checked as usual:
linkchecker --intern='!^mailto:' --extern-strict-all www.mysite.org
Checking a local HTML file on Unix:
linkchecker ../bla.html
Checking a local HTML file on Windows:
linkchecker c:\temp\test.html
You can skip the "http://" url part if the domain starts with "www.":
linkchecker www.myhomepage.de
You can skip the "ftp://" url part if the domain starts with "ftp.":
linkchecker -r0 ftp.linux.org
""")
def printVersion ():
"""print the program version and exit"""
print linkcheck.configuration.AppInfo
sys.exit(0)
def printUsage (msg):
    """Report a command line error on stderr and exit with status 1.

    msg is inserted into the translated "Error: %s" template. A second
    line tells the user how to get the help text. Each line is terminated
    with os.linesep.
    """
    err = sys.stderr
    lines = (
        _("Error: %s") % msg,
        _("Execute 'linkchecker -h' for help"),
    )
    for text in lines:
        err.write(text)
        err.write(os.linesep)
    sys.exit(1)
def viewprof ():
    """Show previously recorded profiling data and exit.

    Reads the file named by the module-level _profile, prints the 50
    entries with the highest cumulative time and exits with status 0.
    If the file is missing, an explanation is written to stderr and the
    exit status is 1.
    """
    if os.path.exists(_profile):
        # pstats is only needed here, so it is imported on demand
        import pstats
        report = pstats.Stats(_profile)
        report.strip_dirs()
        report.sort_stats("cumulative")
        report.print_stats(50)
        sys.exit(0)
    err = sys.stderr
    err.write(_("Could not find profiling file %s.")%_profile)
    err.write(os.linesep)
    err.write(_("Please run linkchecker with --profile to generate it."))
    err.write(os.linesep)
    sys.exit(1)
class LCHelpFormatter (optparse.IndentedHelpFormatter):
    """Help formatter that wraps option help with linkcheck.strformat.wrap
    and keeps the line breaks that function produces."""

    def format_option (self, option):
        """Return the formatted help entry for a single option.

        The entry consists of the option strings (for example "-x" or
        "-fFILENAME, --file=FILENAME") followed by the help text.
        If the option strings fit into the option column, the help text
        starts on the same line:
          -x    turn on expert mode
        Otherwise the help text starts on the next line, indented to the
        help column:
          -fFILENAME, --file=FILENAME
                read data from FILENAME

        option is the optparse option to format; its rendered strings
        must have been stored by the parser beforehand. The result always
        ends with a newline.
        """
        try:
            # newer optparse versions keep the rendered option strings in
            # a mapping on the formatter
            opts = self.option_strings[option]
        except (AttributeError, KeyError):
            # Python 2.3 optparse stores them on the option itself
            opts = option.option_strings
        opt_width = self.help_position - self.current_indent - 2
        if len(opts) > opt_width:
            # option strings too long: help text goes on its own line
            opts = "%*s%s\n" % (self.current_indent, "", opts)
            indent_first = self.help_position
        else:
            # start help on same line as opts; the two separating blanks
            # are the two columns reserved in opt_width, so the first help
            # line lines up with the continuation lines
            opts = "%*s%-*s  " % (self.current_indent, "", opt_width, opts)
            indent_first = 0
        result = [opts]
        help_lines = []
        if option.help:
            text = linkcheck.strformat.wrap(option.help, self.help_width)
            help_lines = text.splitlines()
        if help_lines:
            result.append("%*s%s\n" % (indent_first, "", help_lines[0]))
            result.extend(["%*s%s\n" % (self.help_position, "", line)
                           for line in help_lines[1:]])
        elif not opts.endswith("\n"):
            # no help text at all: just terminate the option line
            result.append("\n")
        return "".join(result)
class LCOptionParser (optparse.OptionParser, object):
"""option parser with custom help text layout"""
def __init__ (self):
# use our own formatter
super(LCOptionParser, self).__init__(formatter=LCHelpFormatter())
def error (self, msg):
"""print usage info and given message"""
printUsage(msg)
def get_usage (self):
"""return translated usage text"""
return Usage
def print_help (self, file=None):
"""print translated help text"""
s = "%s\n%s\n%s"%(self.format_help(), Notes, Examples)
if os.name!='posix':
linkcheck.StringUtil.paginate(s)
else:
print s
sys.exit(0)
# instantiate option parser and configure options
# Every option group is registered with the parser exactly once; help
# texts are translated with _() before any %-interpolation is applied.
optparser = LCOptionParser()
################# general options ##################
group = optparse.OptionGroup(optparser, _("General options"))
group.add_option("-f", "--config", type="string", dest="configfile",
                 help=_(
"""Use file as configuration file. As default LinkChecker first
searches /etc/linkcheckerrc and then ~/.linkcheckerrc
(under Windows <path-to-program>\\linkcheckerrc)."""))
group.add_option("-I", "--interactive", action="store_true", dest="interactive",
                 help=_(
"""Ask for url if none are given on the commandline."""))
group.add_option("-t", "--threads", type="int", dest="threads",
                 help=_(
"""Generate no more than num threads. Default number of threads is 10."""))
group.add_option("-V", "--version", action="store_true", dest="version",
                 help=_(
"""Print version and exit."""))
optparser.add_option_group(group)
################# output options ##################
group = optparse.OptionGroup(optparser, _("Output options"))
group.add_option("-v", "--verbose", action="store_true", dest="verbose",
                 help=_(
"""Log all checked URLs (implies -w). Default is to log only invalid
URLs."""))
group.add_option("-w", "--warnings", action="store_true", dest="warnings",
                 help=_("""Log warnings."""))
group.add_option("-W", "--warning-regex", type="string", dest="warningregex",
                 help=_(
"""Define a regular expression which prints a warning if it matches
any content of the checked link. This applies only to valid pages,
so we can get their content.
Use this to check for pages that contain some form of error
message, for example 'This page has moved' or 'Oracle
Application Server error'. This option implies -w."""))
# the value is a byte count, so let optparse convert and validate it
group.add_option("--warning-size-bytes", type="int", dest="warningsizebytes",
                 help=_(
"""Print a warning if content size is available and exceeds the given
number of bytes. This option implies -w."""))
group.add_option("-q", "--quiet", action="store_true", dest="quiet",
                 help=_(
"""Quiet operation. This is only useful with -F."""))
group.add_option("-o", "--output", type="string", dest="output",
                 help=_(
"""Specify output as %(loggertypes)s. Default output type is text.""") % \
                 {'loggertypes': linkcheck.LoggerKeys})
group.add_option("-F", "--file-output", type="string", action="append",
                 dest="fileoutput", metavar="TYPE[/FILENAME]",
                 help=_(
"""Output to a file linkchecker-out.TYPE, $HOME/.linkchecker_blacklist for
'blacklist' output, or FILENAME if specified.
The FILENAME part of the 'none' output type will be ignored,
else if the file already exists, it will be overwritten.
You can specify this option more than once. Valid file output TYPEs
are %(loggertypes)s. You can specify this option multiple times to output
to more than one file. Default is no file output.""") % \
                 {'loggertypes': linkcheck.LoggerKeys})
group.add_option("--no-status", action="store_false", dest="status",
                 default=True, help=_(
"""Do not print check status messages."""))
# no explicit dest: optparse derives "debug" from the long option name
group.add_option("-D", "--debug", type="string", action="append",
                 metavar="LOGGER",
                 help=_("""Print debugging output for given logger.
Available loggers are %(lognamelist)s.
Specifying 'all' is an alias for specifying all available loggers.
The option can be given multiple times to debug with more
than one logger.
For accurate results, threading will be disabled during debug runs.""") %\
                 {"lognamelist": linkcheck.lognamelist})
group.add_option("--profile", action="store_true", dest="profile",
                 help=_(
"""Write profiling data into a file named %s in the
current working directory. See also --viewprof.""")%_profile)
group.add_option("--viewprof", action="store_true", dest="viewprof",
                 help=_(
"""Print out previously generated profiling data. See also --profile."""))
optparser.add_option_group(group)
################# checking options ##################
group = optparse.OptionGroup(optparser, _("Checking options"))
group.add_option("-r", "--recursion-level", type="int", dest="recursionlevel",
                 help=_(
"""Check recursively all links up to given depth. A negative depth
will enable inifinite recursion. Default depth is infinite."""))
# NOTE(review): the help text below starts with " regex, --intern=regex",
# which looks like a leftover from an older usage text - confirm and drop.
group.add_option("-i", "--intern", type="string", action="append", dest="intern",
                 help=_(
""" regex, --intern=regex
Assume URLs that match the given expression as internal.
LinkChecker descends recursively only to internal URLs, not to
external."""))
group.add_option("-e", "--extern", type="string", action="append", dest="extern",
                 help=_(
"""Assume urls that match the given expression as external.
Only internal HTML links are checked recursively."""))
group.add_option("-s", "--extern-strict-all", action="store_true",
                 dest="externstrictall", help=_(
"""Check only syntax of external links, do not try to connect to them.
For local file urls, only local files are internal. For
http and ftp urls, all urls at the same domain name are internal."""))
group.add_option("-d", "--denyallow", action="store_true", dest="denyallow",
                 help=_(
"""Swap checking order to external/internal. Default checking order
is internal/external."""))
group.add_option("-C", "--cookies", action="store_true", dest="cookies",
                 help=_(
"""Accept and send HTTP cookies according to RFC 2109. Only cookies
which are sent back to the originating server are accepted.
Sent and accepted cookies are provided as additional logging
information."""))
group.add_option("-a", "--anchors", action="store_true", dest="anchors",
                 help=_(
"""Check HTTP anchor references. This option applies to both internal
and external urls. Default is don't check anchors.
This option implies -w because anchor errors are always warnings."""))
group.add_option("--no-anchor-caching", action="store_false", dest="anchorcaching",
                 help=_(
"""Treat url#anchora and url#anchorb as equal on caching. This
is the default browser behaviour, but it's not specified in
the URI specification. Use with care."""))
# The user name is quoted plainly: the former roff markup (\fB...\fP) put
# form feed characters into the help text of this non-raw string.
group.add_option("-u", "--user", type="string", dest="username",
                 help=_(
"""Try given username for HTTP and FTP authorization.
For FTP the default username is 'anonymous'. See also -p."""))
group.add_option("-p", "--password", type="string", dest="password",
                 help=_(
"""Try given password for HTTP and FTP authorization.
For FTP the default password is 'anonymous@'. See also -u."""))
group.add_option("--timeout", type="int", dest="timeout",
                 help=_(
"""Set the timeout for TCP connection attempts in seconds. The default
timeout is %d seconds.""") % default_timeout)
group.add_option("-P", "--pause", type="int", dest="pause",
                 help=_(
"""Pause <secs> seconds between each url check. This option implies -t0.
Default is no pause between requests."""))
group.add_option("-N", "--nntp-server", type="string", dest="nntpserver",
                 help=_(
"""Specify an NNTP server for 'news:...' links. Default is the
environment variable NNTP_SERVER. If no host is given,
only the syntax of the link is checked."""))
optparser.add_option_group(group)
################# deprecated options ##################
# Deprecated options get a group of their own. Reusing the previous group
# object registered "Checking options" twice, so it was printed twice in
# the help output.
group = optparse.OptionGroup(optparser, _("Deprecated options"))
group.add_option("--status", action="store_true", dest="status",
                 help=_(
"""Print check status every 5 seconds to stderr. When --debug is
given, the status check will only be printed after each checked url.
This is due to disabled threading during a debug run."""))
optparser.add_option_group(group)
################# auto completion #####################
linkcheck.optcomplete.autocomplete(optparser)
# undocumented option, handled before the regular option parsing
if "--wischiwaschi" in sys.argv:
    import linkcheck.util1
    linkcheck.util1.abbuzze()
    sys.exit(0)
# read and parse command line options and arguments
(options, args) = optparser.parse_args()
# build a config object for this check session
config = linkcheck.configuration.Configuration()
# init logging
config.init_logging(debug=options.debug)
linkcheck.log.debug(linkcheck.LOG_CMDLINE, "Python %s on %s",
                    sys.version, sys.platform)
# read configuration from config files
configfiles = []
if options.configfile:
    configfiles.append(options.configfile)
config.read(configfiles)
# apply commandline options and arguments; values given on the command
# line override what was read from the configuration files
constructauth = False
do_profile = False
if options.anchors is not None:
    config["anchors"] = options.anchors
    # anchor problems are reported as warnings, so enable them
    config["warnings"] = True
if options.extern:
    pats = [linkcheck.get_link_pat(arg) for arg in options.extern]
    config["externlinks"].extend(pats)
if options.output:
    if linkcheck.Loggers.has_key(options.output):
        config['logger'] = config.logger_new(options.output)
    else:
        printUsage(_("Illegal argument %r for option %s") % \
                   (options.output, "'-o, --output'"))
if options.fileoutput:
    for spec in options.fileoutput:
        # Use fresh logger arguments for every -F option. With a shared
        # dict the FILENAME of one option leaked into all following
        # options that did not name a file themselves.
        ns = {'fileoutput': 1}
        if '/' in spec:
            ftype, filename = spec.split('/', 1)
            # an empty FILENAME ("TYPE/") counts as not specified
            if filename:
                ns['filename'] = filename
        else:
            ftype = spec
        if linkcheck.Loggers.has_key(ftype):
            config['fileoutput'].append(config.logger_new(ftype, **ns))
        else:
            printUsage(_("Illegal argument %r for option %s") % \
                       (ftype, "'-F, --file-output'"))
if options.interactive is not None:
    config['interactive'] = options.interactive
if options.intern:
    pats = [linkcheck.get_link_pat(arg) for arg in options.intern]
    config["internlinks"].extend(pats)
if options.denyallow is not None:
    config["denyallow"] = options.denyallow
if options.nntpserver:
    config["nntpserver"] = options.nntpserver
if options.anchorcaching is not None:
    config["anchorcaching"] = options.anchorcaching
if options.password is not None:
    _password = options.password
    constructauth = True
if options.pause is not None:
    if options.pause >= 0:
        config["wait"] = options.pause
    else:
        printUsage(_("Illegal argument %d for option %s") % \
                   (options.pause, "'-P, --pause'"))
if options.profile is not None:
    do_profile = options.profile
if options.quiet is not None:
    config["quiet"] = options.quiet
if options.recursionlevel is not None:
    config["recursionlevel"] = options.recursionlevel
if options.externstrictall is not None:
    config["externstrictall"] = options.externstrictall
if options.status is not None:
    config['status'] = options.status
if options.threads is not None:
    if options.threads < 1:
        # printUsage() exits, so set_threads() is not reached in this case
        printUsage(_("Illegal argument %d for option %s") % \
                   (options.threads, "'-t, --threads'"))
    config.set_threads(options.threads)
if options.timeout is not None:
    if options.timeout > 0:
        socket.setdefaulttimeout(options.timeout)
    else:
        printUsage(_("Illegal argument %r for option %s") % \
                   (options.timeout, "'--timeout'"))
if options.username is not None:
    _username = options.username
    constructauth = True
if options.version is not None:
    printVersion()
if options.verbose is not None:
    if options.verbose:
        config["verbose"] = True
        config["warnings"] = True
if options.viewprof:
    viewprof()
if options.warnings is not None:
    config["warnings"] = options.warnings
if options.warningregex is not None:
    try:
        config["warningregex"] = re.compile(options.warningregex)
    except re.error:
        # report an invalid pattern as usage error instead of a traceback
        printUsage(_("Illegal argument %r for option %s") % \
                   (options.warningregex, "'-W, --warning-regex'"))
    config["warnings"] = True
if options.warningsizebytes is not None:
    config["warnsizebytes"] = options.warningsizebytes
if options.cookies is not None:
    config['cookies'] = options.cookies
if constructauth:
    # credentials from the command line apply to every URL
    config["authentication"].append({'pattern': re.compile(".+"),
                                     'user': _username,
                                     'password': _password})
linkcheck.log.debug(linkcheck.LOG_CMDLINE, "configuration: %s",
                    pprint.pformat(config.items()))
# interactive input
if not args:
    if config['interactive']:
        urls = raw_input(_("enter one or more urls, separated by white-space\n--> "))
        args = urls.split()
    else:
        linkcheck.log.warn(linkcheck.LOG_CMDLINE, _("no files or urls given"))
cache = linkcheck.checker.cache.Cache()
consumer = linkcheck.checker.consumer.Consumer(config, cache)
# syntactic sugar: complete URLs given without a scheme
for url in args:
    if url.lower().startswith("www."):
        url = "http://%s"%url
    elif url.lower().startswith("ftp."):
        url = "ftp://%s"%url
    url_data = linkcheck.checker.get_url_from(url, 0, consumer, cmdline=True)
    consumer.append_url(url_data)
############################# check the urls ################################
if do_profile:
    import profile
    # the statement string refers to the module-level name "consumer"
    profile.run("linkcheck.checker.check_urls(consumer)", _profile)
else:
    # do not use psyco, at the moment (Oct 2003) it has bugs causing
    # infinite loops when threads are enabled, and psyco disables
    # the Ctrl-C break button of the Python interpreter.
    linkcheck.checker.check_urls(consumer)
#############################################################################
# interactive input end
if config['interactive']:
    raw_input(_("Hit RETURN to finish"))
# if errors are encountered, exit with non-zero status
# same applies to warnings when --warnings options was given
if consumer.errors or (consumer.warnings and config['warnings']):
    sys.exit(1)