mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-19 07:20:26 +00:00
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3474 e7d03fd6-7b0d-0410-9947-9c21f3af8025
764 lines
29 KiB
Python
Executable file
764 lines
29 KiB
Python
Executable file
#!/usr/bin/python2.4
|
|
# -*- coding: iso-8859-1 -*-
|
|
# Copyright (C) 2000-2006 Bastian Kleineidam
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program; if not, write to the Free Software
|
|
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
"""
|
|
Check HTML pages for broken links.
|
|
"""
|
|
|
|
import sys
|
|
import codecs
|
|
import re
|
|
import os
|
|
import pprint
|
|
import socket
|
|
import optparse
|
|
|
|
import linkcheck
|
|
# override optparse gettext method with the one from linkcheck.init_i18n()
|
|
optparse._ = _
|
|
# now import the rest of the linkchecker gang
|
|
import linkcheck.log
|
|
import linkcheck.i18n
|
|
import linkcheck.checker
|
|
import linkcheck.director
|
|
import linkcheck.configuration
|
|
import linkcheck.fileutil
|
|
import linkcheck.strformat
|
|
# optional modules
|
|
has_optcomplete = linkcheck.fileutil.has_module("optcomplete")
|
|
has_profile = linkcheck.fileutil.has_module("profile")
|
|
has_pstats = linkcheck.fileutil.has_module("pstats")
|
|
|
|
# default profiling filename; written by --profile and read by --viewprof
_profile = "linkchecker.prof"
# credentials taken from the -u/--user and -p/--password options; they stay
# None when the respective option is not given
_username = None
_password = None
|
|
|
|
# main usage text; returned verbatim by LCOptionParser.get_usage()
Usage = _("""USAGE\tlinkchecker [options] file-or-url...
""")

# The following texts are appended to the generated option help by
# LCOptionParser.print_help().

# general remarks
Notes = _("""NOTES
o URLs on the command line starting with "ftp." are treated like
"ftp://ftp.", URLs starting with "www." are treated like "http://www.".
You can also give local files as arguments.
o If you have your system configured to automatically establish a
connection to the internet (e.g. with diald), it will connect when
checking links not pointing to your local system.
See the --ignore-url option on how to prevent this.
o Javascript links are currently ignored.
o If your platform does not support threading, LinkChecker disables it
automatically.
o You can supply multiple user/password pairs in a configuration file.
o When checking 'news:' links the given NNTP host doesn't need to be the
same as the host of the user browsing your pages.
""")

# how to configure a proxy through environment variables
ProxySupport = _("""PROXY SUPPORT
To use a proxy set $http_proxy, $https_proxy, $ftp_proxy, $gopher_proxy
on Unix or Windows to the proxy URL. The URL should be of the form
"http://[<user>:<pass>@]<host>[:port]", for example
"http://localhost:8080", or "http://joe:test@proxy.domain".
On a Mac use the Internet Config to select a proxy.
""")

# syntax of the REGEX option arguments
RegularExpressions = _("""REGULAR EXPRESSIONS
Only Python regular expressions are accepted by LinkChecker.
See http://www.amk.ca/python/howto/regex/ for an introduction in
regular expressions.

The only addition is that a leading exclamation mark negates
the regular expression.
""")

# format of the file given with --cookiefile
CookieFormat = _("""COOKIE FILES
A cookie file contains standard RFC 805 header data with the following
possible names:
Scheme (optional)
Sets the scheme the cookies are valid for; default scheme is 'http'.
Host (required)
Sets the domain the cookies are valid for.
Path (optional)
Gives the path the cookies are value for; default path is '/'.
Set-cookie (optional)
Set cookie name/value. Can be given more than once.

Multiple entries are separated by a blank line.

The example below will send two cookies to all URLs starting with
'http://imadoofus.org/hello/' and one to all URLs starting
with 'https://imaweevil.org/':

Host: imadoofus.org
Path: /hello
Set-cookie: ID="smee"
Set-cookie: spam="egg"

Scheme: https
Host: imaweevil.org
Set-cookie: baggage="elitist"; comment="hologram"
""")

# meaning of the process exit status
Retval = _(r"""RETURN VALUE
The return value is non-zero when
o invalid links were found or
o warnings were found warnings are enabled
o a program error occurred
""")

# command line examples; a raw string because of the Windows path
Examples = _(r"""EXAMPLES
The most common use checks the given domain recursively, plus any
single URL pointing outside of the domain:
linkchecker http://treasure.calvinsplayground.de/
Beware that this checks the whole site which can have several hundred
thousands URLs. Use the -r option to restrict the recursion depth.

Don't connect to mailto: hosts, only check their URL syntax. All other
links are checked as usual:
linkchecker --ignore-url=^mailto: www.mysite.org

Checking a local HTML file on Unix:
linkchecker ../bla.html

Checking a local HTML file on Windows:
linkchecker c:\temp\test.html

You can skip the "http://" url part if the domain starts with "www.":
linkchecker www.myhomepage.de

You can skip the "ftp://" url part if the domain starts with "ftp.":
linkchecker -r0 ftp.linux.org
""")

# description of the output types accepted by -o and -F
LoggerTypes = _(r"""OUTPUT TYPES
Note that by default only errors and warnings are logged.
You should use the --verbose option to get the complete URL list,
especially when outputting a sitemap graph format.

text Standard text output, logging URLs in keyword: argument fashion.
html Log URLs in keyword: argument fashion, formatted as HTML.
Additionally has links to the referenced pages. Invalid URLs have
HTML and CSS syntax check links appended.
csv Log check result in CSV format with one URL per line.
gml Log parent-child relations between linked URLs as a GML sitemap
graph.
dot Log parent-child relations between linked URLs as a DOT sitemap
graph.
gxml Log check result as a GraphXML sitemap graph.
xml Log check result as machine-readable XML.
sql Log check result as SQL script with INSERT commands. An example
script to create the initial SQL table is included as create.sql.
blacklist
Suitable for cron jobs. Logs the check result into a file
~/.linkchecker/blacklist which only contains entries with invalid
URLs and the number of times they have failed.
none Logs nothing. Suitable for debugging or checking the exit code.
""")

# header of the warning list; one entry per known warning is appended below
Warnings = _(r"""IGNORE WARNINGS
The following warnings are recognized in the ignorewarnings config
file entry:
""")
# add tag and description of every warning in linkcheck.checker.Warnings
for tag, desc in linkcheck.checker.Warnings.items():
    Warnings += u" o %s\n %s\n" % (tag, desc)
|
|
|
|
def encode (s, codec="iso8859-15"):
    """
    Return s converted to a byte string in the given codec, suitable
    for printing on the screen.

    Characters that the codec cannot represent are dropped.
    """
    encoded = s.encode(codec, "ignore")
    return encoded
|
|
|
|
|
|
def print_version ():
|
|
"""
|
|
Print the program version and exit.
|
|
"""
|
|
print encode(linkcheck.configuration.AppInfo)
|
|
sys.exit(0)
|
|
|
|
|
|
def print_usage (msg):
    """
    Report msg as an error on stderr, followed by a hint on how to get
    help, then terminate the program with exit status 1.
    """
    lines = (
        _("Error: %s") % msg,
        _("Execute 'linkchecker -h' for help"),
    )
    for line in lines:
        print >>sys.stderr, encode(line)
    sys.exit(1)
|
|
|
|
|
|
def check_user ():
    """
    Drop root privileges on POSIX systems.

    Does nothing on non-POSIX platforms or when the effective user is
    not root. Otherwise a warning is logged and the effective user id
    is switched to that of the 'nobody' account.

    Raises KeyError if the system has no 'nobody' account.
    """
    if os.name != 'posix':
        return
    if os.geteuid() == 0:
        linkcheck.log.warn(linkcheck.LOG_CMDLINE,
                           _("Running as root, dropping to nobody."))
        import pwd
        # A passwd entry is (name, passwd, uid, gid, ...). seteuid() needs
        # the user id; index 3 used previously is the group id.
        os.seteuid(pwd.getpwnam('nobody').pw_uid)
|
|
|
|
|
|
def viewprof ():
    """
    Show the statistics stored by an earlier --profile run and
    terminate the program.

    Exits with status 1 when the pstats module or the profiling file
    is missing, else prints the 100 top entries sorted by cumulative
    time and exits with status 0.
    """
    if not has_pstats:
        reason = _("The `pstats' Python module is not installed,"
                   " therefore the --viewprof option is disabled.")
        linkcheck.log.error(linkcheck.LOG_CMDLINE, reason)
        sys.exit(1)
    if not os.path.isfile(_profile):
        missing = _("Could not find profiling file %r.") % _profile
        linkcheck.log.warn(linkcheck.LOG_CMDLINE, missing)
        hint = _("Please run linkchecker with --profile to generate it.")
        print >>sys.stderr, hint
        sys.exit(1)
    import pstats
    stats = pstats.Stats(_profile)
    stats = stats.strip_dirs()
    stats = stats.sort_stats("cumulative")
    stats.print_stats(100)
    sys.exit(0)
|
|
|
|
|
|
def try_compile_re (arg):
    """
    Return the compiled regular expression for arg.

    If arg is not a valid regular expression, log an error message
    and terminate the program with exit status 1.
    """
    try:
        pattern = re.compile(arg)
    except re.error:
        details = {"arg": arg, "msg": sys.exc_info()[1]}
        linkcheck.log.error(linkcheck.LOG_CMDLINE,
            _("Syntax error in %(arg)r: %(msg)s") % details)
        sys.exit(1)
    return pattern
|
|
|
|
|
|
def has_encoding (encoding):
    """
    Return True if the codecs registry knows the given encoding name,
    else False.
    """
    try:
        codecs.lookup(encoding)
    except LookupError:
        return False
    return True
|
|
|
|
|
|
class LCHelpFormatter (optparse.IndentedHelpFormatter):
    """
    Help formatter indenting paragraph-wise.

    Differs from the base class in that the option help text is wrapped
    with linkcheck.strformat.wrap().
    """

    def format_option (self, option):
        """
        Return the formatted help entry for the given option as a
        string ending with a newline.

        The option strings are looked up in self.option_strings. The
        help text, if any, is wrapped to self.help_width and indented to
        self.help_position.
        """
        # The help for each option consists of two parts:
        #   * the opt strings and metavars
        #     eg. ("-x", or "-fFILENAME, --file=FILENAME")
        #   * the user-supplied help string
        #     eg. ("turn on expert mode", "read data from FILENAME")
        #
        # If possible, we write both of these on the same line:
        #   -x      turn on expert mode
        #
        # But if the opt string list is too long, we put the help
        # string on a second line, indented to the same column it would
        # start in if it fit on the first line.
        #   -fFILENAME, --file=FILENAME
        #           read data from FILENAME
        result = []
        opts = self.option_strings[option]
        # two columns are reserved for the gap between options and help
        opt_width = self.help_position - self.current_indent - 2
        if len(opts) > opt_width:
            opts = "%*s%s\n" % (self.current_indent, "", opts)
            indent_first = self.help_position
        else:
            # Start help on same line as opts. The two trailing spaces fill
            # the gap reserved in opt_width, so the first help line starts
            # at help_position like the continuation lines.
            opts = "%*s%-*s  " % (self.current_indent, "", opt_width, opts)
            indent_first = 0
        result.append(opts)
        if option.help:
            text = linkcheck.strformat.wrap(option.help, self.help_width)
            help_lines = text.splitlines()
            result.append("%*s%s\n" % (indent_first, "", help_lines[0]))
            result.extend(["%*s%s\n" % (self.help_position, "", line)
                           for line in help_lines[1:]])
        elif opts[-1] != "\n":
            result.append("\n")
        return "".join(result)
|
|
|
|
|
|
class LCOptionParser (optparse.OptionParser, object):
|
|
"""
|
|
Option parser with custom help text layout.
|
|
"""
|
|
|
|
def __init__ (self):
|
|
"""
|
|
Initializing using our own help formatter class.
|
|
"""
|
|
super(LCOptionParser, self).__init__(formatter=LCHelpFormatter())
|
|
|
|
def error (self, msg):
|
|
"""
|
|
Print usage info and given message.
|
|
"""
|
|
print_usage(msg)
|
|
|
|
def get_usage (self):
|
|
"""
|
|
Return translated usage text.
|
|
"""
|
|
return Usage
|
|
|
|
def print_help (self, file=None):
|
|
"""
|
|
Print translated help text.
|
|
"""
|
|
s = u"%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s" % (self.format_help(),
|
|
Examples, LoggerTypes, RegularExpressions, CookieFormat,
|
|
ProxySupport, Notes, Retval, Warnings)
|
|
s = s.encode(linkcheck.i18n.default_encoding, "replace")
|
|
if os.name != 'posix':
|
|
linkcheck.strformat.paginate(s)
|
|
else:
|
|
print s
|
|
sys.exit(0)
|
|
|
|
# instantiate option parser and configure options
optparser = LCOptionParser()

# build a config object for this check session
config = linkcheck.configuration.Configuration()

################# general options ##################
# Options without an explicit default are None after parsing; the code
# applying the options relies on this to detect options that were not given.
group = optparse.OptionGroup(optparser, _("General options"))
group.add_option("-f", "--config", type="string", dest="configfile",
                 metavar="FILENAME",
                 help=_(
"""Use FILENAME as configuration file. Per default LinkChecker first
searches /etc/linkchecker/linkcheckerrc and then ~/.linkchecker/linkcheckerrc
(under Windows <path-to-program>\\linkcheckerrc)."""))
group.add_option("-I", "--interactive", action="store_true",
                 dest="interactive", help=_(
"""Ask for URL if none are given on the commandline."""))
group.add_option("-t", "--threads", type="int", dest="threads",
                 metavar="NUMBER",
                 help=_(
"""Generate no more than the given number of threads. Default number
of threads is 10. To disable threading specify a non-positive number."""))
group.add_option("--priority", action="store_true", dest="priority",
                 help=_(
"""Run with normal thread scheduling priority. Per default LinkChecker
runs with low thread priority to be suitable as a background job."""))
group.add_option("-V", "--version", action="store_true", dest="version",
                 help=_("""Print version and exit."""))
group.add_option("--allow-root", action="store_true", dest="allowroot",
                 default=False, help=_(
"""Do not drop privileges when running as root user on Unix systems."""))
optparser.add_option_group(group)
|
|
|
|
################# output options ##################
# first part of the output option group; further options are added to
# the same group object below
group = optparse.OptionGroup(optparser, _("Output options"))
group.add_option("-v", "--verbose", action="store_true", dest="verbose",
                 help=_(
"""Log all checked URLs. Default is to log only errors and warnings."""))
# store_false: options.warnings is None unless --no-warnings is given
group.add_option("--no-warnings", action="store_false", dest="warnings",
                 help=_("""Don't log warnings. Default is to log warnings."""))
group.add_option("-W", "--warning-regex", type="string", dest="warningregex",
                 metavar="REGEX",
                 help=_(
"""Define a regular expression which prints a warning if it matches
any content of the checked link. This applies only to valid pages,
so we can get their content.

Use this to check for pages that contain some form of error
message, for example 'This page has moved' or 'Oracle
Application Server error'."""))
|
|
# The value is a byte count, so let optparse convert and validate it as an
# integer. Without an explicit type the option value would be stored as a
# string.
group.add_option("--warning-size-bytes", type="int", dest="warningsizebytes",
                 metavar="NUMBER",
                 help=_(
"""Print a warning if content size info is available and exceeds the
given number of bytes."""))
|
|
# remaining options of the output option group
group.add_option("-q", "--quiet", action="store_true", dest="quiet",
                 help=_(
"""Quiet operation, an alias for '-o none'.
This is only useful with -F."""))
# the list of valid logger types is filled into the help text
group.add_option("-o", "--output", type="string", dest="output",
                 metavar="TYPE[/ENCODING]",
                 help=_(
"""Specify output as %(loggertypes)s. Default output type is text.
The ENCODING specifies the output encoding, the default is that of your
locale.
Valid encodings are listed at http://docs.python.org/lib/standard-encodings.html.""") % \
                 {'loggertypes': linkcheck.LoggerKeys})
# action "append": options.fileoutput is a list with one entry per -F
group.add_option("-F", "--file-output", type="string", action="append",
                 dest="fileoutput", metavar="TYPE[/ENCODING][/FILENAME]",
                 help=_(
"""Output to a file linkchecker-out.TYPE, $HOME/.linkchecker/blacklist for
'blacklist' output, or FILENAME if specified.
The ENCODING specifies the output encoding, the default is that of your
locale.
Valid encodings are listed at http://docs.python.org/lib/standard-encodings.html.
The FILENAME and ENCODING parts of the 'none' output type will be ignored,
else if the file already exists, it will be overwritten.
You can specify this option more than once. Valid file output types
are %(loggertypes)s. You can specify this option multiple times to output
to more than one file. Default is no file output. Note that you can
suppress all console output with the option '-o none'.""") % \
                 {'loggertypes': linkcheck.LoggerKeys})
group.add_option("--no-status", action="store_false", dest="status",
                 default=True, help=_(
"""Do not print check status messages."""))
# no dest is given here; the value is read back as options.debug
group.add_option("-D", "--debug", type="string", action="append",
                 metavar="STRING",
                 help=_("""Print debugging output for the given logger.
Available loggers are %(lognamelist)s.
Specifying 'all' is an alias for specifying all available loggers.
The option can be given multiple times to debug with more
than one logger.

For accurate results, threading will be disabled during debug runs.""") % \
                 {"lognamelist": linkcheck.lognamelist})
group.add_option("--trace", action="store_true", dest="trace",
                 help=_("""Print tracing information."""))
# the profiling filename is filled into the help text
group.add_option("--profile", action="store_true", dest="profile",
                 help=_(
"""Write profiling data into a file named %s in the
current working directory. See also --viewprof.""") % _profile)
group.add_option("--viewprof", action="store_true", dest="viewprof",
                 help=_(
"""Print out previously generated profiling data. See also --profile."""))
optparser.add_option_group(group)
|
|
|
|
|
|
################# checking options ##################
group = optparse.OptionGroup(optparser, _("Checking options"))
group.add_option("-r", "--recursion-level", type="int", dest="recursionlevel",
                 metavar="NUMBER",
                 help=_(
"""Check recursively all links up to given depth. A negative depth
will enable infinite recursion. Default depth is infinite."""))
# --no-follow-url and --ignore-url both collect lists of regular
# expressions; they are stored as options.extern and options.externstrict
group.add_option("--no-follow-url", type="string", action="append",
                 metavar="REGEX",
                 dest="extern", help=_(
"""Check but do not recurse into URLs matching the given regular
expression. This option can be given multiple times."""))
group.add_option("--ignore-url", type="string", action="append",
                 metavar="REGEX",
                 dest="externstrict", help=_(
"""Only check syntax of URLs matching the given regular expression.
This option can be given multiple times."""))
group.add_option("-C", "--cookies", action="store_true", dest="cookies",
                 help=_(
"""Accept and send HTTP cookies according to RFC 2109. Only cookies
which are sent back to the originating server are accepted.
Sent and accepted cookies are provided as additional logging
information."""))
group.add_option("--cookiefile", type="string", dest="cookiefile",
                 metavar="FILENAME",
                 help=_(
"""Read a file with initial cookie data. The cookie data format is
explained below."""))
group.add_option("-a", "--anchors", action="store_true", dest="anchors",
                 help=_(
"""Check HTTP anchor references. Default is not to check anchors."""))
# store_false: options.anchorcaching is None unless the option is given
group.add_option("--no-anchor-caching", action="store_false",
                 dest="anchorcaching", help=_(
"""Treat url#anchora and url#anchorb as equal on caching. This
is the default browser behaviour, but it's not specified in
the URI specification. Use with care since broken anchors are not
guaranteed to be detected in this mode."""))
group.add_option("-u", "--user", type="string", dest="username",
                 metavar="STRING",
                 help=_(
"""Try the given username for HTTP and FTP authorization.
For FTP the default username is 'anonymous'. For HTTP there is
no default username. See also -p."""))
group.add_option("-p", "--password", type="string", dest="password",
                 metavar="STRING",
                 help=_(
"""Try the given password for HTTP and FTP authorization.
For FTP the default password is 'anonymous@'. For HTTP there is
no default password. See also -u."""))
# the default timeout of the configuration is filled into the help text
group.add_option("--timeout", type="int", dest="timeout",
                 metavar="NUMBER",
                 help=_(
"""Set the timeout for connection attempts in seconds. The default
timeout is %d seconds.""") % config["timeout"])
group.add_option("-P", "--pause", type="int", dest="pause",
                 metavar="NUMBER",
                 help=_(
"""Pause the given number of seconds between two subsequent connection
requests to the same host. Default is no pause between requests."""))
group.add_option("-N", "--nntp-server", type="string", dest="nntpserver",
                 metavar="STRING",
                 help=_(
"""Specify an NNTP server for 'news:...' links. Default is the
environment variable NNTP_SERVER. If no host is given,
only the syntax of the link is checked."""))
group.add_option("--no-proxy-for", type="string", action="append",
                 metavar="REGEX",
                 dest="noproxyfor", help=_(
"""Contact hosts that match the given regular expression directly instead
of going through a proxy. This option can be given multiple times."""))
optparser.add_option_group(group)
|
|
|
|
################# auto completion #####################
# command line completion is only set up when the optional optcomplete
# module was found
if has_optcomplete:
    import optcomplete
    optcomplete.autocomplete(optparser)

# read and parse command line options and arguments
(options, args) = optparser.parse_args()
# initialize logging
if options.debug:
    # reject unknown logger names; print_usage() exits the program
    allowed_debugs = linkcheck.lognames.keys()
    for _name in options.debug:
        if _name not in allowed_debugs:
            print_usage(_("Invalid debug level %(level)r") % {'level': _name})
config.init_logging(debug=options.debug)
# The debug call is wrapped in an assert statement, so it is not executed
# when Python runs with -O; this presumes log.debug() returns None.
assert None == linkcheck.log.debug(linkcheck.LOG_CMDLINE,
    _("Python %(version)s on %(platform)s") % \
    {"version": sys.version, "platform": sys.platform})
|
|
# read configuration files
try:
    # only an existing file given with -f/--config is passed on; a missing
    # one is reported with a warning and config.read() gets an empty list
    files = []
    if options.configfile:
        if os.path.isfile(options.configfile):
            files.append(options.configfile)
        else:
            linkcheck.log.warn(linkcheck.LOG_CMDLINE,
                _("Unreadable config file: %r"), options.configfile)
    config.read(files=files)
except linkcheck.LinkCheckerError, msg:
    # config error
    print_usage(str(msg))
# test if running with root privileges
if not options.allowroot:
    check_user()
# test if running with -O
if options.debug and not __debug__:
    linkcheck.log.warn(linkcheck.LOG_CMDLINE,
        _("Running with python -O disables debugging."))
|
|
# apply commandline options and arguments
# constructauth is set as soon as a username or password option is seen;
# do_profile is set from the --profile option
constructauth = False
do_profile = False
if not options.priority:
    # low thread priority unless --priority was given
    import linkcheck.threader
    linkcheck.threader.set_thread_priority(linkcheck.threader.PRIO_LOW)
if options.warnings is not None:
    config["warnings"] = options.warnings
if options.anchors is not None:
    # anchor checking switches warnings on, overriding --no-warnings
    config["anchors"] = options.anchors
    config["warnings"] = True
if options.extern:
    # --no-follow-url patterns
    pats = [linkcheck.get_link_pat(arg) for arg in options.extern]
    config["externlinks"].extend(pats)
if options.externstrict:
    # --ignore-url patterns; stored in the same list, marked as strict
    pats = [linkcheck.get_link_pat(arg, strict=True) \
            for arg in options.externstrict]
    config["externlinks"].extend(pats)
if options.noproxyfor:
    # try_compile_re() exits the program on an invalid expression
    ros = [try_compile_re(arg) for arg in options.noproxyfor]
    config["noproxyfor"].extend(ros)
|
|
# -o TYPE[/ENCODING]: select the console logger
if options.output:
    if "/" in options.output:
        logtype, encoding = options.output.split("/", 1)
    else:
        logtype, encoding = options.output, linkcheck.i18n.default_encoding
    # Normalize the type once so that the logger lookup, the 'none' test
    # and the logger construction all treat the case the same way. The
    # value as typed by the user is kept for the error messages.
    logkey = logtype.lower()
    if not linkcheck.Loggers.has_key(logkey):
        print_usage(
   _("Unknown logger type %(type)r in %(output)r for option %(option)s") % \
            {"type": logtype, "output": options.output,
             "option": "'-o, --output'"})
    # the 'none' logger ignores the encoding, so it is not validated
    if logkey != 'none' and not has_encoding(encoding):
        print_usage(
   _("Unknown encoding %(encoding)r in %(output)r for option %(option)s") % \
            {"encoding": encoding, "output": options.output,
             "option": "'-o, --output'"})
    config['logger'] = config.logger_new(logkey, encoding=encoding)
|
|
# -F TYPE[/ENCODING][/FILENAME]: add one file logger per given argument
if options.fileoutput:
    for arg in options.fileoutput:
        # Use a fresh keyword dictionary for every -F argument. Sharing one
        # dictionary would carry the encoding or filename of an earlier
        # argument over to later ones that do not specify their own.
        ns = {'fileoutput': 1}
        ftype = arg
        # look for (optional) filename and encoding
        if '/' in ftype:
            ftype, suffix = ftype.split('/', 1)
            if suffix:
                if has_encoding(suffix):
                    # it was an encoding
                    ns['encoding'] = suffix
                elif '/' in suffix:
                    # look for (optional) encoding
                    encoding, filename = suffix.split('/', 1)
                    if has_encoding(encoding):
                        ns['encoding'] = encoding
                        ns['filename'] = filename
                    else:
                        # not an encoding: the whole rest is the filename
                        ns['filename'] = suffix
                else:
                    ns['filename'] = suffix
        # report the offending -F argument itself, not the -o value
        if not linkcheck.Loggers.has_key(ftype):
            print_usage(
   _("Unknown logger type %(type)r in %(output)r for option %(option)s") % \
                {"type": ftype, "output": arg,
                 "option": "'-F, --file-output'"})
        # NOTE: an encoding is only stored above after has_encoding()
        # accepted it, so this is a purely defensive check
        if ftype != 'none' and 'encoding' in ns and \
           not has_encoding(ns['encoding']):
            print_usage(
   _("Unknown encoding %(encoding)r in %(output)r for option %(option)s") % \
                {"encoding": ns['encoding'], "output": arg,
                 "option": "'-F, --file-output'"})
        logger = config.logger_new(ftype, **ns)
        config['fileoutput'].append(logger)
|
|
# Apply the remaining options. Options that were not given are None and
# leave the configuration untouched. The order matters where several
# options write the same configuration key.
if options.interactive is not None:
    config['interactive'] = options.interactive
if options.nntpserver:
    config["nntpserver"] = options.nntpserver
if options.anchorcaching is not None:
    config["anchorcaching"] = options.anchorcaching
if options.password is not None:
    _password = options.password
    constructauth = True
if options.pause is not None:
    # negative pauses are rejected; print_usage() exits the program
    if options.pause >= 0:
        config["wait"] = options.pause
    else:
        print_usage(_("Illegal argument %(arg)r for option %(option)s") % \
                    {"arg": options.pause, "option": "'-P, --pause'"})
if options.profile is not None:
    do_profile = options.profile
if options.quiet is not None:
    # replaces a console logger selected with -o
    config['logger'] = config.logger_new('none')
if options.recursionlevel is not None:
    config["recursionlevel"] = options.recursionlevel
if options.status is not None:
    config['status'] = options.status
if options.threads is not None:
    # every value below 1 is stored as 0
    if options.threads < 1:
        options.threads = 0
    config["threads"] = options.threads
if options.timeout is not None:
    # only positive timeouts are accepted
    if options.timeout > 0:
        config["timeout"] = options.timeout
    else:
        print_usage(_("Illegal argument %(arg)r for option %(option)s") % \
                    {"arg": options.timeout, "option": "'--timeout'"})
# make the configured timeout the default for all new sockets
socket.setdefaulttimeout(config["timeout"])
if options.username is not None:
    _username = options.username
    constructauth = True
if options.version is not None:
    # prints the version and exits
    print_version()
if options.verbose is not None:
    if options.verbose:
        # verbose output switches warnings on as well
        config["verbose"] = True
        config["warnings"] = True
if options.viewprof:
    # prints the profiling data and exits
    viewprof()
if options.warningregex is not None:
    config["warningregex"] = try_compile_re(options.warningregex)
    config["warnings"] = True
if options.warningsizebytes is not None:
    config["warnsizebytes"] = options.warningsizebytes
if options.cookies is not None:
    config['storecookies'] = options.cookies
    config['sendcookies'] = options.cookies
if constructauth:
    # the credentials from the command line apply to every URL (".+")
    config["authentication"].append({'pattern': try_compile_re(".+"),
                                     'user': _username,
                                     'password': _password})
|
|
|
|
# dump the final configuration to the debug log; wrapped in an assert
# statement so the call is not executed when Python runs with -O
assert None == linkcheck.log.debug(linkcheck.LOG_CMDLINE,
    "configuration: %s", pprint.pformat(config.items()))
# warn about sitemap loggers and verbose output
# collect the classes of the console logger and all file loggers
klasses = [c.__class__ for c in [config['logger']] + config['fileoutput']]
if (linkcheck.logger.gml.GMLLogger in klasses or \
    linkcheck.logger.dot.DOTLogger in klasses or \
    linkcheck.logger.gxml.GraphXMLLogger in klasses) and \
   not config['verbose']:
    linkcheck.log.warn(linkcheck.LOG_CMDLINE,
        _("Using DOT or GML loggers without verbose output"
          " gives an incomplete sitemap graph."))

# interactive input
# without URL arguments either ask the user (interactive mode) or warn
if len(args) <= 0:
    if config['interactive']:
        urls = raw_input(
            _("enter one or more URLs, separated by white-space\n--> "))
        args = urls.split()
    else:
        linkcheck.log.warn(linkcheck.LOG_CMDLINE, _("no files or URLs given"))
|
|
|
|
# prepare checking queue
aggregate = linkcheck.director.get_aggregate(config)
if options.cookiefile is not None:
    # load the initial cookies given with --cookiefile; any error while
    # reading or adding them is logged and ends the program
    try:
        cookies = linkcheck.cookies.from_file(options.cookiefile)
        for headers, scheme, host, path in cookies:
            aggregate.cookies.add(headers, scheme, host, path)
        config["sendcookies"] = True
    except StandardError:
        linkcheck.log.error(linkcheck.LOG_CMDLINE,
            _("Could not parse cookie file: %s"), sys.exc_info()[1])
        sys.exit(1)
if options.trace:
    # tracing is restricted to names matching ^linkcheck
    config["trace"] = True
    import linkcheck.trace
    linkcheck.trace.trace_filter([r"^linkcheck"])
    linkcheck.trace.trace_on()
|
|
# add urls to queue
get_url_from = linkcheck.checker.get_url_from
for url in args:
    # syntactic sugar: complete bare "www." and "ftp." hosts with a scheme
    _lowered = url.lower()
    if _lowered.startswith("www."):
        url = "http://%s" % url
    elif _lowered.startswith("ftp."):
        url = "ftp://%s" % url
    url_data = get_url_from(url, 0, aggregate, assume_local=True)
    try:
        linkcheck.add_intern_pattern(url_data, config)
    except UnicodeError:
        # the domain name could not be handled; give up
        _err = sys.exc_info()[1]
        linkcheck.log.error(linkcheck.LOG_CMDLINE,
            _("URL has unparsable domain name: %s"), _err)
        sys.exit(1)
    aggregate.urlqueue.put(url_data)
|
|
# set up profiling
if do_profile and not has_profile:
    # profiling was requested but is not possible
    linkcheck.log.warn(linkcheck.LOG_CMDLINE,
        _("The `profile' Python module is not installed,"
          " therefore the --profile option is disabled."))
    do_profile = False
elif do_profile and os.path.exists(_profile):
    # ask before an existing profiling file gets overwritten
    question = _("Overwrite profiling file %r?\n"
                 "Press Ctrl-C to cancel, RETURN to continue.") % _profile
    try:
        raw_input(question)
    except KeyboardInterrupt:
        print >>sys.stderr
        print >>sys.stderr, _("Canceled.")
        sys.exit(1)

# start checking
if not do_profile:
    linkcheck.director.check_urls(aggregate)
else:
    import profile
    profile.run("linkcheck.director.check_urls(aggregate)", _profile)

# interactive input end
if config['interactive']:
    raw_input(_("Hit RETURN to finish"))

# if errors or warnings are encountered, exit with non-zero status;
# warnings only count when they are enabled in the configuration
_console_logger = config['logger']
_failed = _console_logger.errors or \
    (_console_logger.warnings and config['warnings'])
if _failed:
    sys.exit(1)
|