mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-23 01:10:27 +00:00
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1282 e7d03fd6-7b0d-0410-9947-9c21f3af8025
428 lines
17 KiB
Python
Executable file
428 lines
17 KiB
Python
Executable file
#!/usr/bin/python -O
|
|
# -*- coding: iso-8859-1 -*-
|
|
"""check HTML pages for broken links"""
|
|
# Copyright (C) 2000-2004 Bastian Kleineidam
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program; if not, write to the Free Software
|
|
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
|
|
# imports and checks
import sys

# Bail out before importing anything else when the interpreter is too old
# (or so old that it does not even provide sys.version_info).
if not (hasattr(sys, 'version_info') and
        sys.version_info >= (2, 3, 0, 'final', 0)):
    raise SystemExit("This program requires Python 2.3 or later.")

import getopt
import re
import os
import pprint
import socket
import linkcheck

# Default socket timeout in seconds; --timeout may override it later and
# the value is also quoted in that option's help text.
default_timeout = 60
socket.setdefaulttimeout(default_timeout)

# debugging helpers used unqualified below
from linkcheck.debug import *
from linkcheck.log import LoggerKeys
from linkcheck import StringUtil, Config, i18n

# some default values
# file that --profile writes and --viewprof reads
_profile = "linkchecker.prof"
# credentials used when only one of -u / -p is given
_username = "anonymous"
_password = "guest@"
|
|
|
|
# Help texts. Each one is passed through i18n._() so it can be translated.

# main usage text, returned by LCOptionParser.get_usage()
Usage = i18n._("USAGE\tlinkchecker [options] file-or-url...\n")

# general remarks, appended to the option help by LCOptionParser.print_help()
Notes = i18n._("""NOTES
o A ! before any regex negates it. So '!^mailto:' matches everything but
a mailto link.
o LinkCheckers commandline parser treats "ftp." links like "ftp://ftp."
and "www." links like "http://www.".
You can also give local files as arguments.
o If you have your system configured to automatically establish a
connection to the internet (e.g. with diald), it will connect when
checking links not pointing to your local host.
Use the -s and -i options to prevent this.
o Javascript links are currently ignored.
o If your platform does not support threading, LinkChecker uses -t0.
o You can supply multiple user/password pairs in a configuration file.
o To use proxies set $http_proxy, $https_proxy, $ftp_proxy, $gopher_proxy
on Unix or Windows.
On a Mac use the Internet Config.
o When checking 'news:' links the given NNTP host doesn't need to be the
same as the host of the user browsing your pages!
""")

# sample invocations; a raw string because the text contains backslashes
# that must reach the user unchanged
Examples = i18n._(r"""EXAMPLES
Check the treasure.calvinsplayground.de domain, but don't recurse into
links outside of this domain:
linkchecker -v --status -r5 -Fhtml -itreasure\.calvinsplayground\.de \
-ocolored http://treasure.calvinsplayground.de/

Don't connect to mailto: hosts, only check their URL syntax. All other
links are checked as usual:
linkchecker --intern='!^mailto:' --strict www.mysite.org

Checking a local HTML file on Unix:
linkchecker ../bla.html

Checking a local HTML file on Windows:
linkchecker c:\temp\test.html

You can skip the "http://" url part if the domain starts with "www.":
linkchecker www.myhomepage.de

You can skip the "ftp://" url part if the domain starts with "ftp.":
linkchecker -r0 ftp.linux.org
""")
|
|
|
|
def printVersion ():
    """Write the application info string to stdout and exit with status 0."""
    info = linkcheck.Config.AppInfo
    print(info)
    raise SystemExit(0)
|
|
|
|
def printUsage (msg):
    """Report a command line error on stderr and exit with status 1.

    msg is interpolated into the translated "Error: %s" line, which is
    followed by a hint to run 'linkchecker -h'.
    """
    write = sys.stderr.write
    write(i18n._("Error: %s\n") % msg)
    write(i18n._("Execute 'linkchecker -h' for help\n"))
    raise SystemExit(1)
|
|
|
|
|
|
def viewprof ():
    """Print previously recorded profiling data and exit.

    Reads the file named by the module-level _profile and prints the 50
    top entries sorted by cumulative time, then exits with status 0.
    If the file does not exist, an explanation is written to stderr and
    the program exits with status 1.
    """
    if not os.path.exists(_profile):
        # Terminate each message with a newline so the two sentences do
        # not run together on one line. The newline is appended outside of
        # i18n._() to keep the translatable message ids unchanged.
        sys.stderr.write(
            i18n._("Could not find profiling file %s.") % _profile + "\n")
        sys.stderr.write(
            i18n._("Please run linkchecker with --profile to generate it.")
            + "\n")
        sys.exit(1)
    # imported lazily: only needed for this rarely used code path
    import pstats
    stats = pstats.Stats(_profile)
    stats.strip_dirs().sort_stats("cumulative").print_stats(50)
    sys.exit(0)
|
|
|
|
# Read command line arguments
|
|
from optparse import OptionParser, OptionGroup
|
|
|
|
class LCOptionParser (OptionParser):
    """OptionParser with LinkChecker specific usage, help and error output."""

    def error (self, msg):
        """Delegate parse errors to printUsage()."""
        printUsage(msg)

    def get_usage (self):
        """Return the module-level Usage text."""
        return Usage

    def print_help (self, file=None):
        """Show option help followed by Notes and Examples, then exit(0).

        The file argument is accepted but not used: on 'posix' the text
        is printed to stdout, everywhere else it goes through
        StringUtil.paginate().
        """
        parts = (self.format_help(), Notes, Examples)
        text = "%s\n%s\n%s" % parts
        if os.name == 'posix':
            print(text)
        else:
            StringUtil.paginate(text)
        sys.exit(0)
|
|
|
|
# The single parser instance. The option groups below only structure the
# --help output; every value ends up on the same options object.
optparser = LCOptionParser()

################# general options ##################
group = OptionGroup(optparser, i18n._("General options"))
group.add_option("-f", "--config", type="string", dest="configfile",
    help=i18n._(
"""Use file as configuration file. As default LinkChecker first
searches /etc/linkcheckerrc and then ~/.linkcheckerrc
(under Windows <path-to-program>\\linkcheckerrc)."""))
group.add_option("-I", "--interactive", action="store_true", dest="interactive",
    help=i18n._(
"""Ask for url if none are given on the commandline."""))
group.add_option("-t", "--threads", type="int", dest="threads",
    help=i18n._(
"""Generate no more than num threads. Default number of threads is 5.
To disable threading specify a non-positive number."""))

group.add_option("-V", "--version", action="store_true", dest="version",
    help=i18n._(
"""Print version and exit."""))
optparser.add_option_group(group)


################# output options ##################
group = OptionGroup(optparser, i18n._("Output options"))
group.add_option("-v", "--verbose", action="store_true", dest="verbose",
    help=i18n._(
"""Log all checked URLs (implies -w). Default is to log only invalid
URLs."""))
group.add_option("-w", "--warnings", action="store_true", dest="warnings",
    help=i18n._("""Log warnings."""))
group.add_option("-W", "--warning-regex", type="string", dest="warningregex",
    help=i18n._(
"""Define a regular expression which prints a warning if it matches
any content of the checked link.
This applies of course only to pages which are valid, so we can
get their content.
Use this to check for pages that contain some form of error
message, for example 'This page has moved' or 'Oracle
Application Server error'.
This option implies -w."""))
# type="int" is required here: without it optparse stores the raw string,
# and a byte count kept as text cannot be compared with a numeric size.
group.add_option("--warning-size-bytes", type="int", dest="warningsizebytes",
    help=i18n._(
"""Print a warning if content size is available and exceeds the given
number of bytes. This option implies -w."""))
group.add_option("-q", "--quiet", action="store_true", dest="quiet",
    help=i18n._(
"""Quiet operation. This is only useful with -F."""))
group.add_option("-o", "--output", type="string", dest="output",
    help=i18n._(
"""Specify output as %s. Default output type is text.""")%LoggerKeys)
group.add_option("-F", "--file-output", type="string", action="append",
    dest="fileoutput", help=i18n._(
"""type[/filename]
Output to a file linkchecker-out.<type>, $HOME/.linkchecker_blacklist for
'blacklist' output, or <filename> if specified.
The <filename> part of the 'none' output type will be ignored,
else if the file already exists, it will be overwritten.
You can specify this option more than once. Valid file output types
are %s.
Default is no file output. If console output is not specified with -o,
this option suppresses all console output by implying -o none.""")%LoggerKeys)
# no explicit dest: optparse derives "debug" from the long option name
group.add_option("-D", "--debug", action="count",
    help=i18n._(
"""Print debugging information. Provide this option multiple times
for even more debugging information."""))
group.add_option("--status", action="store_true", dest="status",
    help=i18n._(
"""Print check status every 5 seconds to stderr."""))
group.add_option("--profile", action="store_true", dest="profile",
    help=i18n._(
"""Write profiling data into a file named %s in the
current working directory. See also --viewprof.""")%_profile)
group.add_option("--viewprof", action="store_true", dest="viewprof",
    help=i18n._(
"""Print out previously generated profiling data. See also --profile."""))
optparser.add_option_group(group)


################# checking options ##################
group = OptionGroup(optparser, i18n._("Checking options"))
group.add_option("-r", "--recursion-level", type="int", dest="recursionlevel",
    help=i18n._(
"""Check recursively all links up to given depth. A negative depth
will enable inifinite recursion. Default depth is 1."""))
group.add_option("-e", "--extern", type="string", action="append", dest="extern",
    help=i18n._(
"""Assume urls that match the given expression as external.
Only internal HTML links are checked recursively."""))
group.add_option("-i", "--intern", type="string", action="append", dest="intern",
    help=i18n._(
""" regex, --intern=regex
Assume URLs that match the given expression as internal.
LinkChecker descends recursively only to internal URLs, not to
external."""))
group.add_option("-d", "--denyallow", action="store_true", dest="denyallow",
    help=i18n._(
"""Swap checking order to external/internal. Default checking order
is internal/external."""))
group.add_option("-s", "--strict", action="store_true", dest="strict",
    help=i18n._(
"""Check only syntax of external links, do not try to connect to them.
For local file urls, only local files are internal. For
http and ftp urls, all urls at the same domain name are internal."""))
group.add_option("-C", "--cookies", action="store_true", dest="cookies",
    help=i18n._(
"""Accept and send HTTP cookies according to RFC 2109. Only cookies
which are sent back to the originating server are accepted.
Sent and accepted cookies are provided as additional logging
information."""))
group.add_option("-a", "--anchors", action="store_true", dest="anchors",
    help=i18n._(
"""Check HTTP anchor references. This option applies to both internal
and external urls. Default is don't check anchors.
This option implies -w because anchor errors are always warnings."""))
# store_false: giving the option sets options.anchorcaching to False
group.add_option("--no-anchor-caching", action="store_false", dest="anchorcaching",
    help=i18n._(
"""Treat url#anchora and url#anchorb as equal on caching. This
is the default browser behaviour, but it's not specified in
the URI specification. Use with care."""))
group.add_option("-u", "--user", type="string", dest="username",
    help=i18n._(
"""Try given username for HTTP and FTP authorization.
Default is %r. See also -p.""")%_username)
group.add_option("-p", "--password", type="string", dest="password",
    help=i18n._(
"""Try given password for HTTP and FTP authorization.
Default password is %r. See also -u.""")%_password)
group.add_option("--timeout", type="int", dest="timeout",
    help=i18n._(
"""Set the timeout for TCP connection attempts in seconds. The default
timeout is %d seconds.""") % default_timeout)
group.add_option("-P", "--pause", type="int", dest="pause",
    help=i18n._(
"""Pause <secs> seconds between each url check. This option implies -t0.
Default is no pause between requests."""))
group.add_option("-N", "--nntp-server", type="string", dest="nntpserver",
    help=i18n._(
"""Specify an NNTP server for 'news:...' links. Default is the
environment variable NNTP_SERVER. If no host is given,
only the syntax of the link is checked."""))
optparser.add_option_group(group)


################# deprecated options ##################
group = OptionGroup(optparser, i18n._("Deprecated options"))
# still accepted on the command line; the value is not read by this script
group.add_option("-R", "--robots-txt", action="store_true")
optparser.add_option_group(group)
|
|
|
|
################# auto completion #####################
# let optcomplete look at the fully populated parser before parsing
from linkcheck import optcomplete
optcomplete.autocomplete(optparser)

# undocumented switch, handled before regular option parsing so that it
# never reaches optparse
if "--wischiwaschi" in sys.argv:
    from linkcheck import util1
    util1.abbuzze()
    raise SystemExit(0)

# options: parsed option values; args: remaining files/urls to check
options, args = optparser.parse_args()
|
|
|
|
# set debug level as early as possible
if options.debug is not None:
    set_debuglevel(options.debug)
debug(BRING_IT_ON, "Python", sys.version, "on", sys.platform)

# config object
config = linkcheck.Config.Configuration()
# read configuration from config files; an explicitly given -f file is
# the only entry passed in from here
configfiles = []
if options.configfile:
    configfiles.append(options.configfile)
config.read(configfiles)

# apply commandline options and arguments
# Options left at None were not given on the command line and must not
# override values that came from the configuration files.
constructauth = False
do_profile = False
if options.anchors is not None:
    config["anchors"] = options.anchors
    # anchor problems are reported as warnings, so enable them
    config["warnings"] = True
if options.extern:
    config["externlinks"].extend(
        [linkcheck.getLinkPat(arg) for arg in options.extern])
if options.output:
    if linkcheck.log.Loggers.has_key(options.output):
        config['log'] = config.newLogger(options.output)
    else:
        printUsage(i18n._("Illegal argument %r for option %s") % \
                   (options.output, "'-o, --output'"))
if options.fileoutput:
    for spec in options.fileoutput:
        # Build the logger arguments freshly for every -F option. A dict
        # shared between iterations would carry the filename of an
        # earlier "type/filename" over to a later plain "type".
        ns = {'fileoutput': 1}
        if '/' in spec:
            ftype, filename = spec.split('/', 1)
            # "type/" names no file: behave as if only "type" was given
            if filename:
                ns['filename'] = filename
        else:
            ftype = spec
        if linkcheck.log.Loggers.has_key(ftype):
            config['fileoutput'].append(config.newLogger(ftype, ns))
        else:
            printUsage(i18n._("Illegal argument %r for option %s") % \
                       (ftype, "'-F, --file-output'"))
    # file output without an explicit -o silences the console
    if not options.output:
        config['log'] = config.newLogger('none')
if options.interactive is not None:
    config['interactive'] = options.interactive
if options.intern:
    config["internlinks"].extend(
        [linkcheck.getLinkPat(arg) for arg in options.intern])
if options.denyallow is not None:
    config["denyallow"] = options.denyallow
if options.nntpserver:
    config["nntpserver"] = options.nntpserver
if options.anchorcaching is not None:
    config["anchorcaching"] = options.anchorcaching
if options.password is not None:
    _password = options.password
    constructauth = True
if options.pause is not None:
    if options.pause >= 0:
        config["wait"] = options.pause
    else:
        printUsage(i18n._("Illegal argument %d for option %s") % \
                   (options.pause, "'-P, --pause'"))
if options.profile is not None:
    do_profile = options.profile
if options.quiet is not None:
    config["quiet"] = options.quiet
if options.recursionlevel is not None:
    config["recursionlevel"] = options.recursionlevel
if options.strict is not None:
    config["strict"] = options.strict
if options.status is not None:
    config['status'] = options.status
if options.threads is not None:
    config.setThreads(options.threads)
if options.timeout is not None:
    if options.timeout > 0:
        socket.setdefaulttimeout(options.timeout)
    else:
        printUsage(i18n._("Illegal argument %r for option %s") % \
                   (options.timeout, "'--timeout'"))
if options.username is not None:
    _username = options.username
    constructauth = True
if options.version is not None:
    printVersion()
if options.verbose is not None:
    if options.verbose:
        config["verbose"] = True
        config["warnings"] = True
if options.viewprof:
    viewprof()
if options.warnings is not None:
    config["warnings"] = options.warnings
if options.warningregex is not None:
    # report a malformed expression as a usage error, not a traceback
    try:
        config["warningregex"] = re.compile(options.warningregex)
    except re.error:
        printUsage(i18n._("Illegal argument %r for option %s") % \
                   (options.warningregex, "'-W, --warning-regex'"))
    config["warnings"] = True
if options.warningsizebytes is not None:
    # Store a number: the option value may arrive as text, and a textual
    # byte count cannot be compared with a numeric content size.
    try:
        config["warnsizebytes"] = int(options.warningsizebytes)
    except ValueError:
        printUsage(i18n._("Illegal argument %r for option %s") % \
                   (options.warningsizebytes, "'--warning-size-bytes'"))
    # the option's help text promises that it implies -w
    config["warnings"] = True
if options.cookies is not None:
    config['cookies'] = options.cookies
if constructauth:
    # Put the command line credentials first in the list, with a pattern
    # that matches every url.
    config["authentication"].insert(0, {'pattern': re.compile(".*"),
                                        'user': _username,
                                        'password': _password})

debug(HURT_ME_PLENTY, "configuration:", pprint.pformat(config.items()))
|
|
|
|
# interactive input
# Without positional arguments either ask for urls (interactive mode) or
# just emit a warning; args stays empty in the latter case.
if not args:
    if not config['interactive']:
        warn(i18n._("no files or urls given"))
    else:
        urls = raw_input(i18n._("enter one or more urls, separated by white-space\n--> "))
        args = urls.split()

# syntactic sugar
# Arguments without any ":" that start with "www." or "ftp." get the
# matching scheme prepended; everything else is passed on as given.
from linkcheck import UrlData
for url in args:
    url = url.strip()
    if ":" not in url:
        if url.startswith("www."):
            url = "http://%s" % (url,)
        elif url.startswith("ftp."):
            url = "ftp://%s" % (url,)
    urldata = UrlData.GetUrlDataFrom(url, 0, config, cmdline=True)
    config.appendUrl(urldata)
|
|
|
|
############################# check the urls ################################
if not do_profile:
    # psyco is deliberately not used: at the moment (Oct 2003) it has
    # bugs causing infinite loops when threads are enabled, and it
    # disables the Ctrl-C break button of the Python interpreter.
    linkcheck.checkUrls(config)
else:
    # --profile: run the same call under the profiler and store the
    # statistics in the file named by _profile
    import profile
    profile.run("linkcheck.checkUrls(config)", _profile)
#############################################################################

# interactive input end
# wait for the user before the script ends
if config['interactive']:
    raw_input(i18n._("Hit RETURN to finish"))
|