linkchecker/linkchecker
2003-12-05 00:38:30 +00:00

454 lines
15 KiB
Python
Executable file

#!/usr/bin/python -O
# -*- coding: iso-8859-1 -*-
"""check HTML pages for broken links"""
# Copyright (C) 2000-2002 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# imports and checks
import sys
# Refuse to start on interpreters older than Python 2.3 final. Very old
# interpreters do not provide sys.version_info at all, hence the hasattr
# test before the tuple comparison.
_have_version_info = hasattr(sys, 'version_info')
if not (_have_version_info and sys.version_info >= (2, 3, 0, 'final', 0)):
    raise SystemExit("This program requires Python 2.3 or later.")
# do not use psyco, at the moment (Oct 2003) it has bugs causing
# infinite loops when threads are enabled, and psyco disables
# the Ctrl-C break button of the Python interpreter.
#try:
# import psyco
# psyco.profile()
#except ImportError:
# pass
import getopt, re, os, pprint, socket, linkcheck
# Set a default timeout of 30 seconds for newly created sockets. The
# --timeout option handled further down replaces this value.
socket.setdefaulttimeout(30)
# Import several helper debugging things. The star import is presumably
# where debug(), warn(), get_debuglevel(), set_debuglevel() and the debug
# level constants used below come from -- confirm against linkcheck.debug.
from linkcheck.debug import *
from linkcheck.log import LoggerKeys
from linkcheck import StringUtil, Config, i18n
# Name of the profiling data file: written by --profile and read back by
# --viewprof, relative to the current working directory.
_profile = "linkchecker.prof"
# Main usage text, printed by printHelp(). The whole text is looked up as
# one message through i18n._() and %-formatted afterwards: the first %s
# (list of -o output types) is filled from LoggerKeys, the second (name
# of the --profile data file) from _profile.
# NOTE(review): the text disagrees with the option handling in two places:
# it documents --warning-size-bytes, which is not in the getopt long option
# list of this script, and it names 'joe@' as default password while
# _password is initialised to "guest@". Confirm which side is intended.
Usage = i18n._("""USAGE\tlinkchecker [options] file-or-url...
OPTIONS
For single-letter option arguments the space is not a necessity. So
'-o colored' is the same as '-ocolored'.
-a, --anchors
Check HTTP anchor references. This option applies to both internal
and external urls. Default is don't check anchors.
This option implies -w because anchor errors are always warnings.
-C, --cookies
Accept and send HTTP cookies according to RFC 2109. Only cookies
which are sent back to the originating server are accepted.
Sent and accepted cookies are provided as additional logging
information.
-d, --denyallow
Swap checking order to external/internal. Default checking order
is internal/external.
-D, --debug
Print debugging information. Provide this option multiple times
for even more debugging information.
-e regex, --extern=regex
Assume urls that match the given expression as external.
Only internal HTML links are checked recursively.
-f file, --config=file
Use file as configuration file. As default LinkChecker first
searches /etc/linkcheckerrc and then ~/.linkcheckerrc
(under Windows <path-to-program>\\linkcheckerrc).
-F type[/filename], --file-output=type[/filename]
Same as -o, but write to a file linkchecker-out.<type>
or <filename> if specified. If the file already exists, it
is overwritten. You can specify this option more than once.
There is no file output for the blacklist logger. Default is
no file output.
-I, --interactive
Ask for url if none are given on the commandline.
-i regex, --intern=regex
Assume URLs that match the given expression as internal.
LinkChecker descends recursively only to internal URLs, not to
external.
-h, --help
Help me! Print usage information for this program.
-N server, --nntp-server=server
Specify an NNTP server for 'news:...' links. Default is the
environment variable NNTP_SERVER. If no host is given,
only the syntax of the link is checked.
--no-anchor-caching
Treat url#anchora and url#anchorb as equal on caching. This
is the default browser behaviour, but it's not specified in
the URI specification. Use with care.
-o type, --output=type
Specify output type as %s.
Default type is text.
-p pwd, --password=pwd
Try password pwd for HTTP and FTP authorization.
Default password is 'joe@'. See also -u.
-P secs, --pause=secs
Pause <secs> seconds between each url check. This option
implies -t0.
Default is no pause between requests.
--profile
Write profiling data into a file named %s in the
current working directory.
See also --viewprof.
-q, --quiet
Quiet operation. This is only useful with -F.
-r depth, --recursion-level=depth
Check recursively all links up to given depth. A negative depth
will enable inifinite recursion.
Default depth is 1.
-s, --strict
Check only syntax of external links, do not try to connect to them.
For local file urls, only local files are internal. For
http and ftp urls, all urls at the same domain name are internal.
-t num, --threads=num
Generate no more than num threads. Default number of threads is 5.
To disable threading specify a non-positive number.
--timeout=secs
Set the timeout for TCP connection attempts in seconds. The default
timeout is 30 seconds.
-u name, --user=name
Try username name for HTTP and FTP authorization.
Default is 'anonymous'. See also -p.
-V, --version
Print version and exit.
-v, --verbose
Log all checked URLs (implies -w). Default is to log only invalid
URLs.
--viewprof
Print out previously generated profiling data. See also --profile.
-w, --warnings
Log warnings.
-W regex, --warning-regex=regex
Define a regular expression which prints a warning if it matches
any content of the checked link.
This applies of course only to pages which are valid, so we can
get their content.
Use this to check for pages that contain some form of error
message, for example 'This page has moved' or 'Oracle
Application Server error'.
This option implies -w.
--warning-size-bytes=bytes
Print a warning if content size is available and exceeds the given
number of bytes.
This option implies -w.
""") % (LoggerKeys, _profile)
# Notes section of the help output; printHelp() emits it after Usage.
# Like Usage it is looked up as a single message through i18n._().
Notes = i18n._("""NOTES
o A ! before any regex negates it. So '!^mailto:' matches everything but
a mailto link.
o LinkCheckers commandline parser treats "ftp." links like "ftp://ftp."
and "www." links like "http://www.".
You can also give local files as arguments.
o If you have your system configured to automatically establish a
connection to the internet (e.g. with diald), it will connect when
checking links not pointing to your local host.
Use the -s and -i options to prevent this.
o Javascript links are currently ignored.
o If your platform does not support threading, LinkChecker uses -t0.
o You can supply multiple user/password pairs in a configuration file.
o To use proxies set $http_proxy, $https_proxy, $ftp_proxy, $gopher_proxy
on Unix or Windows.
On a Mac use the Internet Config.
o When checking 'news:' links the given NNTP host doesn't need to be the
same as the host of the user browsing your pages!
""")
# Examples section of the help output; printHelp() emits it last, after
# Usage and Notes. The doubled backslashes are escapes inside the string
# literal and come out as single backslashes in the printed text.
Examples = i18n._("""EXAMPLES
o Check the treasure.calvinsplayground.de domain, but don't recurse into
links outside of this domain:
linkchecker -v -r5 -ohtml -s \\
http://treasure.calvinsplayground.de/~calvin/ > sample.html
o Don't connect to mailto: hosts, only check their URL syntax. All other
links are checked as usual:
linkchecker --intern='!^mailto:' --strict www.mysite.org
o Local files and syntactic sugar on the command line:
linkchecker c:\\temp\\test.html
linkchecker ../bla.html
linkchecker www.myhomepage.de
linkchecker -r0 ftp.linux.org
""")
def printVersion ():
    """write the application info to stdout, then exit with status 0"""
    app_info = linkcheck.Config.AppInfo
    print(app_info)
    raise SystemExit(0)
def printHelp ():
    """show usage, notes and examples, then exit with status 0

    On posix systems the three texts are printed one after another; on
    every other platform they are joined with newlines and handed to
    StringUtil.paginate().
    """
    sections = (Usage, Notes, Examples)
    if os.name == 'posix':
        for text in sections:
            print(text)
    else:
        StringUtil.paginate("\n".join(sections))
    raise SystemExit(0)
def printUsage (msg):
    """report a command line error on stderr and exit with status 1

    msg is interpolated with %s into the translated error line, so it
    may be a string or any object with a string representation (the
    option parser passes the getopt exception instance).
    """
    err = sys.stderr
    err.write(i18n._("Error: %s\n") % msg)
    err.write(i18n._("Execute 'linkchecker -h' for help\n"))
    raise SystemExit(1)
def viewprof ():
    """print previously recorded profiling data and exit

    Loads the statistics file named by the module level _profile, strips
    the directory part of the file names, sorts the entries by cumulative
    time and prints the first 50 of them, then exits with status 0.
    If the file does not exist, a hint is written to stderr and the
    program exits with status 1.
    """
    if not os.path.exists(_profile):
        # The line ends are appended outside of the translated texts so
        # the message ids (and existing translations) stay untouched.
        # Without them both messages ran together on one line.
        sys.stderr.write(
            i18n._("Could not find profiling file %s.") % _profile + "\n")
        sys.stderr.write(
            i18n._("Please run linkchecker with --profile to generate it.")
            + "\n")
        sys.exit(1)
    # imported here because it is only needed for --viewprof
    import pstats
    stats = pstats.Stats(_profile)
    stats.strip_dirs().sort_stats("cumulative").print_stats(50)
    sys.exit(0)
# Read command line arguments.
# Short options; a trailing colon marks an option that takes an argument.
_short_options = "adCDe:f:F:hIi:N:o:p:P:qr:Rst:u:VvwW:"
# Long options; a trailing "=" marks an option that takes an argument.
# "wischiwaschi" is not mentioned in the usage text, and "robots-txt"
# (like -R) is still accepted although the option loop ignores it.
# NOTE(review): the usage text documents --warning-size-bytes, which is
# missing from this list, so getopt rejects it as unknown.
_long_options = [
    "anchors",
    "config=",
    "cookies",
    "debug",
    "extern=",
    "file-output=",
    "nntp-server=",
    "help",
    "interactive",
    "intern=",
    "denyallow",
    "output=",
    "password=",
    "pause=",
    "profile",
    "quiet",
    "recursion-level=",
    "no-anchor-caching",
    "wischiwaschi",
    "robots-txt",
    "strict",
    "threads=",
    "timeout=",
    "user=",
    "version",
    "verbose",
    "viewprof",
    "warnings",
    "warning-regex=",
]
try:
    # Note: cut out the name of the script
    options, args = getopt.getopt(sys.argv[1:], _short_options, _long_options)
except getopt.error:
    # hand the exception instance to printUsage, which reports it and exits
    printUsage(sys.exc_info()[1])
# Raise the debug level once per -D/--debug occurrence, before anything
# else is done, so that all later steps can already emit debug output.
for opt, arg in options:
    if opt in ("-D", "--debug"):
        set_debuglevel(get_debuglevel()+1)
debug(BRING_IT_ON, "Python", sys.version, "on", sys.platform)
# Read the configuration; every -f/--config argument is passed on as a
# configuration file, in command line order.
config = linkcheck.Config.Configuration()
configfiles = [arg for opt, arg in options if opt in ("-f", "--config")]
config.read(configfiles)
# disable threading for debugging
if not (get_debuglevel() <= 0):
    config.disableThreading()
# apply commandline options and arguments

def _illegal_argument (arg, optname):
    """report an unusable argument for option optname via printUsage

    printUsage writes the message to stderr and exits, so this helper
    does not return.
    """
    printUsage(i18n._("Illegal argument %s for option %s") % \
               (repr(arg), optname))


def _int_argument (arg, optname):
    """return arg converted to int, or report it as illegal and exit"""
    try:
        return int(arg)
    except ValueError:
        _illegal_argument(arg, optname)


# Credentials given with -u/-p; they are only turned into an
# authentication entry if at least one of the two options was given.
# NOTE(review): the usage text names 'joe@' as the default password.
_user = "anonymous"
_password = "guest@"
constructauth = False
do_profile = False
for opt,arg in options:
    if opt in ("-a", "--anchors"):
        # anchor errors are reported as warnings, so enable those too
        config["anchors"] = True
        config["warnings"] = True
    elif opt in ("-e", "--extern"):
        config["externlinks"].append(linkcheck.getLinkPat(arg))
    elif opt in ("-h", "--help"):
        printHelp()
    elif opt in ("-o", "--output"):
        if linkcheck.log.Loggers.has_key(arg):
            config['log'] = config.newLogger(arg)
        else:
            printUsage((i18n._("Illegal argument '%s' for option ") % arg) +\
                       "'-o, --output'")
    elif opt in ("-F", "--file-output"):
        # the argument is "type" or "type/filename"
        ns = {'fileoutput': 1}
        try:
            logtype, ns['filename'] = arg.split('/', 1)
            if not ns['filename']: raise ValueError
        except ValueError:
            # no slash or an empty file name: take the whole argument
            # as logger type (and let the check below reject "type/")
            logtype = arg
        if linkcheck.log.Loggers.has_key(logtype) and logtype != "blacklist":
            config['fileoutput'].append(config.newLogger(logtype, ns))
        else:
            printUsage((i18n._("Illegal argument '%s' for option ") % arg) +\
                       "'-F, --file-output'")
    elif opt in ("-I", "--interactive"):
        config['interactive'] = True
    elif opt in ("-i", "--intern"):
        config["internlinks"].append(linkcheck.getLinkPat(arg))
    elif opt in ("-d", "--denyallow"):
        # The short option declared for getopt and documented in the
        # usage text is -d; the former test for "-l" could never match,
        # so -d was silently ignored.
        config["denyallow"] = True
    elif opt in ("-N", "--nntp-server"):
        config["nntpserver"] = arg
    elif opt=="--no-anchor-caching":
        config["anchorcaching"] = False
    elif opt in ("-p", "--password"):
        _password = arg
        constructauth = True
    elif opt in ("-P", "--pause"):
        # NOTE(review): the usage text says this implies -t0; here only
        # the value is stored, threading is not touched -- presumably
        # handled where config["wait"] is evaluated, confirm.
        wait = _int_argument(arg, "'-P, --pause'")
        if wait >= 0:
            config["wait"] = wait
        else:
            _illegal_argument(arg, "'-P, --pause'")
    elif opt=="--profile":
        do_profile = True
    elif opt in ("-q", "--quiet"):
        config["quiet"] = True
    elif opt in ("-r", "--recursion-level"):
        depth = _int_argument(arg, "'-r, --recursion-level'")
        # every negative depth is normalised to -1
        if depth >= 0:
            config["recursionlevel"] = depth
        else:
            config["recursionlevel"] = -1
    # robots.txt is now default, so ignore this option
    elif opt in ("-R", "--robots-txt"):
        pass
    elif opt in ("-s", "--strict"):
        config["strict"] = True
    elif opt in ("-t", "--threads"):
        num = _int_argument(arg, "'-t, --threads'")
        # threading stays off while debugging and for any num below 2
        if num > 1 and not (get_debuglevel() > 0):
            config.enableThreading(num)
        else:
            config.disableThreading()
    elif opt=="--timeout":
        timeout = _int_argument(arg, "'--timeout'")
        if timeout <= 0:
            _illegal_argument(arg, "'--timeout'")
        socket.setdefaulttimeout(timeout)
    elif opt in ("-u", "--user"):
        _user = arg
        constructauth = True
    elif opt in ("-V", "--version"):
        printVersion()
    elif opt in ("-v", "--verbose"):
        config["verbose"] = True
        config["warnings"] = True
    elif opt=="--viewprof":
        viewprof()
    elif opt=="--wischiwaschi":
        from linkcheck import util1
        util1.abbuzze()
        sys.exit(0)
    elif opt in ("-w", "--warnings"):
        config["warnings"] = True
    elif opt in ("-W", "--warning-regex"):
        config["warningregex"] = re.compile(arg)
        config["warnings"] = True
    elif opt in ("-C", "--cookies"):
        config['cookies'] = True
if constructauth:
    # put the command line credentials in front of the configured ones;
    # the pattern matches every URL
    config["authentication"].insert(0, {'pattern': re.compile(".*"),
                                        'user': _user,
                                        'password': _password})
# construct the url list
# In blacklist mode the URLs are taken from the blacklist logger's file
# (if it exists) instead of the command line; each line is one entry.
from linkcheck.log.BlacklistLogger import BlacklistLogger
if config["log"].__class__ == BlacklistLogger and \
   os.path.exists(config['log'].filename):
    # close the file explicitly instead of leaving it to the garbage
    # collector
    blacklist_file = open(config['log'].filename)
    try:
        args = blacklist_file.readlines()
    finally:
        blacklist_file.close()
debug(HURT_ME_PLENTY, "configuration:", pprint.pformat(config))
# interactive input: without any URL either ask for some or warn
if len(args)==0:
    if config['interactive']:
        urls = raw_input(i18n._("enter one or more urls, separated by white-space\n--> "))
        args = urls.split()
    else:
        warn(i18n._("no files or urls given"))
# syntactic sugar: arguments without any ":" that start with "www." or
# "ftp." get the matching URL scheme prepended; everything else
# (including names with a drive letter such as c:\temp\test.html) is
# passed on unchanged
from linkcheck import UrlData
for url in args:
    # also removes the line ends of entries read from the blacklist file
    url = url.strip()
    if ":" not in url:
        if url.startswith("www."):
            url = "http://%s"%url
        elif url.startswith("ftp."):
            url = "ftp://%s"%url
    config.appendUrl(UrlData.GetUrlDataFrom(url, 0, config, cmdline=True))
############################# check the urls ################################
if not do_profile:
    linkcheck.checkUrls(config)
else:
    # --profile: run the very same call under the profiler and store the
    # collected data in the file named by _profile
    import profile
    profile.run("linkcheck.checkUrls(config)", _profile)
#############################################################################
# in interactive mode wait for the user before the program ends
if config['interactive']:
    raw_input(i18n._("Hit RETURN to finish"))