mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-21 16:30:28 +00:00
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@642 e7d03fd6-7b0d-0410-9947-9c21f3af8025
396 lines
13 KiB
Python
Executable file
396 lines
13 KiB
Python
Executable file
#!/usr/bin/python -O
|
|
"""check HTML pages for broken links"""
|
|
# Copyright (C) 2000-2002 Bastian Kleineidam
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program; if not, write to the Free Software
|
|
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
|
|
# imports and checks
|
|
import sys

# Refuse to run on interpreters older than 2.2.1.  Very old interpreters
# have no sys.version_info attribute at all, so its absence is treated
# the same way as a version that is too low.
_required_version = (2, 2, 1, 'final', 0)
if not (hasattr(sys, 'version_info') and sys.version_info >= _required_version):
    raise SystemExit("This program requires Python 2.2.1 or later.")
|
|
|
|
# modules used by the command line front end; linkcheck is the
# application package itself
import getopt, re, os, urlparse, pprint, linkcheck
import linkcheck.timeoutsocket
# set default 30 seconds timeout
# (can be overridden later with the --timeout option)
linkcheck.timeoutsocket.setDefaultSocketTimeout(30)
# import several helper debugging things
# NOTE(review): the star import presumably provides the debug level
# constants used below (BRING_IT_ON, HURT_ME_PLENTY) -- confirm in
# linkcheck.debuglevels
from linkcheck.debuglevels import *
from linkcheck import StringUtil
# short alias for the package debug function
debug = linkcheck.debug
# file name used by --profile (written) and --viewprof (read)
_profile = "linkchecker.prof"
|
|
|
|
# main usage text
|
|
# Main usage text.  The two %s placeholders are filled with the list of
# available logger names and the profiling file name.
# The documented defaults must match the code: the password default is
# "guest@" and the socket timeout default is 30 seconds.
# NOTE(review): the text is passed through linkcheck._(), presumably a
# gettext-style lookup -- translation catalogs keyed on the old text
# need the same two corrections.
Usage = linkcheck._("""USAGE\tlinkchecker [options] file-or-url...

OPTIONS
For single-letter option arguments the space is not a necessity. So
'-o colored' is the same as '-ocolored'.
-a, --anchors
Check HTTP anchor references. Default is don't check anchors.
-C, --cookies
Accept and send HTTP cookies according to RFC 2109. Only cookies
which are sent back to the originating server are accepted.
Sent and accepted cookies are provided as additional logging
information.
-d, --denyallow
Swap checking order to extern/intern. Default checking order
is intern/extern.
-D, --debug
Print debugging information. Provide this option multiple times
for even more debugging information.
-e regex, --extern=regex
Assume urls that match the given expression as extern.
Only intern HTML links are checked recursively.
-f file, --config=file
Use file as configuration file. As default LinkChecker first
searches /etc/linkcheckerrc and then ~/.linkcheckerrc
(under Windows <path-to-program>\\linkcheckerrc).
-F type[/filename], --file-output=type[/filename]
Same as -o, but write to a file linkchecker-out.<type>
or <filename> if specified. If the file already exists, it
is overwritten. You can specify this option more than once.
There is no file output for the blacklist logger. Default is
no file output.
-I, --interactive
Ask for url if none are given on the commandline.
-i regex, --intern=regex
Assume URLs that match the given expression as intern.
LinkChecker descends recursively only to intern URLs, not to extern.
-h, --help
Help me! Print usage information for this program.
-N server, --nntp-server=server
Specify an NNTP server for 'news:...' links. Default is the
environment variable NNTP_SERVER. If no host is given,
only the syntax of the link is checked.
-o type, --output=type
Specify output type as %s.
Default type is text.
-p pwd, --password=pwd
Try password pwd for HTTP and FTP authorization.
Default password is 'guest@'. See also -u.
-P secs, --pause=secs
Pause <secs> seconds between each url check. This option
implies -t0.
Default is no pause between requests.
--profile
Write profiling data into a file named %s in the
current working directory.
See also --viewprof.
-q, --quiet
Quiet operation. This is only useful with -F.
-r depth, --recursion-level=depth
Check recursively all links up to given depth (depth >= 0).
Default depth is 1.
-s, --strict
Check only syntax of extern links, do not try to connect to them.
-t num, --threads=num
Generate no more than num threads. Default number of threads is 5.
To disable threading specify a non-positive number.
--timeout=secs
Set the timeout for TCP connection attempts in seconds. The default
timeout is 30 seconds.
-u name, --user=name
Try username name for HTTP and FTP authorization.
Default is 'anonymous'. See also -p.
-V, --version
Print version and exit.
-v, --verbose
Log all checked URLs (implies -w). Default is to log only invalid
URLs.
--viewprof
Print out previously generated profiling data. See also --profile.
-w, --warnings
Log warnings.
-W regex, --warning-regex=regex
Define a regular expression which prints a warning if it matches
any content of the checked link.
This applies of course only to pages which are valid, so we can
get their content.
Use this to check for pages that contain some form of error
message, for example 'This page has moved' or 'Oracle
Application Server error'.
This option implies -w.
""") % (linkcheck.log.LoggerKeys, _profile)
|
|
|
|
# Additional remarks shown after the usage text by printHelp().
Notes = linkcheck._("""NOTES
o LinkCheckers commandline parser treats "ftp." links like "ftp://ftp."
and "www." links like "http://www.".
You can also give local files as arguments.
o If you have your system configured to automatically establish a
connection to the internet (e.g. with diald), it will connect when
checking links not pointing to your local host.
Use the -s and -i options to prevent this.
o Javascript links are currently ignored.
o If your platform does not support threading, LinkChecker uses -t0.
o You can supply multiple user/password pairs in a configuration file.
o To use proxies set $http_proxy, $https_proxy, $ftp_proxy, $gopher_proxy
on Unix or Windows.
On a Mac use the Internet Config.
o When checking 'news:' links the given NNTP host doesn't need to be the
same as the host of the user browsing your pages!
""")

# Example invocations shown last by printHelp().  The doubled
# backslashes are escapes for single literal backslashes in the output.
Examples = linkcheck._("""EXAMPLES
o linkchecker -v -ohtml -r2 -s -itreasure.calvinsplayground.de \\
http://treasure.calvinsplayground.de/~calvin/ > sample.html
o Local files and syntactic sugar on the command line:
linkchecker c:\\temp\\test.html
linkchecker ../bla.html
linkchecker www.myhomepage.de
linkchecker -r0 ftp.linux.org
""")
|
|
|
|
def printVersion ():
    """write the application info string to stdout and exit with status 0"""
    appinfo = linkcheck.Config.AppInfo
    print(appinfo)
    sys.exit(0)
|
|
|
|
def printHelp ():
    """show usage, notes and examples, then exit with status 0

    On posix systems the three texts are printed one after another;
    on every other platform they are joined with newlines and handed
    to StringUtil.paginate().
    """
    if os.name == 'posix':
        for text in (Usage, Notes, Examples):
            print(text)
    else:
        StringUtil.paginate("\n".join([Usage, Notes, Examples]))
    sys.exit(0)
|
|
|
|
def printUsage (msg):
    """report a command line error on stderr and exit with status 1

    msg is interpolated into the error line with %s; a hint how to get
    the full help text is written after it.
    """
    write = sys.stderr.write
    write(linkcheck._("Error: %s\n") % msg)
    write(linkcheck._("Execute 'linkchecker -h' for help\n"))
    sys.exit(1)
|
|
|
|
|
|
def viewprof ():
    """print statistics from a previously written profile file and exit

    If the profile file does not exist, an explanation is written to
    stderr and the program exits with status 1.  Otherwise the 50
    entries with the highest cumulative time are printed and the
    program exits with status 0.
    """
    if not os.path.exists(_profile):
        # Terminate each message with a newline so the two sentences do
        # not run together on one line.  The newline is appended outside
        # of _() to leave the translatable message texts unchanged.
        sys.stderr.write(
            linkcheck._("Could not find profiling file %s.") % _profile + "\n")
        sys.stderr.write(
            linkcheck._("Please run linkchecker with --profile to generate it.")
            + "\n")
        sys.exit(1)
    # imported here because it is only needed for this option
    import pstats
    stats = pstats.Stats(_profile)
    stats.strip_dirs().sort_stats("cumulative").print_stats(50)
    sys.exit(0)
|
|
|
|
# Read command line arguments
|
|
# single-letter options; a trailing colon marks options taking an argument
_short_options = "adCDe:f:F:hIi:N:o:p:P:qr:Rst:u:VvwW:"
# long options; a trailing equal sign marks options taking an argument
_long_options = [
    "anchors",
    "config=",
    "cookies",
    "debug",
    "extern=",
    "file-output=",
    "nntp-server=",
    "help",
    "interactive",
    "intern=",
    "denyallow",
    "output=",
    "password=",
    "pause=",
    "profile",
    "quiet",
    "recursion-level=",
    "wischiwaschi",
    "robots-txt",
    "strict",
    "threads=",
    "timeout=",
    "user=",
    "version",
    "verbose",
    "viewprof",
    "warnings",
    "warning-regex=",
]
try:
    # sys.argv[0] is the script name and not an option
    options, args = getopt.getopt(sys.argv[1:], _short_options, _long_options)
except getopt.error:
    # hand the exception instance to printUsage, which reports it and exits
    printUsage(sys.exc_info()[1])
|
|
|
|
# raise the debug level before anything else happens; every -D/--debug
# on the command line adds one level
for opt, arg in options:
    if opt in ("-D", "--debug"):
        linkcheck.Config.DebugLevel += 1
debug(BRING_IT_ON, "Python", sys.version, "on", sys.platform)

# create the configuration and let it read all configuration files
# named with -f/--config, in command line order
config = linkcheck.Config.Configuration()
configfiles = [arg for opt, arg in options if opt in ("-f", "--config")]
config.read(configfiles)

# no threads while debugging
if linkcheck.Config.DebugLevel > 0:
    config.disableThreading()
|
|
# apply commandline options and arguments

def _illegalArgument (arg, optname):
    """exit with a usage error naming the offending argument and option"""
    printUsage((linkcheck._("Illegal argument '%s' for option ") % arg) +
               optname)


def _intArgument (arg, optname, minimum=None):
    """return arg converted to an integer or exit with a usage error

    arg     -- the raw option argument string
    optname -- quoted option names appended to the error message
    minimum -- smallest accepted value, or None for no lower bound
    """
    try:
        value = int(arg)
    except ValueError:
        # a non-numeric argument is a usage error, not a traceback
        _illegalArgument(arg, optname)
    if minimum is not None and value < minimum:
        _illegalArgument(arg, optname)
    return value


# defaults for the authentication entry built from -u and -p
_user = "anonymous"
_password = "guest@"
# set when -u or -p was given, so an authentication entry is needed
constructauth = 0
# set by --profile, evaluated when the check is started
do_profile = 0
for opt, arg in options:
    if opt in ("-a", "--anchors"):
        config["anchors"] = 1

    elif opt in ("-e", "--extern"):
        config["externlinks"].append(linkcheck.getLinkPat(arg))

    elif opt in ("-h", "--help"):
        printHelp()

    elif opt in ("-o", "--output"):
        if linkcheck.log.Loggers.has_key(arg):
            config['log'] = config.newLogger(arg)
        else:
            _illegalArgument(arg, "'-o, --output'")

    elif opt in ("-F", "--file-output"):
        ns = {'fileoutput': 1}
        # The argument has the form type[/filename].  Without a slash, or
        # with an empty filename part, the whole argument is taken as the
        # logger type.
        try:
            logtype, ns['filename'] = arg.split('/', 1)
            if not ns['filename']:
                raise ValueError
        except ValueError:
            logtype = arg
        if linkcheck.log.Loggers.has_key(logtype) and logtype != "blacklist":
            config['fileoutput'].append(config.newLogger(logtype, ns))
        else:
            _illegalArgument(arg, "'-F, --file-output'")

    elif opt in ("-I", "--interactive"):
        config['interactive'] = 1

    elif opt in ("-i", "--intern"):
        config["internlinks"].append(linkcheck.getLinkPat(arg))

    # the short option is -d, as registered with getopt and documented
    # in the usage text
    elif opt in ("-d", "--denyallow"):
        config["denyallow"] = 1

    elif opt in ("-N", "--nntp-server"):
        config["nntpserver"] = arg

    elif opt in ("-p", "--password"):
        _password = arg
        constructauth = 1

    elif opt in ("-P", "--pause"):
        config["wait"] = _intArgument(arg, "'-P, --pause'", 0)

    elif opt == "--profile":
        do_profile = 1

    elif opt in ("-q", "--quiet"):
        config["quiet"] = 1

    elif opt in ("-r", "--recursion-level"):
        config["recursionlevel"] = _intArgument(arg,
                                                "'-r, --recursion-level'", 0)

    # robots.txt is now default, so ignore this option
    elif opt in ("-R", "--robots-txt"):
        pass

    elif opt in ("-s", "--strict"):
        config["strict"] = 1

    elif opt in ("-t", "--threads"):
        num = _intArgument(arg, "'-t, --threads'")
        # leave threading alone if it is already disabled or if we debug
        if config["threads"] and not linkcheck.Config.DebugLevel:
            if num > 1:
                config.enableThreading(num)
            else:
                config.disableThreading()

    elif opt == "--timeout":
        linkcheck.timeoutsocket.setDefaultSocketTimeout(
            _intArgument(arg, "'--timeout'"))

    elif opt in ("-u", "--user"):
        _user = arg
        constructauth = 1

    elif opt in ("-V", "--version"):
        printVersion()

    elif opt in ("-v", "--verbose"):
        config["verbose"] = 1
        config["warnings"] = 1

    elif opt == "--viewprof":
        viewprof()

    # undocumented option
    elif opt == "--wischiwaschi":
        from linkcheck import util1
        util1.abbuzze()
        sys.exit(0)

    elif opt in ("-w", "--warnings"):
        config["warnings"] = 1

    elif opt in ("-W", "--warning-regex"):
        config["warningregex"] = re.compile(arg)
        config["warnings"] = 1

    elif opt in ("-C", "--cookies"):
        config['cookies'] = 1

# credentials from the command line match every url and are put in
# front of the existing authentication entries
if constructauth:
    config["authentication"].insert(0, {'pattern': re.compile(".*"),
                                        'user': _user,
                                        'password': _password})
|
|
|
|
# construct the url list
# if we use blacklist mode, try to read the blacklist file of the logger;
# its lines then replace the urls given on the command line
from linkcheck.log.BlacklistLogger import BlacklistLogger
if config["log"].__class__ == BlacklistLogger and \
   os.path.exists(config['log'].filename):
    # close the file explicitly instead of relying on garbage collection
    blacklist = open(config['log'].filename)
    try:
        args = blacklist.readlines()
    finally:
        blacklist.close()

debug(HURT_ME_PLENTY, "configuration:", pprint.pformat(config.data))

# interactive input: without any url either ask for some or warn
if len(args)==0:
    if config['interactive']:
        urls = raw_input(linkcheck._("enter one or more urls, separated by white-space\n--> "))
        args = urls.split()
    else:
        config.warn(linkcheck._("no files or urls given"))
|
|
|
|
# syntactic sugar: arguments without a colon that start with "ftp." or
# "www." get the matching scheme prepended before they are queued
_shortcuts = (("ftp.", "ftp://"), ("www.", "http://"))
from linkcheck import UrlData
for url in args:
    url = url.strip()
    if ":" not in url:
        for prefix, scheme in _shortcuts:
            if url.startswith(prefix):
                url = scheme + url
                break
    config.appendUrl(UrlData.GetUrlDataFrom(url, 0, config))
|
|
|
|
############################# check the urls ################################
if not do_profile:
    linkcheck.checkUrls(config)
else:
    # run the very same call under the profiler and store the collected
    # data in the profile file
    import profile
    profile.run("linkcheck.checkUrls(config)", _profile)
#############################################################################

# interactive input end: wait for the user before the program finishes
if config['interactive']:
    raw_input(linkcheck._("Hit RETURN to finish"))
|