linkchecker/linkchecker
2000-03-30 17:10:35 +00:00

274 lines
8.4 KiB
Python
Executable file

#!/usr/bin/env python
import sys
# Refuse to start on interpreters older than the minimum supported release.
# The first five characters of sys.version are compared as a string
# against "1.5.2".
if sys.version[:5] < "1.5.2":
    print("This program requires Python 1.5.2 or later.")
    raise SystemExit(1)
# Make the linkcheck module importable when it has not been installed with
# distutils; this directory has to match the local source checkout.
sys.path.append("/home/calvin/projects/linkchecker")
import getopt,re,string,linkcheck
# Option reference printed by printHelp().  The value of
# linkcheck.Config.LoggerKeys is spliced into the description of -o when
# this module is loaded.  Options that take an argument are shown with a
# placeholder so the text agrees with the getopt specification used below.
Usage = """USAGE\tlinkchecker [options] file_or_url...
OPTIONS
-a, --anchors
Check anchor references. Default is don't check anchors.
-D, --debug
Print additional debugging information.
-e regex, --extern=regex
Assume urls that match the given expression as extern.
Only intern HTTP links are checked recursively.
-f file, --config=file
Use file as configuration file. LinkChecker first searches
~/.linkcheckerrc and then /etc/linkcheckerrc
(under Windows <path-to-program>\\linkcheckerrc).
-F name, --file-output=name
Same as output, but write to a file linkchecker-out.<name>.
If the file already exists, it is overwritten.
You can specify this option more than once.
Default is no file output.
-i regex, --intern=regex
Assume urls that match the given expression as intern.
-h, --help
Help me! Print usage information for this program.
-l, --allowdeny
Swap checking order to intern/extern. Default checking order
is extern/intern.
-N server, --nntp-server=server
Specify an NNTP server for news: links. Default is the
environment variable NNTP_SERVER. If the variable is not defined,
only the syntax of the link is checked.
-o name, --output=name
Specify output as """+linkcheck.Config.LoggerKeys+""".
Default is text.
-p pwd, --password=pwd
Try given password for HTTP and FTP authorization.
Default is 'guest@'. See -u.
-P host[:port], --proxy=host[:port]
Use specified proxy for HTTP requests.
Standard port is 8080. Default is to use no proxy.
-q, --quiet
Quiet operation. This is only useful with -F.
-r depth, --recursion-level=depth
Check recursively all links up to given depth (depth >= 0).
Default depth is 1.
-R, --robots-txt
Obey the robots exclusion standard.
-s, --strict
Check only syntax of extern links, do not try to connect to them.
-t num, --threads=num
Generate no more than num threads. Default number of threads is 5.
To disable threading specify a non-positive number.
-u name, --user=name
Try given username for HTTP and FTP authorization.
Default is 'anonymous'. See -p.
-V, --version
Print version and exit.
-v, --verbose
Log all checked URLs (implies -w). Default is to log only invalid
URLs.
-w, --warnings
Log warnings.
-W regex, --warning-regex=regex
Define a regular expression which prints a warning if it matches
any content of the checked link.
This applies of course only to pages which are valid, so we can
get their content.
You can use this to check for pages that contain some form of
error message, for example "This page has moved" or
"Oracle Application Server error".
This option implies -w.
"""
# General remarks about LinkChecker's behaviour; printHelp() prints this
# text after the option reference.
Notes = """NOTES
o LinkChecker assumes an http:// resp. ftp:// link when a commandline URL
starts with "www." resp. "ftp.".
You can also give local files as arguments.
o If you have your system configured to automatically establish a
connection to the internet (e.g. with diald), it will connect when
checking links not pointing to your local host.
Use the -s and -i options to prevent this (see EXAMPLES).
o Javascript links are currently ignored
o If your platform does not support threading, linkchecker assumes -t0
o You can supply multiple user/password pairs in a configuration file
o Cookies are not accepted by LinkChecker.
"""
# Sample command lines; printHelp() prints this text last.
Examples = """EXAMPLES
o linkchecker -v -o html -r2 -s -i treasure.calvinsplayground.de \\
http://treasure.calvinsplayground.de/~calvin/ > sample.html
o Local files and syntactic sugar on the command line:
linkchecker c:\\temp\\test.html
linkchecker ../bla.html
linkchecker www.myhomepage.de
linkchecker -r0 ftp.linux.org
"""
def printVersion():
    """Print linkcheck.Config.AppInfo to standard output and exit with status 0."""
    print(linkcheck.Config.AppInfo)
    raise SystemExit(0)
def printHelp():
    """Print the Usage, Notes and Examples texts, in that order, and exit with status 0."""
    for section in (Usage, Notes, Examples):
        print(section)
    sys.exit(0)
def printUsage(msg):
    """Report a command line error on standard error and exit with status 1.

    msg may be any object; it is converted with str() before being
    embedded in the error text, which ends with a hint to run
    "linkchecker -h".
    """
    text = "Error: "+str(msg)+"\nType linkchecker -h for help\n"
    sys.stderr.write(text)
    raise SystemExit(1)
# Read command line arguments
# In the short option string a trailing ":" marks an option that takes an
# argument; in the long option list a trailing "=" does the same.
_shortopts = "aDe:f:F:hi:lN:P:o:p:qr:Rst:u:VvwW:"
_longopts = [
    "anchors",
    "config=",
    "debug",
    "extern=",
    "file-output=",
    "nntp-server=",
    "help",
    "intern=",
    "allowdeny",
    "output=",
    "proxy=",
    "password=",
    "quiet",
    "recursion-level=",
    "robots-txt",
    "strict",
    "threads=",
    "user=",
    "version",
    "verbose",
    "warnings",
    "warning-regex=",
]
try:
    # Note: cut out the name of the script
    options, args = getopt.getopt(sys.argv[1:], _shortopts, _longopts)
except getopt.error:
    # Pass the exception value to printUsage(), which reports it on
    # standard error and terminates the program.
    printUsage(sys.exc_info()[1])
# apply configuration
config = linkcheck.Config.Configuration()
# Gather every file named with -f/--config, in command line order.
configfiles = []
for opt, arg in options:
    if opt in ("-f", "--config"):
        configfiles.append(arg)
try:
    config.read(configfiles)
except:
    # Whatever goes wrong while reading is reported as a usage error;
    # printUsage() terminates the program.
    printUsage(sys.exc_info()[1])
# apply options and arguments
def _parseInt(arg, what):
    """Return arg converted to an integer.

    If arg is not a valid integer literal, a usage error naming `what` is
    reported through printUsage(), which terminates the program, instead
    of letting the ValueError escape as a traceback.
    """
    try:
        return int(arg)
    except ValueError:
        printUsage("Illegal "+what+" number: "+arg)

# Credentials used when -u and/or -p is given; each option overrides one
# of these defaults.
_user = "anonymous"
_password = "guest@"
# Set as soon as -u or -p is seen, so that an authentication entry is
# only added on request.
constructauth = 0
for opt,arg in options:
    if opt=="-a" or opt=="--anchors":
        config["anchors"] = 1
    elif opt=="-D" or opt=="--debug":
        # Debug mode also turns threading off.
        linkcheck.Config.DebugFlag = 1
        config.disableThreading()
    elif opt=="-e" or opt=="--extern":
        config["externlinks"].append((re.compile(arg), 0))
    elif opt=="-h" or opt=="--help":
        printHelp()
    elif opt=="-o" or opt=="--output":
        if linkcheck.Config.Loggers.has_key(arg):
            config["log"] = linkcheck.Config.Loggers[arg]()
        else:
            printUsage("Legal output arguments are "+linkcheck.Config.LoggerKeys+".")
    elif opt=="-F" or opt=="--file-output":
        if linkcheck.Config.Loggers.has_key(arg):
            # The logger is handed a freshly opened (truncated) output file
            # named after the logger key.
            config["fileoutput"].append(linkcheck.Config.Loggers[arg](open("linkchecker-out."+arg, "w")))
        else:
            printUsage("Legal output arguments are "+linkcheck.Config.LoggerKeys+".")
    elif opt=="-i" or opt=="--intern":
        config["internlinks"].append(re.compile(arg))
    elif opt=="-l" or opt=="--allowdeny":
        config["allowdeny"] = 1
    elif opt=="-N" or opt=="--nntp-server":
        config["nntpserver"] = arg
    elif opt=="-P" or opt=="--proxy":
        # Accept either "host" or "host:port".
        proxy = re.compile("(.+):(.+)").match(arg)
        if proxy:
            config["proxy"] = proxy.group(1)
            config["proxyport"] = _parseInt(proxy.group(2), "proxy port")
        else:
            config["proxy"] = arg
    elif opt=="-p" or opt=="--password":
        _password = arg
        constructauth = 1
    elif opt=="-q" or opt=="--quiet":
        config["quiet"] = 1
    elif opt=="-r" or opt=="--recursion-level":
        depth = _parseInt(arg, "recursion-level")
        if depth >= 0:
            config["recursionlevel"] = depth
        else:
            printUsage("Illegal recursion-level number: "+arg)
    elif opt=="-R" or opt=="--robots-txt":
        config["robotstxt"] = 1
    elif opt=="-s" or opt=="--strict":
        config["strict"] = 1
    elif opt=="-t" or opt=="--threads":
        num = _parseInt(arg, "threads")
        # The thread count is only changed while config["threads"] is
        # true and debug mode is off.
        if config["threads"] and not linkcheck.Config.DebugFlag:
            if num>0:
                config.enableThreading(num)
            else:
                config.disableThreading()
    elif opt=="-u" or opt=="--user":
        _user = arg
        constructauth = 1
    elif opt=="-V" or opt=="--version":
        printVersion()
    elif opt=="-v" or opt=="--verbose":
        # Verbose output implies warnings.
        config["verbose"] = 1
        config["warnings"] = 1
    elif opt=="-w" or opt=="--warnings":
        config["warnings"] = 1
    elif opt=="-W" or opt=="--warning-regex":
        # A warning regex implies warnings.
        config["warningregex"] = re.compile(arg)
        config["warnings"] = 1
if constructauth:
    # Put a catch-all entry (pattern ".*") with the command line
    # credentials at the front of the authentication list.
    config["authentication"].insert(0, (re.compile(".*"), _user, _password))
# At least one file or URL is required.
if not args:
    printUsage("no files or urls given")
# Patterns for arguments given without a scheme that start with "ftp." or
# "www."; such arguments get an ftp:// or http:// prefix respectively.
_ftp_prefix = re.compile("^ftp\.")
_www_prefix = re.compile("^www\.")
for url in args:
    if ":" not in url:
        if _ftp_prefix.match(url):
            url = "ftp://"+url
        elif _www_prefix.match(url):
            url = "http://"+url
    config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0))
# Every argument has been queued; start the check.
linkcheck.checkUrls(config)