mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-25 10:20:23 +00:00
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@52 e7d03fd6-7b0d-0410-9947-9c21f3af8025
274 lines
8.4 KiB
Python
Executable file
274 lines
8.4 KiB
Python
Executable file
#!/usr/bin/env python
# Start-up section: refuse to run on interpreters that are too old, make the
# linkcheck package importable, then pull in the modules used below.

import sys

# The first five characters of sys.version are compared as a plain string
# against "1.5.2".
# NOTE(review): this is a lexicographic comparison, not a numeric one.
if not (sys.version[:5] >= "1.5.2"):
    print("This program requires Python 1.5.2 or later.")
    sys.exit(1)

# add the path to linkcheck module if you do not install with distutils
sys.path.append("/home/calvin/projects/linkchecker")

import getopt
import re
import string
import linkcheck
|
|
|
|
# Option summary shown by printHelp().  The list of legal logger names is
# spliced in from linkcheck.Config.LoggerKeys so that it cannot drift from
# the loggers that are actually registered.  Every option that takes an
# argument in the getopt specification shows a placeholder for it here.
Usage = """USAGE\tlinkchecker [options] file_or_url...

OPTIONS
-a, --anchors
Check anchor references. Default is don't check anchors.
-D, --debug
Print additional debugging information.
-e regex, --extern=regex
Assume urls that match the given expression as extern.
Only intern HTTP links are checked recursively.
-f file, --config=file
Use file as configuration file. LinkChecker first searches
~/.linkcheckerrc and then /etc/linkcheckerrc
(under Windows <path-to-program>\\linkcheckerrc).
-F name, --file-output=name
Same as output, but write to a file linkchecker-out.<name>.
If the file already exists, it is overwritten.
You can specify this option more than once.
Default is no file output.
-i regex, --intern=regex
Assume urls that match the given expression as intern.
-h, --help
Help me! Print usage information for this program.
-l, --allowdeny
Swap checking order to intern/extern. Default checking order
is extern/intern.
-N server, --nntp-server=server
Specify an NNTP server for news: links. Default is the
environment variable NNTP_SERVER. If the variable is not defined,
only the syntax of the link is checked.
-o name, --output=name
Specify output as """+linkcheck.Config.LoggerKeys+""".
Default is text.
-p pwd, --password=pwd
Try given password for HTTP and FTP authorization.
Default is 'guest@'. See -u.
-P host[:port], --proxy=host[:port]
Use specified proxy for HTTP requests.
Standard port is 8080. Default is to use no proxy.
-q, --quiet
Quiet operation. This is only useful with -F.
-r depth, --recursion-level=depth
Check recursively all links up to given depth (depth >= 0).
Default depth is 1.
-R, --robots-txt
Obey the robots exclusion standard.
-s, --strict
Check only syntax of extern links, do not try to connect to them.
-t num, --threads=num
Generate no more than num threads. Default number of threads is 5.
To disable threading specify a non-positive number.
-u name, --user=name
Try given username for HTTP and FTP authorization.
Default is 'anonymous'. See -p.
-V, --version
Print version and exit.
-v, --verbose
Log all checked URLs (implies -w). Default is to log only invalid
URLs.
-w, --warnings
Log warnings.
-W regex, --warning-regex=regex
Define a regular expression which prints a warning if it matches
any content of the checked link.
This applies of course only to pages which are valid, so we can
get their content.
You can use this to check for pages that contain some form of
error message, for example "This page has moved" or
"Oracle Application Server error".
This option implies -w.
"""
|
|
|
|
# General remarks printed by printHelp() after the option summary.
Notes = """NOTES
o LinkChecker assumes an http:// resp. ftp:// link when a commandline URL
starts with "www." resp. "ftp.".
You can also give local files as arguments.
o If you have your system configured to automatically establish a
connection to the internet (e.g. with diald), it will connect when
checking links not pointing to your local host.
Use the -s and -i options to prevent this (see EXAMPLES).
o Javascript links are currently ignored
o If your platform does not support threading, linkchecker assumes -t0
o You can supply multiple user/password pairs in a configuration file
o Cookies are not accepted by LinkChecker.
"""
|
|
|
|
# Sample invocations printed by printHelp() after the notes.
Examples = """EXAMPLES
o linkchecker -v -o html -r2 -s -i treasure.calvinsplayground.de \\
http://treasure.calvinsplayground.de/~calvin/ > sample.html
o Local files and syntactic sugar on the command line:
linkchecker c:\\temp\\test.html
linkchecker ../bla.html
linkchecker www.myhomepage.de
linkchecker -r0 ftp.linux.org
"""
|
|
|
|
def printVersion():
    """Print linkcheck.Config.AppInfo to stdout and exit with status 0."""
    app_info = linkcheck.Config.AppInfo
    print(app_info)
    sys.exit(0)
|
|
|
|
def printHelp():
    """Print the Usage, Notes and Examples texts, then exit with status 0."""
    for section in (Usage, Notes, Examples):
        print(section)
    sys.exit(0)
|
|
|
|
def printUsage(msg):
    """Report a command line error on stderr and exit with status 1.

    msg may be any object; it is converted with str() before being
    embedded in the error line.
    """
    report = "Error: " + str(msg) + "\nType linkchecker -h for help\n"
    sys.stderr.write(report)
    sys.exit(1)
|
|
|
|
|
|
# Read command line arguments
# getopt splits the arguments into (option, value) pairs and the remaining
# positional arguments (the files/urls to check).
try:
    # Note: cut out the name of the script
    options, args = getopt.getopt(
        sys.argv[1:],
        "aDe:f:F:hi:lN:P:o:p:qr:Rst:u:VvwW:",  # short options
        [  # long options; a trailing "=" marks options that take a value
            "anchors",
            "config=",
            "debug",
            "extern=",
            "file-output=",
            "nntp-server=",
            "help",
            "intern=",
            "allowdeny",
            "output=",
            "proxy=",
            "password=",
            "quiet",
            "recursion-level=",
            "robots-txt",
            "strict",
            "threads=",
            "user=",
            "version",
            "verbose",
            "warnings",
            "warning-regex=",
        ])
except getopt.error:
    # The exception value is handed to printUsage, which reports it on
    # stderr and exits.
    printUsage(sys.exc_info()[1])
|
|
|
|
# apply configuration
config = linkcheck.Config.Configuration()

# Collect every file named with -f/--config, in command line order.
configfiles = []
for opt, arg in options:
    if opt in ("-f", "--config"):
        configfiles.append(arg)

# Only the read itself is guarded: any failure while loading the files is
# reported as a usage error.
try:
    config.read(configfiles)
except (KeyboardInterrupt, SystemExit):
    # An interrupt or an explicit exit is not a configuration problem;
    # let it propagate instead of masking it as a usage error.
    raise
except:
    printUsage(sys.exc_info()[1])
|
|
|
|
|
|
# apply options and arguments
# Each parsed option is translated into a setting on `config`.  The
# -f/--config option is absent from the chain on purpose: it was already
# consumed when the configuration files were read.
_user = "anonymous"
_password = "guest@"
# Set as soon as -u or -p is seen; only then a catch-all authentication
# entry is installed after the loop.
constructauth = 0
for opt, arg in options:
    if opt in ("-a", "--anchors"):
        config["anchors"] = 1

    elif opt in ("-D", "--debug"):
        linkcheck.Config.DebugFlag = 1
        config.disableThreading()

    elif opt in ("-e", "--extern"):
        config["externlinks"].append((re.compile(arg), 0))

    elif opt in ("-h", "--help"):
        printHelp()

    elif opt in ("-o", "--output"):
        if linkcheck.Config.Loggers.has_key(arg):
            config["log"] = linkcheck.Config.Loggers[arg]()
        else:
            printUsage("Legal output arguments are "+linkcheck.Config.LoggerKeys+".")

    elif opt in ("-F", "--file-output"):
        if linkcheck.Config.Loggers.has_key(arg):
            # The logger writes to linkchecker-out.<name>; an existing file
            # is truncated by the "w" mode.
            config["fileoutput"].append(linkcheck.Config.Loggers[arg](open("linkchecker-out."+arg, "w")))
        else:
            printUsage("Legal output arguments are "+linkcheck.Config.LoggerKeys+".")

    elif opt in ("-i", "--intern"):
        config["internlinks"].append(re.compile(arg))

    elif opt in ("-l", "--allowdeny"):
        config["allowdeny"] = 1

    elif opt in ("-N", "--nntp-server"):
        config["nntpserver"] = arg

    elif opt in ("-P", "--proxy"):
        # Accept either "host" or "host:port".
        proxy = re.compile("(.+):(.+)").match(arg)
        if proxy:
            try:
                port = int(proxy.group(2))
            except ValueError:
                # A non-numeric port is a usage error, not a traceback.
                printUsage("Illegal proxy port number: "+proxy.group(2))
            config["proxy"] = proxy.group(1)
            config["proxyport"] = port
        else:
            config["proxy"] = arg

    elif opt in ("-p", "--password"):
        _password = arg
        constructauth = 1

    elif opt in ("-q", "--quiet"):
        config["quiet"] = 1

    elif opt in ("-r", "--recursion-level"):
        try:
            depth = int(arg)
        except ValueError:
            # Treat a non-numeric value like a negative one so that both
            # end in the same usage error below.
            depth = -1
        if depth >= 0:
            config["recursionlevel"] = depth
        else:
            printUsage("Illegal recursion-level number: "+arg)

    elif opt in ("-R", "--robots-txt"):
        config["robotstxt"] = 1

    elif opt in ("-s", "--strict"):
        config["strict"] = 1

    elif opt in ("-t", "--threads"):
        try:
            num = int(arg)
        except ValueError:
            printUsage("Illegal threads number: "+arg)
        # The thread count is only touched while threading is still enabled
        # in the configuration and debugging has not been requested.
        if config["threads"] and not linkcheck.Config.DebugFlag:
            if num > 0:
                config.enableThreading(num)
            else:
                config.disableThreading()

    elif opt in ("-u", "--user"):
        _user = arg
        constructauth = 1

    elif opt in ("-V", "--version"):
        printVersion()

    elif opt in ("-v", "--verbose"):
        config["verbose"] = 1
        config["warnings"] = 1

    elif opt in ("-w", "--warnings"):
        config["warnings"] = 1

    elif opt in ("-W", "--warning-regex"):
        config["warningregex"] = re.compile(arg)
        config["warnings"] = 1

if constructauth:
    # Put the command line credentials first, with a pattern that matches
    # every URL.
    config["authentication"].insert(0, (re.compile(".*"), _user, _password))
|
|
|
|
# At least one file or url is required.
if not args:
    printUsage("no files or urls given")

# Queue every command line argument for checking.
for url in args:
    # Arguments without a ":" get a scheme guessed from a leading "ftp."
    # or "www."; anything else is passed through unchanged.
    if ":" not in url:
        if re.match(r"^ftp\.", url):
            url = "ftp://" + url
        elif re.match(r"^www\.", url):
            url = "http://" + url
    url_data = linkcheck.UrlData.GetUrlDataFrom(url, 0)
    config.appendUrl(url_data)

# Run the check over everything that was queued.
linkcheck.checkUrls(config)
|