linkchecker/linkchecker
2000-12-14 22:53:48 +00:00

298 lines
10 KiB
Python
Executable file

#!/usr/bin/env python
# Copyright (C) 2000 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# imports and checks
import sys
# Refuse to run on interpreters older than 1.5.2.  The comparison is a
# plain string comparison on the first five characters of sys.version.
if sys.version[:5] < "1.5.2":
    raise SystemExit("This program requires Python 1.5.2 or later.")
import getopt,re,string,os,urlparse
# 90 seconds timeout for all connections
#import timeoutsocket
#timeoutsocket.setDefaultSocketTimeout(90)
import linkcheck
from linkcheck import _,StringUtil
# Command-line usage text, printed first by printHelp().
# The adjacent string literals form one message, which is passed through
# _() (imported from linkcheck; presumably a translation hook -- confirm)
# and then %-formatted: the single %s placeholder in the -o/--output entry
# is filled with linkcheck.Config.LoggerKeys.
Usage = _("USAGE\tlinkchecker [options] file-or-url...\n"
"\n"
"OPTIONS\n"
"For single-letter option arguments the space is not a necessity. So\n"
"'-o colored' is the same as '-ocolored'.\n"
"-a, --anchors\n"
" Check anchor references. Default is don't check anchors.\n"
"-d, --denyallow\n"
" Swap checking order to extern/intern. Default checking order\n"
" is intern/extern.\n"
"-D, --debug\n"
" Print additional debugging information.\n"
"-e regex, --extern=regex\n"
" Assume urls that match the given expression as extern.\n"
" Only intern HTML links are checked recursively.\n"
"-f file, --config=file\n"
" Use file as configuration file. LinkChecker first searches\n"
" ~/.linkcheckerrc and then /etc/linkcheckerrc\n"
" (under Windows <path-to-program>\\linkcheckerrc).\n"
"-F type, --file-output=type\n"
" Same as output, but write to a file linkchecker-out.<type>.\n"
" If the file already exists, it is overwritten. You can specify\n"
" this option more than once. There is no file output for the\n"
" blacklist logger. Default is no file output.\n"
"-i regex, --intern=regex\n"
" Assume URLs that match the given expression as intern.\n"
" LinkChecker descends recursively only to intern URLs, not to extern.\n"
"-h, --help\n"
" Help me! Print usage information for this program.\n"
"-N server, --nntp-server=server\n"
" Specify an NNTP server for 'news:...' links. Default is the\n"
" environment variable NNTP_SERVER. If no host is given,\n"
" only the syntax of the link is checked.\n"
"-o type, --output=type\n"
" Specify output type as %s.\n"
" Default type is text.\n"
"-p pwd, --password=pwd\n"
" Try password pwd for HTML and FTP authorization.\n"
" Default password is 'joe@'. See also -u.\n"
"-q, --quiet\n"
" Quiet operation. This is only useful with -F.\n"
"-r depth, --recursion-level=depth\n"
" Check recursively all links up to given depth (depth >= 0).\n"
" Default depth is 1.\n"
"-R, --robots-txt\n"
" Obey the robots exclusion standard.\n"
"-s, --strict\n"
" Check only syntax of extern links, do not try to connect to them.\n"
"-t num, --threads=num\n"
" Generate no more than num threads. Default number of threads is 5.\n"
" To disable threading specify a non-positive number.\n"
"-u name, --user=name\n"
" Try username name for HTML and FTP authorization.\n"
" Default is 'anonymous'. See also -p.\n"
"-V, --version\n"
" Print version and exit.\n"
"-v, --verbose\n"
" Log all checked URLs (implies -w). Default is to log only invalid\n"
" URLs.\n"
"-w, --warnings\n"
" Log warnings.\n"
"-W regex, --warning-regex=regex\n"
" Define a regular expression which prints a warning if it matches\n"
" any content of the checked link.\n"
" This applies of course only to pages which are valid, so we can\n"
" get their content.\n"
" Use this to check for pages that contain some form of error\n"
" message, for example 'This page has moved' or 'Oracle\n"
" Application Server error'.\n"
" This option implies -w.\n") % linkcheck.Config.LoggerKeys
# Supplementary notes, printed by printHelp() after the usage text.
# The adjacent string literals form one message that is passed through _().
Notes = _("NOTES\n"
"o LinkChecker assumes an http:// resp. ftp:// link when a commandline URL\n"
" starts with 'www.' resp. 'ftp.'\n"
" You can also give local files as arguments.\n"
"o If you have your system configured to automatically establish a\n"
" connection to the internet (e.g. with diald), it will connect when\n"
" checking links not pointing to your local host.\n"
" Use the -s and -i options to prevent this.\n"
"o Javascript links are currently ignored.\n"
"o If your platform does not support threading, LinkChecker uses -t0.\n"
"o You can supply multiple user/password pairs in a configuration file.\n"
"o Cookies are not accepted by LinkChecker.\n"
"o To use proxies set $http_proxy, $https_proxy on Unix or Windows.\n"
" On a Mac use the Internet Config.\n"
"o When checking 'news:' links the given NNTP host doesn't need to be the\n"
" same as the host of the user browsing your pages!\n")
# Example invocations, printed last by printHelp().
# Backslashes are doubled in the literals so that the displayed text
# contains single backslashes (shell line continuation, Windows path).
Examples = _("EXAMPLES\n"
"o linkchecker -v -ohtml -r2 -s -itreasure.calvinsplayground.de \\\n"
" http://treasure.calvinsplayground.de/~calvin/ > sample.html\n"
"o Local files and syntactic sugar on the command line:\n"
" linkchecker c:\\temp\\test.html\n"
" linkchecker ../bla.html\n"
" linkchecker www.myhomepage.de\n"
" linkchecker -r0 ftp.linux.org\n")
def printVersion():
    """Print linkcheck.Config.AppInfo to stdout and exit with status 0."""
    appinfo = linkcheck.Config.AppInfo
    print(appinfo)
    sys.exit(0)
def printHelp():
    """Show the usage text, the notes and the examples, then exit with
    status 0.

    On posix systems the three sections are printed directly; on every
    other platform they are joined with newlines and handed to
    StringUtil.paginate().
    """
    if os.name == 'posix':
        for section in (Usage, Notes, Examples):
            print(section)
    else:
        StringUtil.paginate(Usage+"\n"+Notes+"\n"+Examples)
    sys.exit(0)
def printUsage(msg):
    """Write an error line containing msg, followed by a hint to run
    'linkchecker -h', to stderr and exit with status 1."""
    err = sys.stderr
    err.write(_("Error: %s\n") % msg)
    err.write(_("Execute 'linkchecker -h' for help\n"))
    sys.exit(1)
# Read command line arguments
# On success `options` holds the (option, value) pairs and `args` the
# remaining positional arguments (files or URLs).
try:
    # Note: cut out the name of the script
    options, args = getopt.getopt(
        sys.argv[1:],
        "adDe:f:F:hi:N:o:p:qr:Rst:u:VvwW:",  # short options
        [  # long options
            "anchors",
            "config=",
            "debug",
            "extern=",
            "file-output=",
            "nntp-server=",
            "help",
            "intern=",
            "denyallow",
            "output=",
            "password=",
            "quiet",
            "recursion-level=",
            "wischiwaschi",
            "robots-txt",
            "strict",
            "threads=",
            "user=",
            "version",
            "verbose",
            "warnings",
            "warning-regex=",
        ])
except getopt.error:
    # Only the exception instance is needed for the message.  Taking it
    # from sys.exc_info() avoids binding the exception class to a
    # module-level name "type", which would shadow the builtin.
    printUsage(sys.exc_info()[1])
# apply configuration
# First pass over the parsed options: only the config-file and debug
# switches are handled here, before the configuration files are read.
config = linkcheck.Config.Configuration()
configfiles = []
for opt, arg in options:
    if opt in ("-f", "--config"):
        configfiles.append(arg)
        continue
    if opt in ("-D", "--debug"):
        # debugging sets the global flag and turns threading off
        linkcheck.Config.DebugFlag = 1
        config.disableThreading()
config.read(configfiles)
# apply options and arguments
def _illegalArgument(arg, optnames):
    """Report an unusable option argument through printUsage(), which
    exits.

    arg is the offending value; optnames is the quoted option label that
    is appended to the message, e.g. "'-o, --output'".
    """
    printUsage((_("Illegal argument '%s' for option ") % arg) + optnames)


def _intArgument(arg, optnames):
    """Return int(arg); a non-numeric value becomes a usage error instead
    of an uncaught ValueError traceback."""
    try:
        return int(arg)
    except ValueError:
        _illegalArgument(arg, optnames)


def _regexArgument(arg, optnames):
    """Return re.compile(arg); an invalid expression becomes a usage error
    instead of an uncaught re.error traceback."""
    try:
        return re.compile(arg)
    except re.error:
        _illegalArgument(arg, optnames)


# Second pass over the parsed options.  -f/--config and -D/--debug have no
# branch here; they are consumed by the earlier configuration pass.
_user = "anonymous"
# NOTE(review): the usage text documents the default password as 'joe@',
# while this fallback (used when only -u is given) is "guest@" -- confirm
# which value is intended.
_password = "guest@"
# set as soon as -u or -p is seen; triggers the catch-all entry below
constructauth = 0
for opt, arg in options:
    if opt in ("-a", "--anchors"):
        config["anchors"] = 1
    elif opt in ("-e", "--extern"):
        config["externlinks"].append(
            (_regexArgument(arg, "'-e, --extern'"), 0))
    elif opt in ("-h", "--help"):
        printHelp()
    elif opt in ("-o", "--output"):
        if linkcheck.Config.Loggers.has_key(arg):
            config['log'] = config.newLogger(arg)
        else:
            _illegalArgument(arg, "'-o, --output'")
    elif opt in ("-F", "--file-output"):
        # the blacklist logger has no file output
        if linkcheck.Config.Loggers.has_key(arg) and arg != "blacklist":
            config['fileoutput'].append(
                config.newLogger(arg, {'fileoutput':1}))
        else:
            _illegalArgument(arg, "'-F, --file-output'")
    elif opt in ("-i", "--intern"):
        config["internlinks"].append(_regexArgument(arg, "'-i, --intern'"))
    elif opt in ("-d", "--denyallow"):
        # The short form is -d, as declared in the getopt short-option
        # string and in the usage text.  Testing for "-l" here would never
        # match, leaving -d accepted but without effect.
        config["denyallow"] = 1
    elif opt in ("-N", "--nntp-server"):
        config["nntpserver"] = arg
    elif opt in ("-p", "--password"):
        _password = arg
        constructauth = 1
    elif opt in ("-q", "--quiet"):
        config["quiet"] = 1
    elif opt in ("-r", "--recursion-level"):
        depth = _intArgument(arg, "'-r, --recursion-level'")
        if depth >= 0:
            config["recursionlevel"] = depth
        else:
            _illegalArgument(arg, "'-r, --recursion-level'")
    elif opt in ("-R", "--robots-txt"):
        config["robotstxt"] = 1
    elif opt in ("-s", "--strict"):
        config["strict"] = 1
    elif opt in ("-t", "--threads"):
        num = _intArgument(arg, "'-t, --threads'")
        # the thread count is only applied when config["threads"] is set
        # and debugging is off
        if config["threads"] and not linkcheck.Config.DebugFlag:
            if num > 0:
                config.enableThreading(num)
            else:
                config.disableThreading()
    elif opt in ("-u", "--user"):
        _user = arg
        constructauth = 1
    elif opt in ("-V", "--version"):
        printVersion()
    elif opt in ("-v", "--verbose"):
        # verbose implies warnings
        config["verbose"] = 1
        config["warnings"] = 1
    elif opt == "--wischiwaschi":
        from linkcheck import util1
        util1.abbuzze()
        sys.exit(0)
    elif opt in ("-w", "--warnings"):
        config["warnings"] = 1
    elif opt in ("-W", "--warning-regex"):
        # a warning regex implies warnings
        config["warningregex"] = _regexArgument(arg, "'-W, --warning-regex'")
        config["warnings"] = 1
if constructauth:
    # command-line credentials are inserted first, with a pattern that
    # matches everything
    config["authentication"].insert(0, (re.compile(".*"), _user, _password))
# construct the url list
# if we use blacklist mode, try to read ~/.blacklist
if config["log"].__class__ == linkcheck.Logging.BlacklistLogger and \
   os.path.exists(config['log'].filename):
    # When the blacklist logger's file exists, its lines replace the
    # command-line arguments as the list of URLs to check.
    blacklist = open(config['log'].filename)
    try:
        args = blacklist.readlines()
    finally:
        # close explicitly instead of relying on garbage collection
        blacklist.close()
if not args:
    print(_("warning: no files or urls given"))
for url in args:
    url = string.strip(url)
    if ":" not in url:
        # No scheme given: guess one from the host prefix.  A plain prefix
        # comparison replaces recompiling two regexes on every iteration.
        if url[:4] == "ftp.":
            url = "ftp://"+url
        elif url[:4] == "www.":
            url = "http://"+url
    config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0))
# check the urls
linkcheck.checkUrls(config)