mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-22 17:00:25 +00:00
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@209 e7d03fd6-7b0d-0410-9947-9c21f3af8025
298 lines
10 KiB
Python
Executable file
298 lines
10 KiB
Python
Executable file
#!/usr/bin/env python
|
|
# Copyright (C) 2000 Bastian Kleineidam
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program; if not, write to the Free Software
|
|
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
|
|
# imports and checks
|
|
import sys
|
|
# Refuse to run on interpreters older than the minimum supported release.
# NOTE: this is a plain string comparison on the first five characters of
# sys.version, not a numeric version comparison.
if not (sys.version[:5] >= "1.5.2"):
    raise SystemExit("This program requires Python 1.5.2 or later.")
|
|
import getopt,re,string,os,urlparse
|
|
# 90 seconds timeout for all connections
|
|
#import timeoutsocket
|
|
#timeoutsocket.setDefaultSocketTimeout(90)
|
|
import linkcheck
|
|
from linkcheck import _,StringUtil
|
|
|
|
|
|
# Command line reference printed by printHelp() ('linkchecker -h').
# The whole text is passed to _() as ONE message, so every character of it
# is part of the translation key.  The single %s placeholder (under -o) is
# filled with linkcheck.Config.LoggerKeys after the _() call.
Usage = _(
    "USAGE\tlinkchecker [options] file-or-url...\n"
    "\n"
    "OPTIONS\n"
    "For single-letter option arguments the space is not a necessity. So\n"
    "'-o colored' is the same as '-ocolored'.\n"
    "-a, --anchors\n"
    " Check anchor references. Default is don't check anchors.\n"
    "-d, --denyallow\n"
    " Swap checking order to extern/intern. Default checking order\n"
    " is intern/extern.\n"
    "-D, --debug\n"
    " Print additional debugging information.\n"
    "-e regex, --extern=regex\n"
    " Assume urls that match the given expression as extern.\n"
    " Only intern HTML links are checked recursively.\n"
    "-f file, --config=file\n"
    " Use file as configuration file. LinkChecker first searches\n"
    " ~/.linkcheckerrc and then /etc/linkcheckerrc\n"
    " (under Windows <path-to-program>\\linkcheckerrc).\n"
    "-F type, --file-output=type\n"
    " Same as output, but write to a file linkchecker-out.<type>.\n"
    " If the file already exists, it is overwritten. You can specify\n"
    " this option more than once. There is no file output for the\n"
    " blacklist logger. Default is no file output.\n"
    "-i regex, --intern=regex\n"
    " Assume URLs that match the given expression as intern.\n"
    " LinkChecker descends recursively only to intern URLs, not to extern.\n"
    "-h, --help\n"
    " Help me! Print usage information for this program.\n"
    "-N server, --nntp-server=server\n"
    " Specify an NNTP server for 'news:...' links. Default is the\n"
    " environment variable NNTP_SERVER. If no host is given,\n"
    " only the syntax of the link is checked.\n"
    "-o type, --output=type\n"
    " Specify output type as %s.\n"
    " Default type is text.\n"
    "-p pwd, --password=pwd\n"
    " Try password pwd for HTML and FTP authorization.\n"
    " Default password is 'joe@'. See also -u.\n"
    "-q, --quiet\n"
    " Quiet operation. This is only useful with -F.\n"
    "-r depth, --recursion-level=depth\n"
    " Check recursively all links up to given depth (depth >= 0).\n"
    " Default depth is 1.\n"
    "-R, --robots-txt\n"
    " Obey the robots exclusion standard.\n"
    "-s, --strict\n"
    " Check only syntax of extern links, do not try to connect to them.\n"
    "-t num, --threads=num\n"
    " Generate no more than num threads. Default number of threads is 5.\n"
    " To disable threading specify a non-positive number.\n"
    "-u name, --user=name\n"
    " Try username name for HTML and FTP authorization.\n"
    " Default is 'anonymous'. See also -p.\n"
    "-V, --version\n"
    " Print version and exit.\n"
    "-v, --verbose\n"
    " Log all checked URLs (implies -w). Default is to log only invalid\n"
    " URLs.\n"
    "-w, --warnings\n"
    " Log warnings.\n"
    "-W regex, --warning-regex=regex\n"
    " Define a regular expression which prints a warning if it matches\n"
    " any content of the checked link.\n"
    " This applies of course only to pages which are valid, so we can\n"
    " get their content.\n"
    " Use this to check for pages that contain some form of error\n"
    " message, for example 'This page has moved' or 'Oracle\n"
    " Application Server error'.\n"
    " This option implies -w.\n"
) % linkcheck.Config.LoggerKeys
|
|
|
|
# General remarks printed after the option reference by printHelp().
# Passed to _() as one message, so the text doubles as the translation key.
Notes = _(
    "NOTES\n"
    "o LinkChecker assumes an http:// resp. ftp:// link when a commandline URL\n"
    " starts with 'www.' resp. 'ftp.'\n"
    " You can also give local files as arguments.\n"
    "o If you have your system configured to automatically establish a\n"
    " connection to the internet (e.g. with diald), it will connect when\n"
    " checking links not pointing to your local host.\n"
    " Use the -s and -i options to prevent this.\n"
    "o Javascript links are currently ignored.\n"
    "o If your platform does not support threading, LinkChecker uses -t0.\n"
    "o You can supply multiple user/password pairs in a configuration file.\n"
    "o Cookies are not accepted by LinkChecker.\n"
    "o To use proxies set $http_proxy, $https_proxy on Unix or Windows.\n"
    " On a Mac use the Internet Config.\n"
    "o When checking 'news:' links the given NNTP host doesn't need to be the\n"
    " same as the host of the user browsing your pages!\n"
)
|
|
|
|
# Sample invocations printed last by printHelp().
# Passed to _() as one message, so the text doubles as the translation key.
Examples = _(
    "EXAMPLES\n"
    "o linkchecker -v -ohtml -r2 -s -itreasure.calvinsplayground.de \\\n"
    " http://treasure.calvinsplayground.de/~calvin/ > sample.html\n"
    "o Local files and syntactic sugar on the command line:\n"
    " linkchecker c:\\temp\\test.html\n"
    " linkchecker ../bla.html\n"
    " linkchecker www.myhomepage.de\n"
    " linkchecker -r0 ftp.linux.org\n"
)
|
|
|
|
def printVersion():
|
|
print linkcheck.Config.AppInfo
|
|
sys.exit(0)
|
|
|
|
def printHelp():
|
|
if os.name!='posix':
|
|
StringUtil.paginate(Usage+"\n"+Notes+"\n"+Examples)
|
|
else:
|
|
print Usage
|
|
print Notes
|
|
print Examples
|
|
sys.exit(0)
|
|
|
|
def printUsage(msg):
    """Report a command line error on stderr and exit with status 1.

    msg is interpolated into the translated "Error: %s" line, which is
    followed by a hint to run 'linkchecker -h'.
    """
    write = sys.stderr.write
    write(_("Error: %s\n") % msg)
    write(_("Execute 'linkchecker -h' for help\n"))
    sys.exit(1)
|
|
|
|
|
|
# Read command line arguments.
# 'options' becomes the list of (option, argument) pairs and 'args' the
# remaining positional arguments (the files/urls to check).
try:
    # Note: cut out the name of the script
    options, args = getopt.getopt(sys.argv[1:],
                  "adDe:f:F:hi:N:o:p:qr:Rst:u:VvwW:", # short options
                  ["anchors", # long options
                   "config=",
                   "debug",
                   "extern=",
                   "file-output=",
                   "nntp-server=",
                   "help",
                   "intern=",
                   "denyallow",
                   "output=",
                   "password=",
                   "quiet",
                   "recursion-level=",
                   "wischiwaschi",
                   "robots-txt",
                   "strict",
                   "threads=",
                   "user=",
                   "version",
                   "verbose",
                   "warnings",
                   "warning-regex="])
except getopt.error:
    # Take only the exception value; the previous unpacking into the names
    # 'type' and 'value' rebound the builtin type() for the rest of the
    # script.
    printUsage(sys.exc_info()[1])
|
|
|
|
# apply configuration
# First pass over the options: only the configuration files and the debug
# switch are looked at here, so config.read() runs before the remaining
# options are applied.
config = linkcheck.Config.Configuration()
configfiles = []
for opt, arg in options:
    if opt in ("-f", "--config"):
        configfiles.append(arg)
    elif opt in ("-D", "--debug"):
        # debugging sets the global debug flag and turns threading off
        linkcheck.Config.DebugFlag = 1
        config.disableThreading()
config.read(configfiles)
|
|
|
|
# apply options and arguments
# Second pass over the options: everything except -f and -D (already
# handled before the configuration files were read) is applied here.
_user = "anonymous"
# NOTE(review): the usage text documents 'joe@' as the default password,
# while this fallback is "guest@" -- confirm which one is intended.
_password = "guest@"
# set as soon as -u or -p is given; triggers the catch-all entry below
constructauth = 0
for opt,arg in options:
    if opt=="-a" or opt=="--anchors":
        config["anchors"] = 1

    elif opt=="-e" or opt=="--extern":
        config["externlinks"].append((re.compile(arg), 0))

    elif opt=="-h" or opt=="--help":
        printHelp()

    elif opt=="-o" or opt=="--output":
        if linkcheck.Config.Loggers.has_key(arg):
            config['log'] = config.newLogger(arg)
        else:
            printUsage((_("Illegal argument '%s' for option ") % arg) +
                       "'-o, --output'")

    elif opt=="-F" or opt=="--file-output":
        # the blacklist logger has no file output
        if linkcheck.Config.Loggers.has_key(arg) and arg != "blacklist":
            config['fileoutput'].append(
                 config.newLogger(arg, {'fileoutput':1}))
        else:
            printUsage((_("Illegal argument '%s' for option ") % arg) +
                       "'-F, --file-output'")

    elif opt=="-i" or opt=="--intern":
        config["internlinks"].append(re.compile(arg))

    # The short option is -d, as declared in the getopt short option
    # string and in the usage text.  The former test for "-l" could never
    # match, so -d was silently ignored.
    elif opt=="-d" or opt=="--denyallow":
        config["denyallow"] = 1

    elif opt=="-N" or opt=="--nntp-server":
        config["nntpserver"] = arg

    elif opt=="-p" or opt=="--password":
        _password = arg
        constructauth = 1

    elif opt=="-q" or opt=="--quiet":
        config["quiet"] = 1

    elif opt=="-r" or opt=="--recursion-level":
        # a non-numeric depth is reported like a negative one instead of
        # ending in a ValueError traceback
        try:
            depth = int(arg)
        except ValueError:
            depth = -1
        if depth >= 0:
            config["recursionlevel"] = depth
        else:
            printUsage((_("Illegal argument '%s' for option ") % arg) +
                       "'-r, --recursion-level'")

    elif opt=="-R" or opt=="--robots-txt":
        config["robotstxt"] = 1

    elif opt=="-s" or opt=="--strict":
        config["strict"] = 1

    elif opt=="-t" or opt=="--threads":
        # a non-numeric thread count is a usage error, not a traceback;
        # printUsage() exits, so num is always bound below
        try:
            num = int(arg)
        except ValueError:
            printUsage((_("Illegal argument '%s' for option ") % arg) +
                       "'-t, --threads'")
        # leave threading untouched when it is already off or when
        # debugging is active
        if config["threads"] and not linkcheck.Config.DebugFlag:
            if num>0:
                config.enableThreading(num)
            else:
                config.disableThreading()

    elif opt=="-u" or opt=="--user":
        _user = arg
        constructauth = 1

    elif opt=="-V" or opt=="--version":
        printVersion()

    elif opt=="-v" or opt=="--verbose":
        # verbose implies warnings
        config["verbose"] = 1
        config["warnings"] = 1

    elif opt=="--wischiwaschi":
        from linkcheck import util1
        util1.abbuzze()
        sys.exit(0)

    elif opt=="-w" or opt=="--warnings":
        config["warnings"] = 1

    elif opt=="-W" or opt=="--warning-regex":
        # a warning regex implies warnings
        config["warningregex"] = re.compile(arg)
        config["warnings"] = 1

# Credentials given on the command line are inserted at the front of the
# authentication list with a pattern that matches every URL.
if constructauth:
    config["authentication"].insert(0, (re.compile(".*"), _user, _password))
|
|
|
|
# construct the url list
# In blacklist mode the command line arguments are replaced by the lines of
# the logger's blacklist file, provided that file exists.
if config["log"].__class__ == linkcheck.Logging.BlacklistLogger and \
   os.path.exists(config['log'].filename):
    blacklist = open(config['log'].filename)
    try:
        args = blacklist.readlines()
    finally:
        # close the file explicitly instead of relying on garbage collection
        blacklist.close()

if len(args)==0:
    # Warn on stderr, like printUsage() does, so the message does not end
    # up inside redirected logger output (e.g. '-ohtml > sample.html').
    sys.stderr.write(_("warning: no files or urls given")+"\n")

for url in args:
    url = string.strip(url)
    if not url:
        # skip blank entries, e.g. empty lines in the blacklist file
        continue
    if ":" not in url:
        # syntactic sugar: add a scheme to bare 'ftp.' / 'www.' host names
        # (slice comparison instead of recompiling a regex per url)
        if url[:4] == "ftp.":
            url = "ftp://"+url
        elif url[:4] == "www.":
            url = "http://"+url
    config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0))

# check the urls
linkcheck.checkUrls(config)
|