diff --git a/linkchecker b/linkchecker index c9a865ff..8c0e34ac 100755 --- a/linkchecker +++ b/linkchecker @@ -42,112 +42,7 @@ _profile = "linkchecker.prof" # main usage text Usage = i18n._("""USAGE\tlinkchecker [options] file-or-url... - -OPTIONS -For single-letter option arguments the space is not a necessity. So -'-o colored' is the same as '-ocolored'. --a, --anchors - Check HTTP anchor references. This option applies to both internal - and external urls. Default is don't check anchors. - This option implies -w because anchor errors are always warnings. --C, --cookies - Accept and send HTTP cookies according to RFC 2109. Only cookies - which are sent back to the originating server are accepted. - Sent and accepted cookies are provided as additional logging - information. --d, --denyallow - Swap checking order to external/internal. Default checking order - is internal/external. --D, --debug - Print debugging information. Provide this option multiple times - for even more debugging information. --e regex, --extern=regex - Assume urls that match the given expression as external. - Only internal HTML links are checked recursively. --f file, --config=file - Use file as configuration file. As default LinkChecker first - searches /etc/linkcheckerrc and then ~/.linkcheckerrc - (under Windows \\linkcheckerrc). --F type[/filename], --file-output=type[/filename] - Same as -o, but write to a file linkchecker-out. - or if specified. If the file already exists, it - is overwritten. You can specify this option more than once. - There is no file output for the blacklist logger. Default is - no file output. --I, --interactive - Ask for url if none are given on the commandline. --i regex, --intern=regex - Assume URLs that match the given expression as internal. - LinkChecker descends recursively only to internal URLs, not to - external. --h, --help - Help me! Print usage information for this program. --N server, --nntp-server=server - Specify an NNTP server for 'news:...' links. 
Default is the - environment variable NNTP_SERVER. If no host is given, - only the syntax of the link is checked. ---no-anchor-caching - Treat url#anchora and url#anchorb as equal on caching. This - is the default browser behaviour, but it's not specified in - the URI specification. Use with care. --o type, --output=type - Specify output type as %s. - Default type is text. --p pwd, --password=pwd - Try password pwd for HTTP and FTP authorization. - Default password is 'joe@'. See also -u. --P secs, --pause=secs - Pause seconds between each url check. This option - implies -t0. - Default is no pause between requests. ---profile - Write profiling data into a file named %s in the - current working directory. - See also --viewprof. --q, --quiet - Quiet operation. This is only useful with -F. --r depth, --recursion-level=depth - Check recursively all links up to given depth. A negative depth - will enable inifinite recursion. - Default depth is 1. --s, --strict - Check only syntax of external links, do not try to connect to them. - For local file urls, only local files are internal. For - http and ftp urls, all urls at the same domain name are internal. ---status - Print check status every 5 seconds to stderr. --t num, --threads=num - Generate no more than num threads. Default number of threads is 5. - To disable threading specify a non-positive number. ---timeout=secs - Set the timeout for TCP connection attempts in seconds. The default - timeout is 30 seconds. --u name, --user=name - Try username name for HTTP and FTP authorization. - Default is 'anonymous'. See also -p. --V, --version - Print version and exit. --v, --verbose - Log all checked URLs (implies -w). Default is to log only invalid - URLs. ---viewprof - Print out previously generated profiling data. See also --profile. --w, --warnings - Log warnings. --W regex, --warning-regex=regex - Define a regular expression which prints a warning if it matches - any content of the checked link. 
- This applies of course only to pages which are valid, so we can - get their content. - Use this to check for pages that contain some form of error - message, for example 'This page has moved' or 'Oracle - Application Server error'. - This option implies -w. ---warning-size-bytes=bytes - Print a warning if content size is available and exceeds the given - number of bytes. - This option implies -w. -""") % (LoggerKeys, _profile) +""") Notes = i18n._("""NOTES o A ! before any regex negates it. So '!^mailto:' matches everything but @@ -220,193 +115,263 @@ def viewprof (): sys.exit(0) # Read command line arguments -try: - # Note: cut out the name of the script - options, args = getopt.getopt(sys.argv[1:], - "adCDe:f:F:hIi:N:o:p:P:qr:Rst:u:VvwW:", # short options - ["anchors", # long options - "config=", - "cookies", - "debug", - "extern=", - "file-output=", - "nntp-server=", - "help", - "interactive", - "intern=", - "denyallow", - "output=", - "password=", - "pause=", - "profile", - "quiet", - "recursion-level=", - "no-anchor-caching", - "wischiwaschi", - "robots-txt", - "strict", - "status", - "threads=", - "timeout=", - "user=", - "version", - "verbose", - "viewprof", - "warnings", - "warning-regex="]) -except getopt.error: - type, value = sys.exc_info()[:2] - printUsage(value) +from optparse import OptionParser +optparser = OptionParser() + +optparser.add_option("-a", "--anchors", action="store_true", dest="anchors", + help=i18n._( +"""Check HTTP anchor references. This option applies to both internal +and external urls. Default is don't check anchors. +This option implies -w because anchor errors are always warnings.""")) + +optparser.add_option("-C", "--cookies", action="store_true", dest="cookies", + help=i18n._( +"""Accept and send HTTP cookies according to RFC 2109. Only cookies +which are sent back to the originating server are accepted. 
+Sent and accepted cookies are provided as additional logging +information.""")) + +optparser.add_option("-d", "--denyallow", action="store_true", dest="denyallow", + help=i18n._( +"""Swap checking order to external/internal. Default checking order +is internal/external.""")) + +optparser.add_option("-e", "--extern", type="string", action="append", dest="extern", + help=i18n._( +"""Assume urls that match the given expression as external. +Only internal HTML links are checked recursively.""")) + +optparser.add_option("-f", "--config", type="string", dest="configfile", + help=i18n._( +"""Use file as configuration file. As default LinkChecker first +searches /etc/linkcheckerrc and then ~/.linkcheckerrc +(under Windows \\linkcheckerrc).""")) + +optparser.add_option("-F", "--file-output", type="string", dest="fileoutput", + help=i18n._( +"""type[/filename] +Same as -o, but write to a file linkchecker-out. +or if specified. If the file already exists, it +is overwritten. You can specify this option more than once. +There is no file output for the blacklist logger. Default is +no file output.""")) + +optparser.add_option("-D", "--debug", action="count", + help=i18n._( +"""Print debugging information. Provide this option multiple times +for even more debugging information.""")) + +optparser.add_option("-I", "--interactive", action="store_true", dest="interactive", + help=i18n._( +"""Ask for url if none are given on the commandline.""")) + +optparser.add_option("-i", "--intern", type="string", action="append", dest="intern", + help=i18n._( +""" regex, --intern=regex +Assume URLs that match the given expression as internal. +LinkChecker descends recursively only to internal URLs, not to +external.""")) + +optparser.add_option("-N", "--nntp-server", type="string", dest="nntpserver", + help=i18n._( +"""Specify an NNTP server for 'news:...' links. Default is the +environment variable NNTP_SERVER. 
If no host is given, +only the syntax of the link is checked.""")) + +optparser.add_option("--no-anchor-caching", action="store_false", dest="anchorcaching", + help=i18n._( +"""Treat url#anchora and url#anchorb as equal on caching. This +is the default browser behaviour, but it's not specified in +the URI specification. Use with care.""")) + +optparser.add_option("-o", "--output", type="string", dest="output", + help=i18n._( +"""Specify output type as %s. Default type is text.""")%LoggerKeys) + +optparser.add_option("-p", "--password", type="string", dest="password", + help=i18n._( +"""Try password pwd for HTTP and FTP authorization. +Default password is 'joe@'. See also -u.""")) + +optparser.add_option("-P", "--pause", type="int", dest="pause", + help=i18n._( +"""Pause seconds between each url check. This option implies -t0. +Default is no pause between requests.""")) + +optparser.add_option("--profile", action="store_true", dest="profile", + help=i18n._( +"""Write profiling data into a file named %s in the +current working directory. See also --viewprof.""")%_profile) + +optparser.add_option("-q", "--quiet", action="store_true", dest="quiet", + help=i18n._( +"""Quiet operation. This is only useful with -F.""")) + +optparser.add_option("-r", "--recursion-level", type="int", dest="recursionlevel", + help=i18n._( +"""Check recursively all links up to given depth. A negative depth +will enable inifinite recursion. Default depth is 1.""")) + +optparser.add_option("-s", "--strict", action="store_true", dest="strict", + help=i18n._( +"""Check only syntax of external links, do not try to connect to them. +For local file urls, only local files are internal. 
For +http and ftp urls, all urls at the same domain name are internal.""")) + +optparser.add_option("--status", action="store_true", dest="status", + help=i18n._( +"""Print check status every 5 seconds to stderr.""")) + +optparser.add_option("-t", "--threads", type="int", dest="threads", + help=i18n._( +"""Generate no more than num threads. Default number of threads is 5. +To disable threading specify a non-positive number.""")) + +optparser.add_option("--timeout", type="int", dest="timeout", + help=i18n._( +"""Set the timeout for TCP connection attempts in seconds. The default +timeout is 30 seconds.""")) + +optparser.add_option("-u", "--user", type="string", dest="username", + help=i18n._( +"""Try username name for HTTP and FTP authorization. +Default is 'anonymous'. See also -p.""")) + +optparser.add_option("-V", "--version", action="store_true", dest="version", + help=i18n._( +"""Print version and exit.""")) + +optparser.add_option("-v", "--verbose", action="store_true", dest="verbose", + help=i18n._( +"""Log all checked URLs (implies -w). Default is to log only invalid +URLs.""")) + +optparser.add_option("--viewprof", action="store_true", dest="viewprof", + help=i18n._( +"""Print out previously generated profiling data. See also --profile.""")) + +optparser.add_option("-w", "--warnings", action="store_true", dest="warnings", + help=i18n._("""Log warnings.""")) + +optparser.add_option("-W", "--warning-regex", type="string", dest="warningregex", + help=i18n._( +"""Define a regular expression which prints a warning if it matches +any content of the checked link. +This applies of course only to pages which are valid, so we can +get their content. +Use this to check for pages that contain some form of error +message, for example 'This page has moved' or 'Oracle +Application Server error'. 
+This option implies -w.""")) + +optparser.add_option("--warning-size-bytes", type="int", dest="warningsizebytes", + help=i18n._( +"""Print a warning if content size is available and exceeds the given +number of bytes. This option implies -w.""")) + +if "--wischiwaschi" in sys.argv: + from linkcheck import util1 + util1.abbuzze() + sys.exit(0) + +(options, args) = optparser.parse_args() # set debug level as early as possible -for opt,arg in options: - if opt=="-D" or opt=="--debug": - set_debuglevel(get_debuglevel()+1) +if options.debug is not None: + set_debuglevel(options.debug) debug(BRING_IT_ON, "Python", sys.version, "on", sys.platform) -# read configuration from config files +# config object config = linkcheck.Config.Configuration() +# read configuration from config files configfiles = [] -for opt,arg in options: - if opt=="-f" or opt=="--config": - configfiles.append(arg) +if options.configfile: + configfiles.append(options.configfile) config.read(configfiles) # apply commandline options and arguments _user = "anonymous" _password = "guest@" constructauth = False do_profile = False -for opt,arg in options: - if opt=="-a" or opt=="--anchors": - config["anchors"] = True - config["warnings"] = True - - elif opt=="-e" or opt=="--extern": - config["externlinks"].append(linkcheck.getLinkPat(arg)) - - elif opt=="-h" or opt=="--help": - printHelp() - - elif opt=="-o" or opt=="--output": - if linkcheck.log.Loggers.has_key(arg): - config['log'] = config.newLogger(arg) - else: - printUsage((i18n._("Illegal argument '%s' for option ") % arg) +\ - "'-o, --output'") - - elif opt=="-F" or opt=="--file-output": - ns = {'fileoutput': 1} - try: - type, ns['filename'] = arg.split('/', 1) - if not ns['filename']: raise ValueError - except ValueError: type = arg - if linkcheck.log.Loggers.has_key(type) and type != "blacklist": - config['fileoutput'].append(config.newLogger(type, ns)) - else: - printUsage((i18n._("Illegal argument '%s' for option ") % arg) +\ - "'-F, --file-output'") - - 
elif opt=="-I" or opt=="--interactive": - config['interactive'] = True - - elif opt=="-i" or opt=="--intern": - config["internlinks"].append(linkcheck.getLinkPat(arg)) - - elif opt=="-l" or opt=="--denyallow": - config["denyallow"] = True - - elif opt=="-N" or opt=="--nntp-server": - config["nntpserver"] = arg - - elif opt=="--no-anchor-caching": - config["anchorcaching"] = False - - elif opt=="-p" or opt=="--password": - _password = arg - constructauth = True - - elif opt=="-P" or opt=="--pause": - try: - wait = int(arg) - except ValueError: - printUsage(i18n._("Illegal argument %r for option %s") % \ - (arg, "'-P, --pause'")) - if wait >= 0: - config["wait"] = wait - else: - printUsage(i18n._("Illegal argument %r for option %s") % \ - (arg, "'-P, --pause'")) - - elif opt=="--profile": - do_profile = True - - elif opt=="-q" or opt=="--quiet": - config["quiet"] = True - - elif opt=="-r" or opt=="--recursion-level": - try: - depth = int(arg) - if depth >= 0: - config["recursionlevel"] = depth - else: - config["recursionlevel"] = -1 - except ValueError: - printUsage(i18n._("Illegal argument %r for option %s") % \ - (arg, "'-r, --recursion-level'")) - # robots.txt is now default, so ignore this option - elif opt=="-R" or opt=="--robots-txt": pass - - elif opt=="-s" or opt=="--strict": - config["strict"] = True - - elif opt=="--status": - config['status'] = True - - elif opt=="-t" or opt=="--threads": - try: - config.setThreads(int(arg)) - except ValueError: - printUsage(i18n._("Illegal argument %r for option %s") % \ - (arg, "'-t, --threads'")) - - elif opt=="--timeout": - try: - timeout = int(arg) - if timeout <= 0: - printUsage(i18n._("Illegal argument %r for option %s") % \ - (arg, "'--timeout'")) - socket.setdefaulttimeout(timeout) - except ValueError: - printUsage(i18n._("Illegal argument %r for option %s") % \ - (arg, "'--timeout'")) - - elif opt=="-u" or opt=="--user": - _user = arg - constructauth = True - - elif opt=="-V" or opt=="--version": - printVersion() 
- - elif opt=="-v" or opt=="--verbose": +if options.anchors is not None: + config["anchors"] = options.anchors + config["warnings"] = True +if options.extern: + config["externlinks"].extend([linkcheck.getLinkPat(arg) for arg in options.extern]) +if options.output: + if linkcheck.log.Loggers.has_key(options.output): + config['log'] = config.newLogger(options.output) + else: + printUsage(i18n._("Illegal argument %r for option %s") % \ + (options.output, "'-o, --output'")) +if options.fileoutput: + ns = {'fileoutput': 1} + try: + ftype, ns['filename'] = options.fileoutput.split('/', 1) + if not ns['filename']: raise ValueError + except ValueError: + ftype = options.fileoutput + if linkcheck.log.Loggers.has_key(ftype) and ftype != "blacklist": + config['fileoutput'].append(config.newLogger(ftype, ns)) + else: + printUsage(i18n._("Illegal argument %r for option %s") % \ + (options.fileoutput, "'-F, --file-output'")) +if options.interactive is not None: + config['interactive'] = options.interactive +if options.intern: + config["internlinks"].extend([linkcheck.getLinkPat(arg) for arg in options.intern]) +if options.denyallow is not None: + config["denyallow"] = options.denyallow +if options.nntpserver: + config["nntpserver"] = options.nntpserver +if options.anchorcaching is not None: + config["anchorcaching"] = options.anchorcaching +if options.password is not None: + _password = options.password + constructauth = True +if options.pause is not None: + if options.pause >= 0: + config["wait"] = options.pause + else: + printUsage(i18n._("Illegal argument %d for option %s") % \ + (options.pause, "'-P, --pause'")) +if options.profile is not None: + do_profile = options.profile +if options.quiet is not None: + config["quiet"] = options.quiet +if options.recursionlevel is not None: + config["recursionlevel"] = options.recursionlevel +if options.strict is not None: + config["strict"] = options.strict +if options.status is not None: + config['status'] = options.status +if 
options.threads is not None: + config.setThreads(options.threads) +if options.timeout is not None: + if options.timeout > 0: + socket.setdefaulttimeout(options.timeout) + else: + printUsage(i18n._("Illegal argument %r for option %s") % \ + (options.timeout, "'--timeout'")) +if options.username is not None: + _user = options.username + constructauth = True +if options.version is not None: + printVersion() +if options.verbose is not None: + if options.verbose: config["verbose"] = True config["warnings"] = True - - elif opt=="--viewprof": - viewprof() - - elif opt=="--wischiwaschi": - from linkcheck import util1 - util1.abbuzze() - sys.exit(0) - elif opt=="-w" or opt=="--warnings": - config["warnings"] = True - - elif opt=="-W" or opt=="--warning-regex": - config["warningregex"] = re.compile(arg) - config["warnings"] = True - - elif opt=="-C" or opt=="--cookies": - config['cookies'] = True - +if options.viewprof: + viewprof() +if options.warnings is not None: + config["warnings"] = options.warnings +if options.warningregex is not None: + config["warningregex"] = re.compile(options.warningregex) + config["warnings"] = True +if options.warningsizebytes is not None: + config["warnsizebytes"] = options.warningsizebytes +if options.cookies is not None: + config['cookies'] = options.cookies if constructauth: config["authentication"].insert(0, {'pattern': re.compile(".*"), 'user': _user,