#!$python import sys # no i18n at this point if sys.version[:5] < "1.5.2": sys.stderr.write("This program requires Python 1.5.2 or later.\n") sys.exit(1) # add the path to linkcheck module if you do not install with distutils $syspath import getopt,re,string,os,urlparse import linkcheck,StringUtil from linkcheck import _ Usage = _("USAGE\tlinkchecker [options] file_or_url...\n" "\n" "OPTIONS\n" "-a, --anchors\n" " Check anchor references. Default is don't check anchors.\n" "-D, --debug\n" " Print additional debugging information.\n" "-e regex, --extern=regex\n" " Assume urls that match the given expression as extern.\n" " Only intern HTTP links are checked recursively.\n" "-f file, --config=file\n" " Use file as configuration file. LinkChecker first searches\n" " ~/.linkcheckerrc and then /etc/linkcheckerrc\n" " (under Windows \\linkcheckerrc).\n" "-F name, --file-output=name\n" " Same as output, but write to a file linkchecker-out..\n" " If the file already exists, it is overwritten. You can specify\n" " this option more than once. There is no file output for the\n" " blacklist logger. Default is no file output.\n" "-i regex, --intern=regex\n" " Assume urls that match the given expression as intern.\n" "-h, --help\n" " Help me! Print usage information for this program.\n" "-l, --allowdeny\n" " Swap checking order to intern/extern. Default checking order\n" " is extern/intern.\n" "-N, --nntp-server\n" " Specify an NNTP server for 'news:...' links. Default is the\n" " environment variable NNTP_SERVER. If no host is given,\n" " only the syntax of the link is checked.\n" "-o name, --output=name\n" " Specify output as %s.\n" " Default is text.\n" "-p pwd, --password=pwd\n" " Try given password for HTML and FTP authorization.\n" " Default is 'guest@'. See -u.\n" "-P host[:port], --proxy=host[:port]\n" " Use specified proxy for HTTP requests.\n" " Standard port is 8080. Default is to use no proxy.\n" "-q, --quiet\n" " Quiet operation. This is only useful with -F.\n" "-r depth, --recursion-level=depth\n" " Check recursively all links up to given depth (depth >= 0).\n" " Default depth is 1.\n" "-R, --robots-txt\n" " Obey the robots exclusion standard.\n" "-s, --strict\n" " Check only syntax of extern links, do not try to connect to them.\n" "-t num, --threads=num\n" " Generate no more than num threads. Default number of threads is 5.\n" " To disable threading specify a non-positive number.\n" "-u name, --user=name\n" " Try given username for HTML and FTP authorization.\n" " Default is 'anonymous'. See -p.\n" "-V, --version\n" " Print version and exit.\n" "-v, --verbose\n" " Log all checked URLs (implies -w). Default is to log only invalid\n" " URLs.\n" "-w, --warnings\n" " Log warnings.\n" "-W regex, --warning-regex=regex\n" " Define a regular expression which prints a warning if it matches\n" " any content of the checked link.\n" " This applies of course only to pages which are valid, so we can\n" " get their content.\n" " You can use this to check for pages that contain some form of\n" " error message, for example 'This page has moved' or\n" " 'Oracle Application Server error'.\n" " This option implies -w.\n") % linkcheck.Config.LoggerKeys Notes = _("NOTES\n" "o LinkChecker assumes an http:// resp. ftp:// link when a commandline URL\n" " starts with 'www.' resp. 'ftp.'\n" " You can also give local files as arguments\n" "o If you have your system configured to automatically establish a\n" " connection to the internet (e.g. with diald), it will connect when\n" " checking links not pointing to your local host\n" " Use the -s and -i options to prevent this (see EXAMPLES)\n" "o Javascript links are currently ignored\n" "o If your platform does not support threading, LinkChecker uses -t0\n" "o You can supply multiple user/password pairs in a configuration file\n" "o Cookies are not accepted by LinkChecker\n" "o When checking 'news:' links the given NNTP host doesn't need to be the\n" " same as the host of the user browsing your pages!\n") Examples = _("EXAMPLES\n" "o linkchecker -v -o html -r2 -s -i treasure.calvinsplayground.de \\\n" " http://treasure.calvinsplayground.de/~calvin/ > sample.html\n" "o Local files and syntactic sugar on the command line:\n" " linkchecker c:\\temp\\test.html\n" " linkchecker ../bla.html\n" " linkchecker www.myhomepage.de\n" " linkchecker -r0 ftp.linux.org\n") def printVersion(): print linkcheck.Config.AppInfo sys.exit(0) def printHelp(): if os.name!='posix': StringUtil.paginate(Usage+"\n"+Notes+"\n"+Examples) else: print Usage print Notes print Examples sys.exit(0) def printUsage(msg): sys.stderr.write(_("Error: %s\n") % msg) sys.stderr.write(_("Execute 'linkchecker -h' for help\n")) sys.exit(1) # Read command line arguments try: # Note: cut out the name of the script options, args = getopt.getopt(sys.argv[1:], "aDe:f:F:hi:lN:P:o:p:qr:Rst:u:VvwW:", # short options ["anchors", # long options "config=", "debug", "extern=", "file-output=", "nntp-server=", "help", "intern=", "allowdeny", "output=", "proxy=", "password=", "quiet", "recursion-level=", "wischiwaschi", "robots-txt", "strict", "threads=", "user=", "version", "verbose", "warnings", "warning-regex="]) except getopt.error: type, value = sys.exc_info()[:2] printUsage(value) # apply configuration config = linkcheck.Config.Configuration() configfiles = [] for opt,arg in options: if opt=="-f" or opt=="--config": configfiles.append(arg) elif opt=="-D" or opt=="--debug": linkcheck.Config.DebugFlag = 1 config.disableThreading() config.read(configfiles) # if no proxy is given, fall back to http_proxy environment variable if os.environ.has_key('http_proxy') and not config['proxy']: config['proxy'] = urlparse.urlparse(os.environ["http_proxy"])[1] if string.find(config['proxy'], ':') != -1: config['proxy'],port = string.split(config['proxy'], ':') config['proxyport'] = int(port) # apply options and arguments _user = "anonymous" _password = "guest@" constructauth = 0 for opt,arg in options: if opt=="-a" or opt=="--anchors": config["anchors"] = 1 elif opt=="-e" or opt=="--extern": config["externlinks"].append((re.compile(arg), 0)) elif opt=="-h" or opt=="--help": printHelp() elif opt=="-o" or opt=="--output": if linkcheck.Config.Loggers.has_key(arg): config['log'] = config.newLogger(arg) else: printUsage((_("Illegal argument '%s' for option ") % arg) +\ "'-o, --output'") elif opt=="-F" or opt=="--file-output": if linkcheck.Config.Loggers.has_key(arg) and arg != "blacklist": config['fileoutput'].append( config.newLogger(arg, {'fileoutput':1})) else: printUsage((_("Illegal argument '%s' for option ") % arg) +\ "'-F, --file-output'") elif opt=="-i" or opt=="--intern": config["internlinks"].append(re.compile(arg)) elif opt=="-l" or opt=="--allowdeny": config["allowdeny"] = 1 elif opt=="-N" or opt=="--nntp-server": config["nntpserver"] = arg elif opt=="-P" or opt=="--proxy": proxy = re.compile("(.+):(.+)").match(arg) if proxy: config["proxy"] = proxy.group(1) config["proxyport"] = int(proxy.group(2)) else: config["proxy"] = arg elif opt=="-p" or opt=="--password": _password = arg constructauth = 1 elif opt=="-q" or opt=="--quiet": config["quiet"] = 1 elif opt=="-r" or opt=="--recursion-level": if int(arg) >= 0: config["recursionlevel"] = int(arg) else: printUsage((_("Illegal argument '%s' for option ") % arg) + "'-r, --recursion-level'") elif opt=="-R" or opt=="--robots-txt": config["robotstxt"] = 1 elif opt=="-s" or opt=="--strict": config["strict"] = 1 elif opt=="-t" or opt=="--threads": num = int(arg) if config["threads"] and not linkcheck.Config.DebugFlag: if num>0: config.enableThreading(num) else: config.disableThreading() elif opt=="-u" or opt=="--user": _user = arg constructauth = 1 elif opt=="-V" or opt=="--version": printVersion() elif opt=="-v" or opt=="--verbose": config["verbose"] = 1 config["warnings"] = 1 elif opt=="--wischiwaschi": import util1 util1.abbuzze() sys.exit(0) elif opt=="-w" or opt=="--warnings": config["warnings"] = 1 elif opt=="-W" or opt=="--warning-regex": config["warningregex"] = re.compile(arg) config["warnings"] = 1 if constructauth: config["authentication"].insert(0, (re.compile(".*"), _user, _password)) # construct the url list # if we use blacklist mode, try to read ~/.blacklist if config["log"].__class__ == linkcheck.Logging.BlacklistLogger and \ os.path.exists(config['log'].filename): args = open(config['log'].filename).readlines() if len(args)==0: print _("warning: no files or urls given") for url in args: url = string.strip(url) if not (":" in url): if re.compile("^ftp\.").match(url): url = "ftp://"+url elif re.compile("^www\.").match(url): url = "http://"+url config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0)) # check the urls linkcheck.checkUrls(config)