# -*- coding: iso-8859-1 -*- """main function module for link checking""" # Copyright (C) 2000-2004 Bastian Kleineidam # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. import time import re import sys import urlparse # logger areas LOG_CMDLINE = "linkcheck.cmdline" LOG_CHECK = "linkcheck.check" LOG_DNS = "linkcheck.dns" LOG_GUI = "linkcheck.gui" class LinkCheckerError (Exception): pass def getLinkPat (arg, strict=False): """get a link pattern matcher for intern/extern links""" linkcheck.log.debug(LOG_CHECK, "Link pattern %r", arg) if arg[0:1] == '!': pattern = arg[1:] negate = True else: pattern = arg negate = False return { "pattern": re.compile(pattern), "negate": negate, "strict": strict, } # file extensions we can parse recursively extensions = { "html": re.compile(r'(?i)\.s?html?$'), "opera": re.compile(r'^(?i)opera.adr$'), # opera bookmark file "css": re.compile(r'(?i)\.css$'), # CSS stylesheet # "text": re.compile(r'(?i)\.(txt|xml|tsv|csv|sgml?|py|java|cc?|cpp|h)$'), } # main check function def checkUrls (config): """ checkUrls gets a complete configuration object as parameter where all runtime-dependent options are stored. If you call checkUrls more than once, you can specify different configurations. In the config object there are functions to get a new URL (getUrl) and to check it (checkUrl). """ config.log_init() try: start_time = time.time() status_time = start_time while True: if config.hasMoreUrls(): config.checkUrl(config.getUrl()) elif config.finished(): break else: # active connections are downloading/parsing, so # wait a little time.sleep(0.1) if config['status']: curtime = time.time() if (curtime - status_time) > 5: printStatus(config, curtime, start_time) status_time = curtime config.log_endOfOutput() except KeyboardInterrupt: config.finish() config.log_endOfOutput() active = config.threader.active_threads() linkcheck.log.warn(LOG_CHECK, linkcheck.i18n._("keyboard interrupt; waiting for %d active threads to finish") % active) raise import linkcheck.logger import linkcheck.logger.FileUrlData import linkcheck.logger.IgnoredUrlData import linkcheck.logger.FtpUrlData import linkcheck.logger.GopherUrlData import linkcheck.logger.HttpUrlData import linkcheck.logger.HttpsUrlData import linkcheck.logger.MailtoUrlData import linkcheck.logger.TelnetUrlData import linkcheck.logger.NntpUrlData def set_intern_url (url, klass, config): """Precondition: config['strict'] is true (ie strict checking) and recursion level is zero (ie url given on the command line)""" if klass == linkcheck.logger.FileUrlData.FileUrlData: linkcheck.log.debug(LOG_CHECK, "Add intern pattern ^file:") config['internlinks'].append(getLinkPat("^file:")) elif klass in [linkcheck.logger.HttpUrlData.HttpUrlData, linkcheck.logger.HttpsUrlData.HttpsUrlData, linkcheck.logger.FtpUrlData.FtpUrlData]: domain = urlparse.urlsplit(url)[1] if domain: domain = "://%s"%re.escape(domain) debug(BRING_IT_ON, "Add intern domain", domain) # add scheme colon to link pattern config['internlinks'].append(getLinkPat(domain)) def getUrlDataFrom (urlName, recursionLevel, config, parentName=None, baseRef=None, line=0, column=0, name=None, cmdline=None): url = get_absolute_url(urlName, baseRef, parentName) # test scheme if url.startswith("http:"): klass = linkcheck.logger.HttpUrlData.HttpUrlData elif url.startswith("ftp:"): klass = linkcheck.logger.FtpUrlData.FtpUrlData elif url.startswith("file:"): klass = linkcheck.logger.FileUrlData.FileUrlData elif url.startswith("telnet:"): klass = linkcheck.logger.TelnetUrlData.TelnetUrlData elif url.startswith("mailto:"): klass = linkcheck.logger.MailtoUrlData.MailtoUrlData elif url.startswith("gopher:"): klass = linkcheck.logger.GopherUrlData.GopherUrlData elif url.startswith("https:"): klass = linkcheck.logger.HttpsUrlData.HttpsUrlData elif url.startswith("nttp:") or \ url.startswith("news:") or \ url.startswith("snews:"): klass = linkcheck.logger.NntpUrlData.NntpUrlData # application specific links are ignored elif ignored_schemes_re.search(url): klass = linkcheck.logger.IgnoredUrlData.IgnoredUrlData # assume local file else: klass = linkcheck.logger.FileUrlData.FileUrlData if config['strict'] and cmdline and \ not (config['internlinks'] or config['externlinks']): # set automatic intern/extern stuff if no filter was given set_intern_url(url, klass, config) return klass(urlName, recursionLevel, config, parentName, baseRef, line=line, column=column, name=name) def printStatus (config, curtime, start_time): tocheck = len(config.urls) links = config['linknumber'] active = config.threader.active_threads() duration = linkcheck.logger.strduration(curtime - start_time) print >>sys.stderr, linkcheck.i18n._("%5d urls queued, %4d links checked, %2d active threads, runtime %s")%\ (tocheck, links, active, duration)