linkchecker/linkcheck/__init__.py
2004-07-07 18:15:17 +00:00

95 lines
3.2 KiB
Python

# -*- coding: iso-8859-1 -*-
"""main function module for link checking"""
# Copyright (C) 2000-2004 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
import re
import sys
import urlparse
# Logger area names. LOG_CHECK is the area this module passes as the
# first argument to linkcheck.log.debug(). The other three are not
# referenced in the code visible here; presumably they are used by the
# command line, DNS and GUI code -- verify against their callers.
LOG_CMDLINE = "linkcheck.cmdline"
LOG_CHECK = "linkcheck.check"
LOG_DNS = "linkcheck.dns"
LOG_GUI = "linkcheck.gui"
class LinkCheckerError (Exception):
    """Exception type declared by this package.

    Adds no attributes or behaviour on top of the builtin Exception.
    """
def getLinkPat (arg, strict=False):
    """Build a matcher description for an intern/extern link pattern.

    A leading '!' in arg marks the pattern as negated and is removed
    before the remainder is compiled as a regular expression.

    Returns a dict with the compiled regex under "pattern" and the
    two flags under "negate" and "strict".
    """
    linkcheck.log.debug(LOG_CHECK, "Link pattern %r", arg)
    # slice instead of index so that an empty arg does not raise
    negate = (arg[:1] == '!')
    regex = arg
    if negate:
        regex = arg[1:]
    matcher = {}
    matcher["pattern"] = re.compile(regex)
    matcher["negate"] = negate
    matcher["strict"] = strict
    return matcher
# File name patterns for content we can parse recursively, keyed by
# content kind. Every pattern is case-insensitive. The inline (?i) flag
# has to lead the expression: a global flag placed after other pattern
# text is rejected by newer versions of the re module.
extensions = {
    "html": re.compile(r'(?i)\.s?html?$'),
    # the dot is escaped so that only the literal name "opera.adr" matches
    "opera": re.compile(r'(?i)^opera\.adr$'), # opera bookmark file
    "css": re.compile(r'(?i)\.css$'), # CSS stylesheet
#    "text": re.compile(r'(?i)\.(txt|xml|tsv|csv|sgml?|py|java|cc?|cpp|h)$'),
}
import linkcheck.FileUrlData
import linkcheck.IgnoredUrlData
import linkcheck.FtpUrlData
import linkcheck.GopherUrlData
import linkcheck.HttpUrlData
import linkcheck.HttpsUrlData
import linkcheck.MailtoUrlData
import linkcheck.TelnetUrlData
import linkcheck.NntpUrlData
def set_intern_url (url, klass, config):
    """Add an intern link pattern derived from url to config['internlinks'].

    Precondition: config['strict'] is true (ie strict checking) and
    recursion level is zero (ie url given on the command line)

    If klass is the file URL class, the pattern "^file:" is added.
    If klass is the HTTP, HTTPS or FTP URL class, the pattern is "://"
    followed by the regex-escaped network location of url; nothing is
    added when url has no network location. Any other klass leaves
    config untouched.
    """
    if klass == linkcheck.FileUrlData.FileUrlData:
        linkcheck.log.debug(LOG_CHECK, "Add intern pattern ^file:")
        config['internlinks'].append(getLinkPat("^file:"))
    elif klass in [linkcheck.HttpUrlData.HttpUrlData,
                   linkcheck.HttpsUrlData.HttpsUrlData,
                   linkcheck.FtpUrlData.FtpUrlData]:
        domain = urlparse.urlsplit(url)[1]
        if domain:
            # add scheme colon to link pattern; the domain is escaped
            # because getLinkPat compiles its argument as a regex
            domain = "://%s" % re.escape(domain)
            # log through linkcheck.log like the rest of this module;
            # the former debug(BRING_IT_ON, ...) call used names that
            # are not defined here and raised NameError
            linkcheck.log.debug(LOG_CHECK, "Add intern domain %r", domain)
            config['internlinks'].append(getLinkPat(domain))
import linkcheck.logger
def printStatus (config, curtime, start_time):
    """Print a one-line progress report to stderr.

    The line shows the number of queued urls, the number of checked
    links, the number of active threads and the runtime, formatted from
    the difference between curtime and start_time.
    """
    # gathered in the same order as the placeholders of the message
    stats = (
        len(config.urls),
        config['linknumber'],
        config.threader.active_threads(),
        linkcheck.logger.strduration(curtime - start_time),
    )
    template = linkcheck.i18n._("%5d urls queued, %4d links checked, %2d active threads, runtime %s")
    print >>sys.stderr, template % stats