# linkchecker/linkcheck/checker/__init__.py
# (222 lines, 7.6 KiB, Python)
# -*- coding: iso-8859-1 -*-
"""main function module for link checking"""
# Copyright (C) 2000-2004 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
import time
import sys
import socket
import select
import re
import urlparse
import nntplib
import ftplib
import linkcheck.httplib2
import linkcheck.dns.exception
from linkcheck.i18n import _
# we catch these exceptions, all other exceptions are internal
# or system errors
# List of exception classes treated as "expected" link check failures;
# checker code catches exactly these and reports them as broken links.
ExcList = [
    IOError,
    OSError, # OSError is thrown on Windows when a file is not found
    ValueError, # from httplib.py
    linkcheck.LinkCheckerError,
    linkcheck.dns.exception.DNSException,
    socket.timeout,
    socket.error,
    select.error,
    # nntp errors (including EOFError)
    nntplib.error_reply,
    nntplib.error_temp,
    nntplib.error_perm,
    nntplib.error_proto,
    EOFError,
    # http error
    linkcheck.httplib2.error,
    # ftp errors
    ftplib.error_reply,
    ftplib.error_temp,
    ftplib.error_perm,
    ftplib.error_proto,
]
# XXX remove this when depending on python >= 2.4
# socket.sslerror is not always available, so only append it when present
if hasattr(socket, "sslerror"):
    ExcList.append(socket.sslerror)
# Verbose regular expression source listing url schemes that are
# recognized but deliberately not checked; urls matching one of these
# scheme prefixes are handled by the IgnoredUrl checker class.
ignored_schemes = r"""^(
acap # application configuration access protocol
|afs # Andrew File System global file names
|cid # content identifier
|data # data
|dav # dav
|fax # fax
|imap # internet message access protocol
|ldap # Lightweight Directory Access Protocol
|mailserver # Access to data available from mail servers
|mid # message identifier
|mms # multimedia stream
|modem # modem
|nfs # network file system protocol
|opaquelocktoken # opaquelocktoken
|pop # Post Office Protocol v3
|prospero # Prospero Directory Service
|rsync # rsync protocol
|rtsp # real time streaming protocol
|rtspu # real time streaming protocol
|service # service location
|shttp # secure HTTP
|sip # session initiation protocol
|tel # telephone
|tip # Transaction Internet Protocol
|tn3270 # Interactive 3270 emulation sessions
|vemmi # versatile multimedia interface
|wais # Wide Area Information Servers
|z39\.50r # Z39.50 Retrieval
|z39\.50s # Z39.50 Session
|chrome # Mozilla specific
|find # Mozilla specific
|clsid # Microsoft specific
|javascript # JavaScript
|isbn # ISBN (int. book numbers)
):"""
# re.VERBOSE makes the regex engine skip the whitespace and per-scheme
# comments embedded in the pattern source above
ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
# main check function
def check_urls (consumer):
    """Run the link check loop for the given consumer, which bundles all
    runtime-dependent options. May be called more than once with
    differently configured consumers.
    """
    try:
        _check_urls(consumer)
    except KeyboardInterrupt:
        # user interrupt: let the consumer shut down cleanly
        consumer.abort()


def _check_urls (consumer):
    """Check urls until the consumer reports it is finished, emitting a
    status line every five seconds when status output is configured."""
    consumer.logger_start_output()
    started = time.time()
    last_status = started
    while not consumer.finished():
        consumer.check_url()
        if consumer.config['status']:
            now = time.time()
            if (now - last_status) > 5:
                consumer.print_status(now, started)
                last_status = now
    consumer.logger_end_output()
# file extensions we can parse recursively, mapped to compiled patterns
# matched against file names (all case-insensitive)
extensions = {
    "html": re.compile(r'(?i)\.s?html?$'),
    # opera bookmark file; the dot is escaped so only a literal
    # "opera.adr" matches (the old bare "." matched any character)
    "opera": re.compile(r'(?i)^opera\.adr$'),
    "css": re.compile(r'(?i)\.css$'), # CSS stylesheet
}
import linkcheck.checker.fileurl
import linkcheck.checker.ignoredurl
import linkcheck.checker.ftpurl
import linkcheck.checker.gopherurl
import linkcheck.checker.httpurl
import linkcheck.checker.httpsurl
import linkcheck.checker.mailtourl
import linkcheck.checker.telneturl
import linkcheck.checker.nntpurl
import linkcheck.checker.errorurl
def set_intern_url (url, klass, config):
    """add intern url pattern for url given on the command line"""
    linkcheck.log.debug(linkcheck.LOG_CHECK, "Set intern url for %r", url)
    # checker classes whose urls carry a host part we can match on
    remote_classes = (linkcheck.checker.httpurl.HttpUrl,
                      linkcheck.checker.httpsurl.HttpsUrl,
                      linkcheck.checker.ftpurl.FtpUrl)
    if klass in remote_classes:
        host = urlparse.urlsplit(url)[1]
        if host:
            # add scheme colon to link pattern
            pattern = "://%s" % re.escape(host)
            linkcheck.log.debug(linkcheck.LOG_CHECK, "Add intern domain %r",
                                pattern)
            config['internlinks'].append(linkcheck.get_link_pat(pattern))
    elif klass == linkcheck.checker.fileurl.FileUrl:
        # local files: treat every file: url as internal
        linkcheck.log.debug(linkcheck.LOG_CHECK, "Add intern pattern ^file:")
        config['internlinks'].append(linkcheck.get_link_pat("^file:"))
def absolute_url (base_url, base_ref, parent_url):
    """Search for the absolute url to detect the link type. This does not
    join any url fragments together! The result is lowercased to simplify
    urltype matching; an empty string is returned when none of the
    candidate urls is absolute."""
    # first absolute candidate wins, in this priority order
    for candidate in (base_url, base_ref, parent_url):
        if candidate and linkcheck.url.url_is_absolute(candidate):
            return candidate.lower()
    return ""
def get_url_from (base_url, recursion_level, consumer,
                  parent_url=None, base_ref=None, line=0, column=0,
                  name=None, cmdline=None):
    """get url data from given base data"""
    # urls given on the command line may still need quoting
    if cmdline and linkcheck.url.url_needs_quoting(base_url):
        base_url = linkcheck.url.url_quote(base_url)
    url = absolute_url(base_url, base_ref, parent_url)
    # dispatch table: scheme prefix -> checker class handling it
    scheme_classes = [
        ("http:", linkcheck.checker.httpurl.HttpUrl),
        ("ftp:", linkcheck.checker.ftpurl.FtpUrl),
        ("file:", linkcheck.checker.fileurl.FileUrl),
        ("telnet:", linkcheck.checker.telneturl.TelnetUrl),
        ("mailto:", linkcheck.checker.mailtourl.MailtoUrl),
        ("gopher:", linkcheck.checker.gopherurl.GopherUrl),
        ("https:", linkcheck.checker.httpsurl.HttpsUrl),
        ("nntp:", linkcheck.checker.nntpurl.NntpUrl),
        ("news:", linkcheck.checker.nntpurl.NntpUrl),
        ("snews:", linkcheck.checker.nntpurl.NntpUrl),
    ]
    for prefix, checker_class in scheme_classes:
        if url.startswith(prefix):
            klass = checker_class
            break
    else:
        if ignored_schemes_re.search(url):
            # ignored url
            klass = linkcheck.checker.ignoredurl.IgnoredUrl
        elif cmdline:
            # assume local file on command line
            klass = linkcheck.checker.fileurl.FileUrl
        else:
            # error url, no further checking, just log this
            klass = linkcheck.checker.errorurl.ErrorUrl
    if cmdline and not (consumer.config['internlinks'] or
                        consumer.config['externlinks']):
        # set automatic intern/extern stuff if no filter was given
        set_intern_url(url, klass, consumer.config)
    return klass(base_url, recursion_level, consumer,
                 parent_url=parent_url, base_ref=base_ref,
                 line=line, column=column, name=name)