"""Base URL handler""" # Copyright (C) 2000,2001 Bastian Kleineidam # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. import sys, re, urlparse, urllib, time, traceback, socket, select import DNS, Config, StringUtil, linkcheck, linkname from debuglevels import * debug = Config.debug # helper function for internal errors def internal_error (): print >> sys.stderr, linkcheck._("""\n********** Oops, I did it again. ************* You have found an internal error in LinkChecker. Please write a bug report to %s and include the following information. If you disclose some information because its too private to you thats ok. I will try to help you nontheless (but you have to give me *something* I can work with ;). """) % Config.Email type,value = sys.exc_info()[:2] print >> sys.stderr, type, value import traceback traceback.print_exc() print_app_info() print >> sys.stderr, linkcheck._("\n******** LinkChecker internal error, bailing out ********") sys.exit(1) def print_app_info (): import os print >> sys.stderr, linkcheck._("System info:") print >> sys.stderr, Config.App print >> sys.stderr, "Python %s on %s" % (sys.version, sys.platform) for key in ("LC_ALL", "LC_MESSAGES", "http_proxy", "ftp_proxy"): value = os.getenv(key) if value is not None: print >> sys.stderr, key, "=", `value` # we catch these exceptions, all other exceptions are internal # or system errors ExcList = [ IOError, ValueError, # from httplib.py linkcheck.error, DNS.Error, linkcheck.timeoutsocket.Timeout, socket.error, select.error, ] # regular expression to match an HTML tag with one given attribute _linkMatcher = r""" (?i) # case insensitive < # open tag \s* # whitespace %s # tag name \s+ # whitespace [^>]*? # skip leading attributes (fails on Python 2.2b2) %s # attrib name \s* # whitespace = # equal sign \s* # whitespace (?P # attribute value ".*?" | # in double quotes '.*?' | # in single quotes [^\s>]+) # unquoted ([^">]|".*?")* # skip trailing attributes > # close tag """ # ripped mainly from HTML::Tagset.pm LinkTags = ( ("a", ["href"]), ("applet", ["archive", "codebase", "src"]), ("area", ["href"]), ("bgsound", ["src"]), ("blockquote", ["cite"]), ("body", ["background"]), ("del", ["cite"]), ("embed", ["pluginspage", "src"]), ("form", ["action"]), ("frame", ["src", "longdesc"]), ('head', ['profile']), ("iframe", ["src", "longdesc"]), ("ilayer", ["background"]), ("img", ["src", "lowsrc", "longdesc", "usemap"]), ('input', ['src', 'usemap']), ('ins', ['cite']), ('isindex', ['action']), ('layer', ['background', 'src']), ("link", ["href"]), ("meta", ["url"]), # ('object', ['classid', 'codebase', 'data', 'archive', 'usemap']), ('q', ['cite']), ('script', ['src', 'for']), ('table', ['background']), ('td', ['background']), ('th', ['background']), ('tr', ['background']), ('xmp', ['href']), ) LinkPatterns = [] for tag,attrs in LinkTags: for attr in attrs: LinkPatterns.append({'pattern': re.compile(_linkMatcher % (tag, attr), re.VERBOSE|re.DOTALL), 'tag': tag, 'attr': attr}) AnchorPattern = { 'pattern': re.compile(_linkMatcher % ("a", "name"), re.VERBOSE|re.DOTALL), 'tag': 'a', 'attr': 'name', } BasePattern = { 'pattern': re.compile(_linkMatcher % ("base", "href"), re.VERBOSE), 'tag': 'base', 'attr': 'href', } #CommentPattern = re.compile("