"""Base URL handler""" # Copyright (C) 2000,2001 Bastian Kleineidam # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. import sys, re, urlparse, urllib, time, traceback, socket, select import Config, StringUtil, linkcheck, linkname, test_support from debuglevels import * debug = Config.debug # helper function for internal errors def internal_error (): print >> sys.stderr, linkcheck._("""\n********** Oops, I did it again. ************* You have found an internal error in LinkChecker. Please write a bug report at http://sourceforge.net/tracker/?func=add&group_id=1913&atid=101913 or send mail to %s and include the following information: 1) The URL or file you are testing 2) Your commandline arguments and/or configuration. 3) The system information below. If you disclose some information because its too private to you thats ok. I will try to help you nontheless (but you have to give me *something* I can work with ;). """) % Config.Email type,value = sys.exc_info()[:2] print >> sys.stderr, type, value import traceback traceback.print_exc() print_app_info() print >> sys.stderr, linkcheck._("\n******** LinkChecker internal error, bailing out ********") sys.exit(1) def print_app_info (): import os print >> sys.stderr, linkcheck._("System info:") print >> sys.stderr, Config.App print >> sys.stderr, "Python %s on %s" % (sys.version, sys.platform) for key in ("LC_ALL", "LC_MESSAGES", "http_proxy", "ftp_proxy"): value = os.getenv(key) if value is not None: print >> sys.stderr, key, "=", `value` # we catch these exceptions, all other exceptions are internal # or system errors ExcList = [ IOError, ValueError, # from httplib.py linkcheck.error, linkcheck.DNS.Error, linkcheck.timeoutsocket.Timeout, socket.error, select.error, ] if hasattr(socket, "sslerror"): ExcList.append(socket.sslerror) # regular expression to match an HTML tag with one given attribute _linkMatcher = r""" (?i) # case insensitive < # open tag \s* # whitespace %s # tag name \s+ # whitespace ([^"'>]|"[^"]"|'[^']')*? # skip leading attributes %s # attrib name \s* # whitespace = # equal sign \s* # whitespace (?P # attribute value "[^"]*" | # in double quotes '[^']*' | # in single quotes [^\s>]+) # unquoted ([^"'>]|"[^"]"|'[^']')* # skip trailing attributes > # close tag """ # ripped mainly from HTML::Tagset.pm LinkTags = ( (['a'], ['href']), (['applet'], ['archive', 'codebase', 'src']), (['area'], ['href']), (['bgsound'], ['src']), (['blockquote'], ['cite']), (['del'], ['cite']), (['embed'], ['pluginspage', 'src']), (['form'], ['action']), (['frame'], ['src', 'longdesc']), (['head'], ['profile']), (['iframe'], ['src', 'longdesc']), (['ilayer'], ['background']), (['img'], ['src', 'lowsrc', 'longdesc', 'usemap']), (['input'], ['src', 'usemap']), (['ins'], ['cite']), (['isindex'], ['action']), (['layer'], ['background', 'src']), (['link'], ['href']), (['meta'], ['url']), # (['object'], ['classid', 'codebase', 'data', 'archive', 'usemap']), (['q'], ['cite']), (['script'], ['src', 'for']), (['body', 'table', 'td', 'th', 'tr'], ['background']), (['xmp'], ['href']), ) LinkPatterns = [] for tags,attrs in LinkTags: attr = '(%s)'%'|'.join(attrs) tag = '(%s)'%'|'.join(tags) LinkPatterns.append({'pattern': re.compile(_linkMatcher % (tag, attr), re.VERBOSE|re.DOTALL), 'tag': tag, 'attr': attr}) AnchorPattern = { 'pattern': re.compile(_linkMatcher % ("a", "name"), re.VERBOSE|re.DOTALL), 'tag': 'a', 'attr': 'name', } BasePattern = { 'pattern': re.compile(_linkMatcher % ("base", "href"), re.VERBOSE), 'tag': 'base', 'attr': 'href', } #CommentPattern = re.compile("