"""Base URL handler"""
# Copyright (C) 2000,2001 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
import sys, re, urlparse, urllib, time, traceback, socket, select, DNS
import Config, StringUtil, linkcheck, linkname, test_support, timeoutsocket
from debuglevels import *

debug = Config.debug


# helper function for internal errors
def internal_error ():
    """Report an unexpected exception on stderr and terminate the program.

    Must be called while an exception is being handled: it prints a bug
    report request, the current exception type and value, the traceback
    and the system information, then exits with status 1.
    """
    # The message text is passed through linkcheck._ (presumably the
    # translation hook), so its wording is kept unchanged on purpose.
    print >> sys.stderr, linkcheck._("""\n********** Oops, I did it again. *************

You have found an internal error in LinkChecker.
Please write a bug report at
http://sourceforge.net/tracker/?func=add&group_id=1913&atid=101913
or send mail to %s and include the following information:
1) The URL or file you are testing
2) Your commandline arguments and/or configuration.
3) The system information below.

If you disclose some information because its too private to you thats ok.
I will try to help you nontheless (but you have to give me *something*
I can work with ;).
""") % Config.Email
    # do not shadow the builtin ``type`` with the exception class
    exc_type, exc_value = sys.exc_info()[:2]
    print >> sys.stderr, exc_type, exc_value
    traceback.print_exc()
    print_app_info()
    print >> sys.stderr, linkcheck._("\n******** LinkChecker internal error, bailing out ********")
    sys.exit(1)


def print_app_info ():
    """Print application, interpreter and environment details to stderr.

    Writes Config.App, the Python version and platform, and every one of
    the locale/proxy environment variables listed below that is set.
    """
    import os
    print >> sys.stderr, linkcheck._("System info:")
    print >> sys.stderr, Config.App
    print >> sys.stderr, "Python %s on %s" % (sys.version, sys.platform)
    for key in ("LC_ALL", "LC_MESSAGES", "http_proxy", "ftp_proxy"):
        value = os.getenv(key)
        if value is not None:
            # repr() makes empty values and stray whitespace visible
            print >> sys.stderr, key, "=", repr(value)


# we catch these exceptions, all other exceptions are internal
# or system errors
ExcList = [
    IOError,
    ValueError, # from httplib.py
    linkcheck.error,
    DNS.Error,
    timeoutsocket.Timeout,
    socket.error,
    select.error,
]
# socket.sslerror only exists when the socket module offers it
if hasattr(socket, "sslerror"):
    ExcList.append(socket.sslerror)

# regular expression to match an HTML tag with one given attribute
# The two %s placeholders take the tag name and the attribute name; the
# result has to be compiled with re.VERBOSE. The named group "value"
# holds the attribute value, including its quotes when it is quoted.
_linkMatcher = r"""
    (?i)           # case insensitive
    <              # open tag
    \s*            # whitespace
    %s             # tag name
    \s+            # whitespace
    ([^"'>]|"[^"\n]*"|'[^'\n]*')*         # skip leading attributes
    %s             # attrib name
    \s*            # whitespace
    =              # equal sign
    \s*            # whitespace
    (?P<value>     # attribute value
     "[^"\n]*" |   # in double quotes
     '[^'\n]*' |   # in single quotes
     [^\s>]+)      # unquoted
    ([^"'>]|"[^"\n]*"|'[^'\n]*')*         # skip trailing attributes
    >              # close tag
    """

# ripped mainly from HTML::Tagset.pm
# Each entry pairs a list of tag names with the attribute names checked
# on those tags.
LinkTags = (
    (['a'], ['href']),
    (['applet'], ['archive', 'codebase', 'src']),
    (['area'], ['href']),
    (['bgsound'], ['src']),
    (['blockquote'], ['cite']),
    (['del'], ['cite']),
    (['embed'], ['pluginspage', 'src']),
    (['form'], ['action']),
    (['frame'], ['src', 'longdesc']),
    (['head'], ['profile']),
    (['iframe'], ['src', 'longdesc']),
    (['ilayer'], ['background']),
    (['img'], ['src', 'lowsrc', 'longdesc', 'usemap']),
    (['input'], ['src', 'usemap']),
    (['ins'], ['cite']),
    (['isindex'], ['action']),
    (['layer'], ['background', 'src']),
    (['link'], ['href']),
    (['object'], ['classid', 'codebase', 'data', 'archive', 'usemap']),
    (['q'], ['cite']),
    (['script'], ['src', 'for']),
    (['body', 'table', 'td', 'th', 'tr'], ['background']),
    (['xmp'], ['href']),
    (['meta'], ['content']),
)

# matcher for values of the form "<digits>; url=<target>" (presumably the
# content attribute of a meta refresh tag); the target is the group "url"
_refresh_re = re.compile(r"(?i)^\d+;\s*url=(?P<url>.+)$")

# one compiled pattern per LinkTags entry; tag and attribute names of an
# entry are combined into regex alternations
LinkPatterns = []
for _tags,_attrs in LinkTags:
    _tag = '(%s)'%'|'.join(_tags)
    _attr = '(%s)'%'|'.join(_attrs)
    LinkPatterns.append({'pattern': re.compile(_linkMatcher % (_tag, _attr),
                                               re.VERBOSE),
                         'tags': _tags,
                         'attrs': _attrs})

# pattern for <a name=...> tags
AnchorPattern = {
    'pattern': re.compile(_linkMatcher % ("a", "name"), re.VERBOSE),
    'tags': ['a'],
    'attrs': ['name'],
}

# pattern for <base href=...> tags
BasePattern = {
    'pattern': re.compile(_linkMatcher % ("base", "href"), re.VERBOSE),
    'tags': ['base'],
    'attrs': ['href'],
}

#CommentPattern = re.compile("