"""Base URL handler""" # Copyright (C) 2000,2001 Bastian Kleineidam # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. import sys,re,string,urlparse,urllib,time,DNS import Config,StringUtil,linkcheck,linkname from linkcheck import _ debug = linkcheck.Config.debug from debuglevels import * ExcList = [ IOError, ValueError, # from httplib.py linkcheck.error, EOFError, # from ftplib.py DNS.Error, ] try: import socket ExcList.append(socket.error) except ImportError: pass _linkMatcher = r""" (?i) # case insensitive < # open tag \s* # whitespace %s # tag name \s+ # whitespace [^>]*? # skip leading attributes %s # attrib name \s* # whitespace = # equal sign \s* # whitespace (?P # attribute value ".*?" | # in double quotes '.*?' | # in single quotes [^\s>]+) # unquoted ([^">]|".*?")* # skip trailing attributes > # close tag """ # ripped mainly from HTML::Tagset.pm LinkTags = ( ("a", ["href"]), ("applet", ["archive", "codebase", "src"]), ("area", ["href"]), ("bgsound", ["src"]), ("blockquote", ["cite"]), ("body", ["background"]), ("del", ["cite"]), ("embed", ["pluginspage", "src"]), ("form", ["action"]), ("frame", ["src", "longdesc"]), ('head', ['profile']), ("iframe", ["src", "longdesc"]), ("ilayer", ["background"]), ("img", ["src", "lowsrc", "longdesc", "usemap"]), ('input', ['src', 'usemap']), ('ins', ['cite']), ('isindex', ['action']), ('layer', ['background', 'src']), ("link", ["href"]), ("meta", ["url"]), # ('object', ['classid', 'codebase', 'data', 'archive', 'usemap']), ('q', ['cite']), ('script', ['src', 'for']), ('table', ['background']), ('td', ['background']), ('th', ['background']), ('tr', ['background']), ('xmp', ['href']), ) LinkPatterns = [] for tag,attrs in LinkTags: for attr in attrs: LinkPatterns.append({'pattern': re.compile(_linkMatcher % (tag, attr), re.VERBOSE|re.DOTALL), 'tag': tag, 'attr': attr}) AnchorPattern = { 'pattern': re.compile(_linkMatcher % ("a", "name"), re.VERBOSE|re.DOTALL), 'tag': 'a', 'attr': 'name', } BasePattern = { 'pattern': re.compile(_linkMatcher % ("base", "href"), re.VERBOSE), 'tag': 'base', 'attr': 'href', } CommentPattern = re.compile("