linkchecker/linkcheck/linkparse.py
2002-11-26 01:11:57 +00:00

111 lines
4 KiB
Python

# Copyright (C) 2001 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
import re, StringUtil, linkname
from debuglevels import *
from linkcheck.parser.htmllib import HtmlParser
from linkcheck.Config import debug
# Table of HTML tags that can carry links: maps a tag name to the list of
# attribute names to inspect on that tag.
# Ripped mainly from HTML::Tagset.pm.
LinkTags = {
    # anchors, applets and image maps
    "a": ["href"],
    "applet": ["archive", "src"],
    "area": ["href"],
    "bgsound": ["src"],
    "blockquote": ["cite"],
    "body": ["background"],
    "del": ["cite"],
    "embed": ["pluginspage", "src"],
    "form": ["action"],
    # frames and document head
    "frame": ["src", "longdesc"],
    "head": ["profile"],
    "iframe": ["src", "longdesc"],
    "ilayer": ["background"],
    # images and form inputs
    "img": ["src", "lowsrc", "longdesc", "usemap"],
    "input": ["src", "usemap"],
    "ins": ["cite"],
    "isindex": ["action"],
    "layer": ["background", "src"],
    "link": ["href"],
    # only the refresh form of the content attribute is used as a link,
    # see LinkParser.addLink
    "meta": ["content"],
    "object": ["classid", "data", "archive", "usemap"],
    "q": ["cite"],
    "script": ["src", "for"],
    # table backgrounds
    "table": ["background"],
    "td": ["background"],
    "th": ["background"],
    "tr": ["background"],
    "xmp": ["href"],
}
# matcher for <meta http-equiv=refresh> tags
_refresh_re = re.compile(r"(?i)^\d+;\s*url=(?P<url>.+)$")
class LinkParser (HtmlParser):
    """Parse the content for a list of links. The content is parsed
    completely while the object is constructed. After parsing, the urls
    attribute holds a list of parsed link entries with the format
    (url, lineno, column, name, base)
    """

    def __init__ (self, content, tags=LinkTags):
        """Parse content and collect the links found in self.urls.

        content -- the text to feed to the parser
        tags -- mapping of a tag name to the list of attribute names
                whose values are recorded as links (default: LinkTags);
                it is only read, never modified
        """
        HtmlParser.__init__(self)
        self.content = content
        self.tags = tags
        self.urls = []
        self.feed(self.content)
        debug(HURT_ME_PLENTY, "flushing")
        self.flush()

    def startElement (self, tag, attrs):
        """Record a link for every attribute of tag listed in self.tags.

        Presumably called by HtmlParser for each start tag -- the call
        site is not in this module.

        tag -- the tag name, looked up in self.tags
        attrs -- mapping-like object of the attributes of the tag
        """
        debug(NIGHTMARE, "LinkParser tag", tag, "attrs", attrs)
        # The position trace fires for every start tag, so it is logged
        # at the same level as the tag trace above instead of ALWAYS.
        debug(NIGHTMARE, "line", self.lineno(), "col", self.column(),
              "old line", self.last_lineno(), "old col", self.last_column())
        if tag not in self.tags:
            return
        for attr in self.tags[tag]:
            if attr not in attrs:
                continue
            # name of this link
            if tag=='a' and attr=='href':
                name = StringUtil.unquote(attrs.get('title', ''))
                if not name:
                    # no title given: derive the name from the content
                    # starting at the current parser position
                    name = linkname.href_name(self.content[self.pos():])
            elif tag=='img':
                name = StringUtil.unquote(attrs.get('alt', ''))
            else:
                name = ""
            # possible codebase
            if tag in ('applet', 'object'):
                # default to '' like the title/alt lookups above, so a
                # missing codebase attribute never hands None to unquote
                base = StringUtil.unquote(attrs.get('codebase', ''))
            else:
                base = ""
            # add link to url list
            value = StringUtil.unquote(attrs[attr])
            self.addLink(tag, attr, value, name, base)

    def addLink (self, tag, attr, url, name, base):
        """Append the entry (url, lineno, column, name, base) to self.urls.

        For meta tags the url is taken from a refresh value matched by
        _refresh_re; any other meta content is ignored. The stored
        position is self.last_lineno() / self.last_column().
        """
        debug(NIGHTMARE, "LinkParser add link", tag, attr, url, name, base)
        # look for meta refresh
        if tag=='meta':
            metamatch = _refresh_re.match(url)
            if not metamatch:
                # only meta refresh has an url, so return
                return
            url = metamatch.group("url")
        self.urls.append((url, self.last_lineno(), self.last_column(),
                          name, base))