linkchecker/linkcheck/linkparse.py
2005-01-20 12:07:04 +00:00

235 lines
7.8 KiB
Python

# -*- coding: iso-8859-1 -*-
# Copyright (C) 2001-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Find link tags in HTML text.
"""
import re
import linkcheck
import linkcheck.strformat
import linkcheck.linkname
import linkcheck.log
MAX_NAMELEN = 256
# ripped mainly from HTML::Tagset.pm
LinkTags = {
'a': [u'href'],
'applet': [u'archive', u'src'],
'area': [u'href'],
'bgsound': [u'src'],
'blockquote': [u'cite'],
'body': [u'background'],
'del': [u'cite'],
'embed': [u'pluginspage', u'src'],
'form': [u'action'],
'frame': [u'src', u'longdesc'],
'head': [u'profile'],
'iframe': [u'src', u'longdesc'],
'ilayer': [u'background'],
'img': [u'src', u'lowsrc', u'longdesc', u'usemap'],
'input': [u'src', u'usemap'],
'ins': [u'cite'],
'isindex': [u'action'],
'layer': [u'background', u'src'],
'link': [u'href'],
'meta': [u'content'],
'object': [u'classid', u'data', u'archive', u'usemap'],
'q': [u'cite'],
'script': [u'src', u'for'],
'table': [u'background'],
'td': [u'background'],
'th': [u'background'],
'tr': [u'background'],
'xmp': [u'href'],
None: [u'style'],
}
# matcher for <meta http-equiv=refresh> tags
refresh_re = re.compile(ur"(?i)^\d+;\s*url=(?P<url>.+)$")
css_url_re = re.compile(ur"url\((?P<url>[^\)]+)\)")
class TagFinder (object):
"""
Base class storing HTML parse messages in a list.
TagFinder instances are to be used as HtmlParser handlers.
"""
def __init__ (self, content):
"""
Store content in buffer.
"""
super(TagFinder, self).__init__()
self.content = content
# warnings and errors during parsing
self.parse_info = []
# parser object will be initialized when it is used as
# a handler object
self.parser = None
def _errorfun (self, msg, name):
"""
Append msg to error list.
"""
self.parse_info.append("%s at line %d col %d: %s" % \
(name, self.parser.last_lineno(), self.parser.last_column(), msg))
def warning (self, msg):
"""
Signal a filter/parser warning.
"""
self._errorfun(msg, "warning")
def error (self, msg):
"""
Signal a filter/parser error.
"""
self._errorfun(msg, "error")
def fatal_error (self, msg):
"""
Signal a fatal filter/parser error.
"""
self._errorfun(msg, "fatal error")
class MetaRobotsFinder (TagFinder):
"""
Class for finding robots.txt meta values in HTML.
"""
def __init__ (self, content):
"""
Store content in buffer and initialize flags.
"""
super(MetaRobotsFinder, self).__init__(content)
self.follow = True
self.index = True
linkcheck.log.debug(linkcheck.LOG_CHECK, "meta robots finder")
def start_element (self, tag, attrs):
"""
Search for meta robots.txt "nofollow" and "noindex" flags.
"""
if tag == 'meta':
if attrs.get('name') == 'robots':
val = attrs.get('content', u'').lower().split(u',')
self.follow = u'nofollow' not in val
self.index = u'noindex' not in val
class LinkFinder (TagFinder):
"""
Find a list of links. After parsing, self.urls
will be a list of parsed links entries with the format
(url, lineno, column, name, codebase).
"""
def __init__ (self, content, tags=None):
"""
Store content in buffer and initialize URL list.
"""
super(LinkFinder, self).__init__(content)
if tags is None:
self.tags = LinkTags
else:
self.tags = tags
self.urls = []
self.base_ref = u''
linkcheck.log.debug(linkcheck.LOG_CHECK, "link finder")
def start_element (self, tag, attrs):
"""
Search for links and store found URLs in a list.
"""
linkcheck.log.debug(linkcheck.LOG_CHECK, "LinkFinder tag %s attrs %s",
tag, attrs)
linkcheck.log.debug(linkcheck.LOG_CHECK,
"line %d col %d old line %d old col %d",
self.parser.lineno(), self.parser.column(),
self.parser.last_lineno(), self.parser.last_column())
if tag == "base" and not self.base_ref:
self.base_ref = attrs.get("href", u'')
tagattrs = self.tags.get(tag, [])
tagattrs.extend(self.tags.get(None, []))
for attr in tagattrs:
if attr not in attrs:
continue
# name of this link
name = self.get_link_name(tag, attrs, attr)
# possible codebase
if tag in ('applet', 'object'):
codebase = linkcheck.strformat.unquote(
attrs.get('codebase', u''))
else:
codebase = u''
value = linkcheck.strformat.unquote(attrs[attr])
# add link to url list
self.add_link(tag, attr, value, name, codebase)
linkcheck.log.debug(linkcheck.LOG_CHECK,
"LinkFinder finished tag %s", tag)
def get_link_name (self, tag, attrs, attr):
"""
Parse attrs for link name. Return name of link.
"""
if tag == 'a' and attr == 'href':
name = linkcheck.strformat.unquote(attrs.get('title', u''))
if not name:
pos = self.parser.pos()
# Look for name only up to MAX_NAMELEN characters from current
# position, to limit the amount of data to encode.
data = self.content[pos:pos+MAX_NAMELEN]
data = data.decode(self.parser.encoding, "ignore")
name = linkcheck.linkname.href_name(data)
elif tag == 'img':
name = linkcheck.strformat.unquote(attrs.get('alt', u''))
if not name:
name = linkcheck.strformat.unquote(attrs.get('title', u''))
else:
name = u""
return name
def add_link (self, tag, attr, url, name, base):
"""
Add given url data to url list.
"""
urls = []
# look for meta refresh
if tag == 'meta':
mo = refresh_re.match(url)
if mo:
urls.append(mo.group("url"))
elif attr == 'style':
for mo in css_url_re.finditer(url):
urls.append(mo.group("url"))
else:
urls.append(url)
if not urls:
# no url found
return
for u in urls:
assert isinstance(tag, unicode), tag
assert isinstance(attr, unicode), attr
assert isinstance(u, unicode), u
assert isinstance(name, unicode), name
assert isinstance(base, unicode), base
linkcheck.log.debug(linkcheck.LOG_CHECK,
u"LinkParser add link %s %s %s %s %s", tag, attr, u, name, base)
self.urls.append((u, self.parser.last_lineno(),
self.parser.last_column(), name, base))