# Mirror metadata (was plain text, breaking the Python file):
# mirror of https://github.com/Hopiu/linkchecker.git
# synced 2026-03-20 07:50:24 +00:00
# git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1300 e7d03fd6-7b0d-0410-9947-9c21f3af8025
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2001-2004 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

import re, StringUtil, linkname
# ripped mainly from HTML::Tagset.pm
|
|
LinkTags = {
|
|
'a': ['href'],
|
|
'applet': ['archive', 'src'],
|
|
'area': ['href'],
|
|
'bgsound': ['src'],
|
|
'blockquote': ['cite'],
|
|
'body': ['background'],
|
|
'del': ['cite'],
|
|
'embed': ['pluginspage', 'src'],
|
|
'form': ['action'],
|
|
'frame': ['src', 'longdesc'],
|
|
'head': ['profile'],
|
|
'iframe': ['src', 'longdesc'],
|
|
'ilayer': ['background'],
|
|
'img': ['src', 'lowsrc', 'longdesc', 'usemap'],
|
|
'input': ['src', 'usemap'],
|
|
'ins': ['cite'],
|
|
'isindex': ['action'],
|
|
'layer': ['background', 'src'],
|
|
'link': ['href'],
|
|
'meta': ['content'],
|
|
'object': ['classid', 'data', 'archive', 'usemap'],
|
|
'q': ['cite'],
|
|
'script': ['src', 'for'],
|
|
'table': ['background'],
|
|
'td': ['background'],
|
|
'th': ['background'],
|
|
'tr': ['background'],
|
|
'xmp': ['href'],
|
|
None: ['style'],
|
|
}
|
|
|
|
# matcher for <meta http-equiv=refresh> tags
|
|
_refresh_re = re.compile(r"(?i)^\d+;\s*url=(?P<url>.+)$")
|
|
css_url_re = re.compile(r"url\((?P<url>[^\)]+)\)")
|
|
|
|
class TagFinder (object):
|
|
"""base class storing parse messages in a list"""
|
|
def __init__ (self, content):
|
|
self.content = content
|
|
# warnings and errors during parsing
|
|
self.parse_info = []
|
|
|
|
|
|
def _errorfun (self, msg, name):
|
|
"""append msg to error list"""
|
|
self.parse_info.append("%s at line %d col %d: %s" % \
|
|
(name, self.parser.last_lineno(), self.parser.last_column(), msg))
|
|
|
|
|
|
def error (self, msg):
|
|
"""signal a filter/parser error"""
|
|
self._errorfun(msg, "error")
|
|
|
|
|
|
def warning (self, msg):
|
|
"""signal a filter/parser warning"""
|
|
self._errorfun(msg, "warning")
|
|
|
|
|
|
def fatalError (self, msg):
|
|
"""signal a fatal filter/parser error"""
|
|
self._errorfun(msg, "fatal error")
|
|
|
|
|
|
class MetaRobotsFinder (TagFinder):
|
|
"""class for finding robots.txt meta values in HTML"""
|
|
def __init__ (self, content):
|
|
super(MetaRobotsFinder, self).__init__(content)
|
|
self.follow = True
|
|
self.index = True
|
|
|
|
|
|
def startElement (self, tag, attrs):
|
|
if tag=='meta':
|
|
if attrs.get('name')=='robots':
|
|
val = attrs.get('content', '').lower().split(',')
|
|
self.follow = 'nofollow' not in val
|
|
self.index = 'noindex' not in val
|
|
|
|
|
|
class LinkFinder (TagFinder):
|
|
"""find a list of links. After parsing, the urls
|
|
will have a list of parsed links entries with the format
|
|
(url, lineno, column, name, base)
|
|
"""
|
|
|
|
def __init__ (self, content, tags=LinkTags):
|
|
super(LinkFinder, self).__init__(content)
|
|
self.tags = tags
|
|
self.urls = []
|
|
|
|
|
|
def startElement (self, tag, attrs):
|
|
debug(NIGHTMARE, "LinkFinder tag", tag, "attrs", attrs)
|
|
debug(NIGHTMARE, "line", self.parser.lineno(),
|
|
"col", self.parser.column(),
|
|
"old line", self.parser.last_lineno(),
|
|
"old col", self.parser.last_column())
|
|
tagattrs = self.tags.get(tag, [])
|
|
tagattrs.extend(self.tags.get(None, []))
|
|
for attr in tagattrs:
|
|
if attr in attrs:
|
|
# name of this link
|
|
if tag=='a' and attr=='href':
|
|
name = StringUtil.unquote(attrs.get('title', ''))
|
|
if not name:
|
|
name = linkname.href_name(self.content[self.parser.pos():])
|
|
elif tag=='img':
|
|
name = StringUtil.unquote(attrs.get('alt', ''))
|
|
if not name:
|
|
name = StringUtil.unquote(attrs.get('title', ''))
|
|
else:
|
|
name = ""
|
|
# possible codebase
|
|
if tag in ('applet', 'object'):
|
|
base = StringUtil.unquote(attrs.get('codebase'))
|
|
else:
|
|
base = ""
|
|
value = StringUtil.unquote(attrs[attr])
|
|
# add link to url list
|
|
self.addLink(tag, attr, value, name, base)
|
|
|
|
|
|
def addLink (self, tag, attr, url, name, base):
|
|
urls = []
|
|
# look for meta refresh
|
|
if tag=='meta':
|
|
mo = _refresh_re.match(url)
|
|
if mo:
|
|
urls.append(mo.group("url"))
|
|
elif attr=='style':
|
|
for mo in css_url_re.finditer(url):
|
|
urls.append(mo.group("url"))
|
|
else:
|
|
urls.append(url)
|
|
if not urls:
|
|
# no url found
|
|
return
|
|
for u in urls:
|
|
debug(NIGHTMARE, "LinkParser add link", tag, attr, u, name, base)
|
|
self.urls.append((u, self.parser.last_lineno(),
|
|
self.parser.last_column(), name, base))
|
|
|
|
|