mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-22 08:50:24 +00:00
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1008 e7d03fd6-7b0d-0410-9947-9c21f3af8025
123 lines
3.4 KiB
Python
123 lines
3.4 KiB
Python
# -*- coding: iso-8859-1 -*-
|
|
"""A parser for HTML"""
|
|
# Copyright (C) 2000-2003 Bastian Kleineidam
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program; if not, write to the Free Software
|
|
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
|
|
import sys
|
|
# The parser extension module is required; without it this module cannot
# work, so report the import problem on stderr and exit with status 1.
try:
    import htmlsax
except ImportError:
    # sys.exc_info() is used to fetch the active exception instance.
    value = sys.exc_info()[1]
    headline = "Could not import the parser module `htmlsax':"
    sys.stderr.write("%s %s\n" % (headline, value))
    sys.stderr.write("Please check your installation of LinkChecker.\n")
    sys.exit(1)
|
|
|
|
|
|
class HtmlParser (object):
    """Use an internal C SAX parser. We do not define any callbacks
    here for compatibility. Currently recognized callbacks are:
      comment(data): <!--data-->
      startElement(tag, attrs): <tag {attr1:value1,attr2:value2,..}>
      endElement(tag): </tag>
      doctype(data): <!DOCTYPE data?>
      pi(name, data=None): <?name data?>
      cdata(data): <![CDATA[data]]>
      characters(data): data

    additionally, there are error and warning callbacks:
      error(msg)
      warning(msg)
      fatalError(msg)

    Attributes that are not found on this object are looked up on the
    wrapped parser object stored in self.parser.
    """

    def __init__ (self):
        """initialize the internal parser, passing this object to it"""
        self.parser = htmlsax.parser(self)

    def __getattr__ (self, name):
        """delegate unknown attrs to self.parser

        Raises AttributeError if the wrapped parser has no such attribute,
        or if the wrapped parser itself has not been set yet.
        """
        # __getattr__ only runs when normal lookup failed.  If the missing
        # name is 'parser' itself (e.g. while htmlsax.parser(self) is still
        # running in __init__), reading self.parser below would re-enter
        # this method forever, so fail cleanly instead.
        if name == "parser":
            raise AttributeError(name)
        return getattr(self.parser, name)
|
|
|
|
|
|
class HtmlPrinter (HtmlParser):
    """handles all functions by printing the function name and
    attributes

    Any attribute that neither this object nor the wrapped parser has is
    answered with a callable that prints the attribute name together with
    the arguments it was called with.
    """

    def _print (self, *attrs):
        """print the callback name stored in self.mem, the call arguments
        and the values of last_lineno() and last_column() to stdout"""
        # last_lineno/last_column are not defined in this class; they are
        # resolved through __getattr__ (presumably by the wrapped parser).
        fields = (self.mem, attrs, self.last_lineno(), self.last_column())
        sys.stdout.write("%s %s %s %s\n" % fields)

    def _errorfun (self, msg, name):
        """print msg to stderr with name prefix"""
        pos = "%d:%d:" % (self.lineno(), self.column())
        sys.stderr.write("%s %s %s\n" % (name, pos, msg))

    def error (self, msg):
        """signal a filter/parser error"""
        self._errorfun(msg, "error:")

    def warning (self, msg):
        """signal a filter/parser warning"""
        self._errorfun(msg, "warning:")

    def fatalError (self, msg):
        """signal a fatal filter/parser error"""
        self._errorfun(msg, "fatal error:")

    def __getattr__ (self, name):
        """delegate unknown attrs to self.parser; names the parser does
        not have either are answered with a printing callback"""
        # Reading self.parser while it is unset would re-enter this method
        # without end, so report the missing attribute instead.
        if name == "parser":
            raise AttributeError(name)
        if hasattr(self.parser, name):
            return getattr(self.parser, name)
        # Still record the name at lookup time, as before.
        self.mem = name

        def callback (*attrs):
            # Bind the name to this callable: a callback that was fetched
            # before another lookup must still report its own name when
            # it is finally invoked.
            self.mem = name
            return self._print(*attrs)
        return callback
|
|
|
|
|
|
def _test():
    """Feed a fixed series of small markup snippets to an HtmlPrinter,
    one feed() call per snippet in the listed order, then flush it."""
    snippets = (
        "<hTml>",
        "<a href>",
        "<a href=''>",
        '<a href="">',
        "<a href='a'>",
        '<a href="a">',
        "<a href=a>",
        "<a href='\"'>",
        "<a href=\"'\">",
        "<a href=' '>",
        "<a href=a href=b>",
        "<a/>",
        "<a href/>",
        "<a href=a />",
        "</a>",
        "<?bla foo?>",
        "<?bla?>",
        "<!-- - comment -->",
        "<!---->",
        "<!DOCTYPE \"vla foo>",
    )
    printer = HtmlPrinter()
    for snippet in snippets:
        printer.feed(snippet)
    printer.flush()
|
|
|
|
|
|
def _broken ():
    """Feed a single <img> tag whose attribute name contains a backslash
    followed by a newline to an HtmlPrinter, then flush it."""
    markup = "<img bo\\\nrder=0>"
    printer = HtmlPrinter()
    printer.feed(markup)
    printer.flush()
|
|
|
|
|
|
if __name__ == "__main__":
    # Running this file as a script only exercises _broken(); _test() is
    # available in this module but is not called here.
    _broken()
|