linkchecker/linkcheck/parser/htmllib.py
2002-11-26 01:06:49 +00:00

121 lines
3.3 KiB
Python

"""A parser for HTML"""
# Copyright (C) 2000,2001 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
import sys
try:
    import htmlsax
except ImportError:
    # Everything in this module delegates to htmlsax, so a failed import is
    # fatal: report it on stderr and exit with status 1.  The message ends
    # with a newline so it does not run into whatever is printed next.
    sys.stderr.write("""Could not import the `htmlsax' parser module.\n""")
    sys.exit(1)
class HtmlParser:
    """Wrapper around the C SAX parser provided by the htmlsax module.

    No callbacks are defined in this class, for compatibility. The
    instance itself is passed to htmlsax.parser(). Callbacks that are
    currently recognized:

        comment(data):            <!--data-->
        startElement(tag, attrs): <tag {attr1:value1,attr2:value2,..}>
        endElement(tag):          </tag>
        doctype(data):            <!DOCTYPE data?>
        pi(name, data=None):      <?name data?>
        cdata(data):              <![CDATA[data]]>
        characters(data):         data

    Error and warning callbacks:

        error(msg)
        warning(msg)
        fatalError(msg)
    """

    def __init__(self):
        """Create the internal htmlsax parser, handing it this object."""
        self.parser = htmlsax.parser(self)

    def feed(self, data):
        """Pass a chunk of data on to the internal parser."""
        internal = self.parser
        internal.feed(data)

    def lineno(self):
        """Return the line number the parser is currently at."""
        current_line = self.parser.lineno()
        return current_line

    def last_lineno(self):
        """Return the line number of the last token."""
        token_line = self.parser.last_lineno()
        return token_line

    def column(self):
        """Return the column the parser is currently at."""
        current_column = self.parser.column()
        return current_column

    def last_column(self):
        """Return the column of the last token."""
        token_column = self.parser.last_column()
        return token_column

    def pos(self):
        """Return the current position in the parser buffer."""
        buffer_position = self.parser.pos()
        return buffer_position

    def flush(self):
        """Flush all data held by the internal parser."""
        internal = self.parser
        internal.flush()

    def reset(self):
        """Reset the internal parser (without flushing)."""
        internal = self.parser
        internal.reset()
class HtmlPrinter (HtmlParser):
    """Debugging handler that prints every callback invoked on it.

    Any attribute that normal lookup does not find is treated as a
    callback: calling it prints the callback name, the tuple of arguments
    and the line and column of the last token.
    """

    def __getattr__ (self, name):
        """Return a printing callback bound to the requested name.

        Each returned callable remembers its own name, so a callback that
        was looked up earlier still prints the right name even if other
        attributes were looked up in between.

        Names of the form __xxx__ raise AttributeError instead, so that
        special method lookups which reach __getattr__ (for example
        __del__ or __repr__ on old-style instances, or __deepcopy__ from
        the copy module) are not answered with a printing callback.
        """
        if name.startswith('__') and name.endswith('__'):
            raise AttributeError(name)

        def callback (*attrs):
            # self.mem is still updated at call time, so it keeps naming
            # the callback currently being printed.
            self.mem = name
            self._print(*attrs)
        return callback

    def _print (self, *attrs):
        """Write the name in self.mem, the arguments and the position of
        the last token to standard output as one line."""
        # sys.stdout.write with explicit formatting yields the same text as
        # the former print statement and works on Python 2 and 3 alike.
        sys.stdout.write("%s %s %s %s\n" % (
            self.mem, attrs, self.last_lineno(), self.last_column()))
def _test():
    """Feed a fixed series of markup snippets to an HtmlPrinter, one feed()
    call per snippet and in the listed order, then flush the parser."""
    snippets = (
        "<hTml>",
        "<a href>",
        "<a href=''>",
        '<a href="">',
        "<a href='a'>",
        '<a href="a">',
        "<a href=a>",
        "<a href='\"'>",
        "<a href=\"'\">",
        "<a href=' '>",
        "<a href=a href=b>",
        "<a/>",
        "<a href/>",
        "<a href=a />",
        "</a>",
        "<?bla foo?>",
        "<?bla?>",
        "<!-- - comment -->",
        "<!---->",
        "<!DOCTYPE \"vla foo>",
    )
    printer = HtmlPrinter()
    for snippet in snippets:
        printer.feed(snippet)
    printer.flush()
def _broken():
    """Feed an HtmlPrinter one img tag whose attribute name is interrupted
    by a backslash followed by a newline, then flush the parser."""
    snippet = "<img bo\\\nrder=0>"
    printer = HtmlPrinter()
    printer.feed(snippet)
    printer.flush()
if __name__ == '__main__':
    # When run as a script only _broken() is executed; the call to _test()
    # is left commented out and can be re-enabled by hand.
    #_test()
    _broken()