"""A parser for HTML""" # Copyright (C) 2000,2001 Bastian Kleineidam # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. import sys try: import htmlsax except ImportError: sys.stderr.write("""Could not import the `htmlsax' parser module.""") sys.exit(1) class HtmlParser: """Use an internal C SAX parser. We do not define any callbacks here for compatibility. Currently recognized callbacks are: comment(data): startElement(tag, attrs): endElement(tag): doctype(data): pi(name, data=None): cdata(data): characters(data): data additionally, there are error and warning callbacks: error(msg) warning(msg) fatalError(msg) """ def __init__ (self): """initialize the internal parser""" self.parser = htmlsax.parser(self) def feed (self, data): """feed some data to the parser""" self.parser.feed(data) def lineno (self): """return current parser line number""" return self.parser.lineno() def last_lineno (self): """return parser line number of the last token""" return self.parser.last_lineno() def column (self): """return current parser column""" return self.parser.column() def last_column (self): """return parser column of the last token""" return self.parser.last_column() def pos (self): """return current parser buffer position""" return self.parser.pos() def flush (self): """flush all data""" self.parser.flush() def reset (self): """reset the parser (without flushing)""" self.parser.reset() class HtmlPrinter (HtmlParser): """handles all functions by printing the function name and attributes""" def __getattr__ (self, name): self.mem = name return self._print def _print (self, *attrs): print self.mem, attrs, self.last_lineno(), self.last_column() def _test(): p = HtmlPrinter() p.feed("") p.feed("") p.feed("") p.feed('') p.feed("") p.feed('') p.feed("") p.feed("") p.feed("") p.feed("") p.feed("") p.feed("") p.feed("") p.feed("") p.feed("") p.feed("") p.feed("") p.feed("") p.feed("") p.feed("") p.flush() def _broken (): p = HtmlPrinter() p.feed("") p.flush() if __name__ == '__main__': #_test() _broken()