2003-07-04 14:24:44 +00:00
|
|
|
# -*- coding: iso-8859-1 -*-
|
2002-11-23 23:09:11 +00:00
|
|
|
"""A parser for HTML"""
|
2004-01-03 14:59:33 +00:00
|
|
|
# Copyright (C) 2000-2004 Bastian Kleineidam
|
2002-11-23 23:09:11 +00:00
|
|
|
#
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
|
|
|
# (at your option) any later version.
|
|
|
|
|
#
|
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
|
#
|
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
|
# along with this program; if not, write to the Free Software
|
|
|
|
|
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
|
|
|
|
2004-01-28 22:33:34 +00:00
|
|
|
__version__ = "$Revision$"[11:-2]
|
|
|
|
|
__date__ = "$Date$"[7:-2]
|
|
|
|
|
|
2002-11-23 23:09:11 +00:00
|
|
|
import sys
|
|
|
|
|
try:
    import htmlsax
except ImportError:
    # The parser module is required: report the underlying import error on
    # stderr and terminate with exit status 1.
    # The exception is fetched through sys.exc_info() rather than bound in
    # the except clause so this syntax is accepted by Python 2 and Python 3.
    value = sys.exc_info()[1]
    sys.stderr.write(
        "Could not import the parser module `htmlsax': %s\n" % (value,))
    sys.stderr.write("Please check your installation of LinkChecker.\n")
    sys.exit(1)
|
|
|
|
|
|
2003-04-30 13:56:07 +00:00
|
|
|
|
2004-01-28 22:33:34 +00:00
|
|
|
class HtmlPrinter (object):
    """Handler that prints every callback it receives.

    Looking up any attribute that is not defined on the class yields a
    callable which writes the attribute name followed by the tuple of
    positional arguments to stdout. error(), warning() and fatalError()
    write their message to stderr with a matching prefix.
    """

    # Name of the most recently requested callback. The class-level default
    # keeps a read of ``mem`` from falling through to __getattr__.
    mem = None

    def _print (self, *attrs):
        """Write the most recently requested callback name and attrs."""
        sys.stdout.write("%s %s\n" % (self.mem, attrs))

    def _errorfun (self, msg, name):
        """Write msg to stderr, prefixed with name."""
        sys.stderr.write("%s %s\n" % (name, msg))

    def error (self, msg):
        """Signal a filter/parser error."""
        self._errorfun(msg, "error:")

    def warning (self, msg):
        """Signal a filter/parser warning."""
        self._errorfun(msg, "warning:")

    def fatalError (self, msg):
        """Signal a fatal filter/parser error."""
        self._errorfun(msg, "fatal error:")

    def __getattr__ (self, name):
        """Return a callable that prints name and the arguments it gets.

        The returned callable is bound to the requested name, so it keeps
        printing that name even when other attributes are looked up before
        it is called. Special names of the form __xxx__ raise
        AttributeError so that lookups such as __deepcopy__ are not
        answered with a printing callback.
        """
        if name.startswith("__") and name.endswith("__"):
            raise AttributeError(name)
        # still remember the name for code that reads ``mem`` directly
        self.mem = name

        def handler (*attrs):
            sys.stdout.write("%s %s\n" % (name, attrs))
        return handler
|
|
|
|
|
|
|
|
|
|
|
2004-01-28 22:33:34 +00:00
|
|
|
def quote_attrval (val):
    """Quote an HTML attribute value so it can be wrapped in double quotes.

    Every double quote character in val is replaced with the &quot;
    character reference; all other characters are returned unchanged.
    """
    return val.replace('"', '&quot;')
|
|
|
|
|
|
|
|
|
|
|
2002-11-23 23:09:11 +00:00
|
|
|
def _test():
    """Feed a fixed series of markup snippets to a printing parser."""
    snippets = (
        "<hTml>",
        "<a href>",
        "<a href=''>",
        '<a href="">',
        "<a href='a'>",
        '<a href="a">',
        "<a href=a>",
        "<a href='\"'>",
        "<a href=\"'\">",
        "<a href=' '>",
        "<a href=a href=b>",
        "<a/>",
        "<a href/>",
        "<a href=a />",
        "</a>",
        "<?bla foo?>",
        "<?bla?>",
        "<!-- - comment -->",
        "<!---->",
        "<!DOCTYPE \"vla foo>",
    )
    parser = htmlsax.parser(HtmlPrinter())
    # each snippet is fed separately, in order, before the final flush
    for snippet in snippets:
        parser.feed(snippet)
    parser.flush()
|
|
|
|
|
|
|
|
|
|
def _broken ():
    """Feed a single <base> tag to a printing parser with debugging on."""
    markup = """<base href="http://www.msnbc.com/news/">"""
    parser = htmlsax.parser(HtmlPrinter())
    # turn on debugging
    parser.debug(1)
    parser.feed(markup)
    parser.flush()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # When run as a script, execute the debugging run in _broken();
    # call _test() here instead to feed the larger set of sample snippets.
    _broken()
|