mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-21 16:30:28 +00:00
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@625 e7d03fd6-7b0d-0410-9947-9c21f3af8025
121 lines
3.3 KiB
Python
121 lines
3.3 KiB
Python
"""A parser for HTML"""
|
|
# Copyright (C) 2000,2001 Bastian Kleineidam
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program; if not, write to the Free Software
|
|
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
|
|
import sys
try:
    import htmlsax
except ImportError:
    # The htmlsax parser module is required by everything below, so
    # report the problem on stderr and exit with a failure status.
    # The message ends with a newline so that whatever is printed next
    # (for example the shell prompt) starts on its own line.
    sys.stderr.write("""Could not import the `htmlsax' parser module.\n""")
    sys.exit(1)
|
|
|
|
class HtmlParser:
    """Thin wrapper around the C SAX parser of the htmlsax module.

    No callbacks are defined on this class (for compatibility). The
    callbacks currently recognized are:
        comment(data): <!--data-->
        startElement(tag, attrs): <tag {attr1:value1,attr2:value2,..}>
        endElement(tag): </tag>
        doctype(data): <!DOCTYPE data?>
        pi(name, data=None): <?name data?>
        cdata(data): <![CDATA[data]]>
        characters(data): data

    In addition there are error and warning callbacks:
        error(msg)
        warning(msg)
        fatalError(msg)
    """

    def __init__(self):
        """create the internal parser, handing it this object"""
        self.parser = htmlsax.parser(self)

    def feed(self, data):
        """hand the given data to the internal parser"""
        internal = self.parser
        internal.feed(data)

    def lineno(self):
        """return the line number the internal parser currently reports"""
        current_line = self.parser.lineno()
        return current_line

    def last_lineno(self):
        """return the line number the internal parser reports for the
        last token"""
        token_line = self.parser.last_lineno()
        return token_line

    def column(self):
        """return the column the internal parser currently reports"""
        current_column = self.parser.column()
        return current_column

    def last_column(self):
        """return the column the internal parser reports for the last
        token"""
        token_column = self.parser.last_column()
        return token_column

    def pos(self):
        """return the buffer position the internal parser currently
        reports"""
        buffer_position = self.parser.pos()
        return buffer_position

    def flush(self):
        """tell the internal parser to flush all data"""
        internal = self.parser
        internal.flush()

    def reset(self):
        """tell the internal parser to reset itself (no flush is done)"""
        internal = self.parser
        internal.reset()
|
|
|
|
|
|
class HtmlPrinter (HtmlParser):
    """Debugging parser that prints every callback it is asked for.

    Any attribute that normal lookup does not find is treated as a
    parser callback: it resolves to a function which prints the
    callback name, the positional arguments it was called with, and
    the line and column of the last token.
    """

    def __getattr__ (self, name):
        """Return a printing callback bound to the requested name.

        Raises AttributeError for double-underscore special names, so
        that special-method lookups which reach this hook get the
        default behaviour instead of a printing callback.
        """
        if name[:2] == "__" and name[-2:] == "__":
            raise AttributeError(name)
        # Still record the most recently requested name, for code that
        # calls _print() directly.
        self.mem = name
        def callback (*attrs):
            # The name is bound per callback: a callback fetched
            # earlier keeps printing its own name even after other
            # attributes have been looked up in the meantime.
            self._emit(name, attrs)
        return callback

    def _print (self, *attrs):
        """Print the most recently requested name together with attrs."""
        self._emit(self.mem, attrs)

    def _emit (self, name, attrs):
        """Write name, attrs and the last token's line and column to
        stdout as one space-separated line."""
        sys.stdout.write("%s %s %s %s\n" %
            (name, attrs, self.last_lineno(), self.last_column()))
|
|
|
|
|
|
def _test():
    """Feed a fixed series of markup snippets to an HtmlPrinter, one
    feed() call per snippet, then flush it."""
    snippets = (
        "<hTml>",
        "<a href>",
        "<a href=''>",
        '<a href="">',
        "<a href='a'>",
        '<a href="a">',
        "<a href=a>",
        "<a href='\"'>",
        "<a href=\"'\">",
        "<a href=' '>",
        "<a href=a href=b>",
        "<a/>",
        "<a href/>",
        "<a href=a />",
        "</a>",
        "<?bla foo?>",
        "<?bla?>",
        "<!-- - comment -->",
        "<!---->",
        "<!DOCTYPE \"vla foo>",
    )
    printer = HtmlPrinter()
    for snippet in snippets:
        printer.feed(snippet)
    printer.flush()
|
|
|
|
def _broken ():
    """Feed an HtmlPrinter an <img> tag whose attribute name contains
    a backslash followed by a newline, then flush it."""
    markup = "<img bo\\\nrder=0>"
    printer = HtmlPrinter()
    printer.feed(markup)
    printer.flush()
|
|
|
|
|
|
if __name__ == "__main__":
    # Only the _broken() sample is run when the module is executed as
    # a script; the call to _test() is currently commented out.
    #_test()
    _broken()
|