Remove support for start_end_element() callback

The LinkFinder handler start_end_element() callback does nothing apart
from calling start_element().
This commit is contained in:
Chris Mayo 2020-04-09 20:15:15 +01:00
parent c9f17e92b9
commit eb3cf28baa
5 changed files with 61 additions and 104 deletions

View file

@ -33,13 +33,6 @@ Used callbacks of a handler are:
@param attrs: tag attributes
@type attrs: ListDict
- Start-end tag: <tag {attr1:value1, attr2:value2, ..}/>
def start_end_element(tag, attrs):
@param tag: tag name
@type tag: Unicode string
@param attrs: tag attributes
@type attrs: ListDict
Additionally, there are error and warning callbacks:
- Parser warning.

View file

@ -42,20 +42,14 @@ class Parser(object):
def parse_contents(self, contents):
for content in contents:
if isinstance(content, Tag):
tag_column = None if content.sourcepos is None \
self.handler.start_element(
content.name, content.attrs, content.text.strip(),
content.sourceline,
None if content.sourcepos is None
else content.sourcepos + 1
if content.is_empty_element:
self.handler.start_end_element(
content.name, content.attrs, content.text.strip(),
content.sourceline, tag_column
)
else:
self.handler.start_element(
content.name, content.attrs, content.text.strip(),
content.sourceline, tag_column
)
if hasattr(content, 'contents'): # recursion
self.parse_contents(content.contents)
)
if hasattr(content, 'contents'): # recursion
self.parse_contents(content.contents)
def parser(handler=None):

View file

@ -109,11 +109,6 @@ class TagFinder (object):
"""Does nothing, override in a subclass."""
pass
def start_end_element (self, tag, attrs, element_text, lineno, column):
"""Delegate a combined start/end element (eg. <br/>) to
the start_element method. Ignore the end element part."""
self.start_element(tag, attrs, element_text, lineno, column)
class MetaRobotsFinder (TagFinder):
"""Class for finding robots.txt meta values in HTML."""

View file

@ -49,42 +49,16 @@ class HtmlPrettyPrinter:
@type attrs: dict
@return: None
"""
self._start_element(tag, attrs, ">", element_text)
self.fd.write("</%s>" % tag)
def start_end_element (self, tag, attrs, element_text, lineno, column):
"""
Print HTML start-end element.
@param tag: tag name
@type tag: string
@param attrs: tag attributes
@type attrs: dict
@return: None
"""
self._start_element(tag, attrs, "/>", element_text)
def _start_element (self, tag, attrs, end, element_text):
"""
Print HTML element with end string.
@param tag: tag name
@type tag: string
@param attrs: tag attributes
@type attrs: dict
@param end: either > or />
@type end: string
@return: None
"""
self.fd.write("<%s" % tag.replace("/", ""))
for key, val in sorted(attrs.items()):
if val is None:
self.fd.write(" %s" % key)
else:
self.fd.write(' %s="%s"' % (key, quote_attrval(val)))
self.fd.write(end)
if element_text:
self.fd.write(element_text)
self.fd.write(">%s</%s>" % (element_text, tag))
else:
self.fd.write("/>")
def quote_attrval (s):

View file

@ -31,42 +31,42 @@ from .htmllib import HtmlPrettyPrinter
# (<test pattern>, <expected parse output>)
parsetests = [
# start tags
("""<a b="c" >""", """<a b="c"></a>"""),
("""<a b='c' >""", """<a b="c"></a>"""),
("""<a b=c" >""", """<a b="c&quot;"></a>"""),
("""<a b=c' >""", """<a b="c'"></a>"""),
("""<a b="" >""", """<a b=""></a>"""),
("""<a b='' >""", """<a b=""></a>"""),
("""<a b=>""", """<a b=""></a>"""),
("""<a b= >""", """<a b=""></a>"""),
("""<a =c>""", """<a =c=""></a>"""),
("""<a =c >""", """<a =c=""></a>"""),
("""<a =>""", """<a ==""></a>"""),
("""<a = >""", """<a ==""></a>"""),
("""<a b= "c" >""", """<a b="c"></a>"""),
("""<a b ="c" >""", """<a b="c"></a>"""),
("""<a b = "c" >""", """<a b="c"></a>"""),
("""<a >""", """<a></a>"""),
("""<a b="c" >""", """<a b="c"/>"""),
("""<a b='c' >""", """<a b="c"/>"""),
("""<a b=c" >""", """<a b="c&quot;"/>"""),
("""<a b=c' >""", """<a b="c'"/>"""),
("""<a b="" >""", """<a b=""/>"""),
("""<a b='' >""", """<a b=""/>"""),
("""<a b=>""", """<a b=""/>"""),
("""<a b= >""", """<a b=""/>"""),
("""<a =c>""", """<a =c=""/>"""),
("""<a =c >""", """<a =c=""/>"""),
("""<a =>""", """<a ==""/>"""),
("""<a = >""", """<a ==""/>"""),
("""<a b= "c" >""", """<a b="c"/>"""),
("""<a b ="c" >""", """<a b="c"/>"""),
("""<a b = "c" >""", """<a b="c"/>"""),
("""<a >""", """<a/>"""),
("""<>""", """"""),
("""< >""", """"""),
("""<aä>""", u"""<aä></>"""),
("""<a aä="b">""", u"""<a aä="b"></a>"""),
("""<a a="">""", u"""<a a="b&#228;"></a>"""),
("""<aä>""", u"""<aä/>"""),
("""<a aä="b">""", u"""<a aä="b"/>"""),
("""<a a="">""", u"""<a a="b&#228;"/>"""),
# multiple attribute names should be ignored...
("""<a b="c" b="c" >""", """<a b="c"></a>"""),
("""<a b="c" b="c" >""", """<a b="c"/>"""),
# ... but which one wins - in our implementation the last one
("""<a b="c" b="d" >""", """<a b="d"></a>"""),
("""<a b="c" b="d" >""", """<a b="d"/>"""),
# reduce test
("""<a b="c"><""", """<a b="c"><</a>"""),
# numbers in tag
("""<h1>bla</h1>""", """<h1>bla</h1>"""),
# more start tags
("""<a b=c"><a b="c">""", """<a b="c&quot;"><a b="c"></a></a>"""),
("""<a b=/c/></a><br>""", """<a b="/c/"></a><br/>"""),
("""<a b=c"><a b="c">""", """<a b="c&quot;"/><a b="c"/>"""),
("""<a b=/c/></a><br>""", """<a b="/c/"/><br/>"""),
("""<br/>""", """<br/>"""),
("""<a b="50%"><br>""", """<a b="50%"><br/></a>"""),
("""<a b="50%"><br>""", """<a b="50%"/><br/>"""),
# start and end tag (HTML doctype assumed)
("""<a/>""", """<a></a>"""),
("""<a/>""", """<a/>"""),
("""<meta/>""", """<meta/>"""),
("""<MetA/>""", """<meta/>"""),
# line continuation (Dr. Fun webpage)
@ -74,30 +74,30 @@ parsetests = [
("""<img align="mid\\\ndle">""", """<img align="mid\\\ndle"/>"""),
("""<img align='mid\\\ndle'>""", """<img align="mid\\\ndle"/>"""),
# href with $
("""<a href="123$456">""", """<a href="123$456"></a>"""),
("""<a href="123$456">""", """<a href="123$456"/>"""),
# quoting
("""<a href=/ >""", """<a href="/"></a>"""),
("""<a href= />""", """<a href="/"></a>"""),
("""<a href= >""", """<a href=""></a>"""),
("""<a href="'" >""", """<a href="'"></a>"""),
("""<a href='"' >""", """<a href="&quot;"></a>"""),
("""<a href="bla" %]" >""", """<a %]"="" href="bla"></a>"""),
("""<a href=bla" >""", """<a href="bla&quot;"></a>"""),
("""<a href=/ >""", """<a href="/"/>"""),
("""<a href= />""", """<a href="/"/>"""),
("""<a href= >""", """<a href=""/>"""),
("""<a href="'" >""", """<a href="'"/>"""),
("""<a href='"' >""", """<a href="&quot;"/>"""),
("""<a href="bla" %]" >""", """<a %]"="" href="bla"/>"""),
("""<a href=bla" >""", """<a href="bla&quot;"/>"""),
("""<a onmouseover=blubb('nav1','',"""\
"""'/images/nav.gif',1);move(this); b="c">""",
"""<a b="c" onmouseover="blubb('nav1','',"""\
"""'/images/nav.gif',1);move(this);"></a>"""),
"""'/images/nav.gif',1);move(this);"/>"""),
("""<a onClick=location.href('/index.htm') b="c">""",
"""<a b="c" onclick="location.href('/index.htm')"></a>"""),
"""<a b="c" onclick="location.href('/index.htm')"/>"""),
# entity resolving
("""<a href="&#6D;ailto:" >""", """<a href="D;ailto:"></a>"""),
("""<a href="&amp;ailto:" >""", """<a href="&amp;ailto:"></a>"""),
("""<a href="&amp;amp;ailto:" >""", """<a href="&amp;amp;ailto:"></a>"""),
("""<a href="&hulla;ailto:" >""", """<a href="&amp;hulla;ailto:"></a>"""),
("""<a href="&#109;ailto:" >""", """<a href="mailto:"></a>"""),
("""<a href="&#x6D;ailto:" >""", """<a href="mailto:"></a>"""),
("""<a href="&#6D;ailto:" >""", """<a href="D;ailto:"/>"""),
("""<a href="&amp;ailto:" >""", """<a href="&amp;ailto:"/>"""),
("""<a href="&amp;amp;ailto:" >""", """<a href="&amp;amp;ailto:"/>"""),
("""<a href="&hulla;ailto:" >""", """<a href="&amp;hulla;ailto:"/>"""),
("""<a href="&#109;ailto:" >""", """<a href="mailto:"/>"""),
("""<a href="&#x6D;ailto:" >""", """<a href="mailto:"/>"""),
# note that \u8156 is not valid encoding and therefore gets removed
("""<a href="&#8156;ailto:" >""", """<a href="&#8156;ailto:"></a>"""),
("""<a href="&#8156;ailto:" >""", """<a href="&#8156;ailto:"/>"""),
# mailto link
("""<a href=mailto:calvin@LocalHost?subject=Hallo&to=michi>1</a>""",
"""<a href="mailto:calvin@LocalHost?subject=Hallo&amp;to=michi">1</a>"""),
@ -116,19 +116,20 @@ parsetests = [
("""</td <td a="b" >""", """"""),
("""</td<td a="b" >""", """"""),
# missing beginning quote
("""<td a=b">""", """<td a="b&quot;"></td>"""),
("""<td a=b">""", """<td a="b&quot;"/>"""),
# stray < before start tag
("""<0.<td a="b" >""", """<td a="b"></td>"""),
("""<0.<td a="b" >""", """<td a="b"/>"""),
# HTML5 tags
("""<audio src=bla>""", """<audio src="bla"></audio>"""),
("""<button formaction=bla>""", """<button formaction="bla"></button>"""),
("""<html manifest=bla>""", """<html manifest="bla"></html>"""),
("""<audio src=bla>""", """<audio src="bla"/>"""),
("""<button formaction=bla>""", """<button formaction="bla"/>"""),
("""<html manifest=bla>""", """<html manifest="bla"/>"""),
("""<source src=bla>""", """<source src="bla"/>"""),
("""<track src=bla>""", """<track src="bla"/>"""),
("""<video src=bla>""", """<video src="bla"></video>"""),
("""<video src=bla>""", """<video src="bla"/>"""),
# Test inserted tags
("""<b><a></a></b>""", """<b><a></a></b>"""),
("""<a></a><b></b>""", """<a></a><b></b>"""),
("""<a></a><b></b>""", """<a/><b/>"""),
# This is not the correct result for an HTML parser, but it is for us
("""<b><a></a></b>""", """<b/><a/>"""),
]