Merge pull request #351 from cjmayo/tagsonly

Remove support for non-Tag elements from Parser
This commit is contained in:
anarcat 2020-04-01 12:17:18 -04:00 committed by GitHub
commit cf4e6bb235
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 24 additions and 205 deletions

View file

@ -26,11 +26,6 @@ in the ListDict (i.e. "<a href>" will lead to a {href: None} dict entry).
Used callbacks of a handler are:
- Comments: <!--data-->
def comment (data)
@param data:
@type data: Unicode string
- Start tag: <tag {attr1:value1, attr2:value2, ..}>
def start_element (tag, attrs)
@param tag: tag name
@ -50,28 +45,6 @@ Used callbacks of a handler are:
@param tag: tag name
@type tag: Unicode string
- Document type: <!DOCTYPE data>
def doctype (data)
@param data: doctype string data
@type data: Unicode string
- Processing instruction (PI): <?name data?>
def pi (name, data=None)
@param name: instruction name
@type name: Unicode string
@param data: instruction data
@type data: Unicode string
- Character data: <![CDATA[data]]>
def cdata (data)
@param data: character data
@type data: Unicode string
- Characters: data
def characters(data): data
@param data: data
@type data: Unicode string
Additionally, there are error and warning callbacks:
- Parser warning.

View file

@ -24,8 +24,7 @@ filterwarnings("ignore",
message="The soupsieve package is not installed. CSS selectors cannot be used.",
category=UserWarning, module="bs4")
from bs4 import (BeautifulSoup, CData, Comment, Doctype, ProcessingInstruction,
Tag)
from bs4 import BeautifulSoup, Tag
from ..containers import ListDict
@ -87,28 +86,6 @@ class Parser(object):
self.parse_contents(content.contents)
if hasattr(self.handler, 'end_element'):
self.handler.end_element(content.name)
if content.comments:
for comment in content.comments:
if hasattr(self.handler, 'comment'):
self.handler.comment(comment)
elif isinstance(content, Doctype):
if hasattr(self.handler, 'doctype'):
self.handler.doctype(
content[len('DOCTYPE '):]
if content.upper().startswith('DOCTYPE ')
else content)
elif isinstance(content, Comment):
if hasattr(self.handler, 'comment'):
self.handler.comment(content.strip())
elif isinstance(content, CData):
if hasattr(self.handler, 'cdata'):
self.handler.cdata(content)
elif isinstance(content, ProcessingInstruction):
if hasattr(self.handler, 'pi'):
self.handler.pi(content.strip("? "))
else:
if hasattr(self.handler, 'characters'):
self.handler.characters(content)
def flush(self):
if self.soup is None:

View file

@ -19,11 +19,9 @@ Default HTML parser handler classes.
"""
import sys
from builtins import bytes, str as str_text
from builtins import chr
class HtmlPrinter (object):
class HtmlPrinter:
"""
Handles all functions by printing the function name and attributes.
"""
@ -46,7 +44,7 @@ class HtmlPrinter (object):
@return: None
"""
self.fd.write(self.mem)
self.fd.write(str_text(attrs))
self.fd.write(str(attrs))
def __getattr__ (self, name):
"""
@ -61,7 +59,7 @@ class HtmlPrinter (object):
return self._print
class HtmlPrettyPrinter (object):
class HtmlPrettyPrinter:
"""
Print out all parsed HTML data in encoded form.
Also stores error and warnings messages.
@ -79,16 +77,6 @@ class HtmlPrettyPrinter (object):
self.fd = fd
self.encoding = encoding
def comment (self, data):
"""
Print HTML comment.
@param data: the comment
@type data: string
@return: None
"""
self.fd.write("<!-- %s -->" % data)
def start_element (self, tag, attrs, element_text=None):
"""
Print HTML start element.
@ -99,7 +87,7 @@ class HtmlPrettyPrinter (object):
@type attrs: dict
@return: None
"""
self._start_element(tag, attrs, u">")
self._start_element(tag, attrs, ">", element_text)
def start_end_element (self, tag, attrs, element_text=None):
"""
@ -111,9 +99,9 @@ class HtmlPrettyPrinter (object):
@type attrs: dict
@return: None
"""
self._start_element(tag, attrs, u"/>")
self._start_element(tag, attrs, "/>", element_text)
def _start_element (self, tag, attrs, end):
def _start_element (self, tag, attrs, end, element_text=None):
"""
Print HTML element with end string.
@ -125,13 +113,15 @@ class HtmlPrettyPrinter (object):
@type end: string
@return: None
"""
self.fd.write(u"<%s" % tag.replace("/", ""))
self.fd.write("<%s" % tag.replace("/", ""))
for key, val in attrs.items():
if val is None:
self.fd.write(u" %s" % key)
self.fd.write(" %s" % key)
else:
self.fd.write(u' %s="%s"' % (key, quote_attrval(val)))
self.fd.write(' %s="%s"' % (key, quote_attrval(val)))
self.fd.write(end)
if element_text:
self.fd.write(element_text)
def end_element (self, tag):
"""
@ -143,46 +133,6 @@ class HtmlPrettyPrinter (object):
"""
self.fd.write("</%s>" % tag)
def doctype (self, data):
"""
Print HTML document type.
@param data: the document type
@type data: string
@return: None
"""
self.fd.write("<!DOCTYPE %s>" % data)
def pi (self, data):
"""
Print HTML pi.
@param data: the tag data
@type data: string
@return: None
"""
self.fd.write("<?%s?>" % data)
def cdata (self, data):
"""
Print HTML cdata.
@param data: the character data
@type data: string
@return: None
"""
self.fd.write("<![CDATA[%s]]>" % data)
def characters (self, data):
"""
Print characters.
@param data: the character data
@type data: string
@return: None
"""
self.fd.write(data)
def quote_attrval (s):
"""
@ -195,18 +145,14 @@ def quote_attrval (s):
"""
res = []
for c in s:
try: # Python 2
ord_c = ord(c)
except TypeError:
ord_c = c
if ord_c <= 127:
if ord(c) <= 127:
# ASCII
if c == u'&':
res.append(u"&amp;")
elif c == u'"':
res.append(u"&quot;")
if c == '&':
res.append("&amp;")
elif c == '"':
res.append("&quot;")
else:
res.append(chr(ord_c))
res.append(c)
else:
res.append(u"&#%d;" % ord_c)
return u"".join(res)
res.append("&#%d;" % ord(c))
return "".join(res)

View file

@ -20,10 +20,7 @@ Test html parsing.
import linkcheck.HtmlParser.htmlsax
try:
from cStringIO import StringIO
except ImportError:
from io import StringIO
from io import StringIO
import unittest
from parameterized import parameterized
@ -38,7 +35,6 @@ parsetests = [
("""<a b='c' >""", """<a b="c"></a>"""),
("""<a b=c" >""", """<a b="c&quot;"></a>"""),
("""<a b=c' >""", """<a b="c'"></a>"""),
("""<a b="c >""", """<a b="c >"""),
("""<a b="" >""", """<a b=""></a>"""),
("""<a b='' >""", """<a b=""></a>"""),
("""<a b=>""", """<a b=""></a>"""),
@ -51,10 +47,8 @@ parsetests = [
("""<a b ="c" >""", """<a b="c"></a>"""),
("""<a b = "c" >""", """<a b="c"></a>"""),
("""<a >""", """<a></a>"""),
("""< a>""", """< a>"""),
("""< a >""", """< a >"""),
("""<>""", """<>"""),
("""< >""", """< >"""),
("""<>""", """"""),
("""< >""", """"""),
("""<aä>""", u"""<aä></aä>"""),
("""<a aä="b">""", u"""<a aä="b"></a>"""),
("""<a a="bä">""", u"""<a a="b&#228;"></a>"""),
@ -64,7 +58,6 @@ parsetests = [
("""<a b="c" b="d" >""", """<a b="d"></a>"""),
# reduce test
("""<a b="c"><""", """<a b="c"><</a>"""),
("""d>""", """d>"""),
# numbers in tag
("""<h1>bla</h1>""", """<h1>bla</h1>"""),
# more start tags
@ -72,49 +65,10 @@ parsetests = [
("""<a b=/c/></a><br>""", """<a b="/c/"></a><br/>"""),
("""<br/>""", """<br/>"""),
("""<a b="50%"><br>""", """<a b="50%"><br/></a>"""),
# comments
("""<!---->< 1>""", """<!-- -->< 1>"""),
("""<!-- a - b -->< 2>""", """<!-- a - b -->< 2>"""),
("""<!----->< 3>""", """<!-- - -->< 3>"""),
("""<!------>< 4>""", """<!-- -- -->< 4>"""),
("""<!------->< 5>""", """<!-- --- -->< 5>"""),
("""<!-- -->< 7>""", """<!-- -->< 7>"""),
("""<!---- />-->""", """<!-- -- /> -->"""),
("""<!-- a-2 -->< 9>""", """<!-- a-2 -->< 9>"""),
("""<!-- --- -->< 10>""", """<!-- --- -->< 10>"""),
("""<!>""", """<!-- -->"""), # empty comment
# invalid comments
("""<!-- -- >< 8>""", """<!-- -->< 8>"""),
("""<!---- >< 6>""", """<!-- -->< 6>"""),
("""<!- blubb ->""", """<!-- - blubb - -->"""),
("""<! -- blubb -->""", """<!-- -- blubb -- -->"""),
("""<!-- blubb -- >""", """<!-- blubb -->"""),
("""<! blubb !>< a>""", """<!-- blubb ! -->< a>"""),
("""<! blubb >< a>""", """<!-- blubb -->< a>"""),
# end tags
("""</a>""", """"""),
("""</ a>""", """"""),
("""</ a >""", """"""),
("""</a >""", """"""),
("""< / a>""", """< / a>"""),
("""< /a>""", """< /a>"""),
("""</aä>""", """"""),
# start and end tag (HTML doctype assumed)
("""<a/>""", """<a></a>"""),
("""<meta/>""", """<meta/>"""),
("""<MetA/>""", """<meta/>"""),
# declaration tags
("""<!DOCtype adrbook SYSTEM "adrbook.dtd">""",
"""<!DOCTYPE adrbook SYSTEM "adrbook.dtd">"""),
# misc
("""<?xmL version="1.0" encoding="latin1"?>""",
"""<?xmL version="1.0" encoding="latin1"?>"""),
# javascript
("""<script >\n</script>""", """<script>\n</script>"""),
("""<sCrIpt lang="a">bla </a> fasel</scripT>""",
"""<script lang="a">bla </a> fasel</script>"""),
("""<script ><!--bla//-->// </script >""",
"""<script><!--bla//-->// </script>"""),
# line continuation (Dr. Fun webpage)
("""<img bo\\\nrder=0 >""", """<img bo\\="" rder="0"/>"""),
("""<img align="mid\\\ndle">""", """<img align="mid\\\ndle"/>"""),
@ -144,15 +98,9 @@ parsetests = [
("""<a href="&#x6D;ailto:" >""", """<a href="mailto:"></a>"""),
# note that \u8156 is not valid encoding and therefore gets removed
("""<a href="&#8156;ailto:" >""", """<a href="&#8156;ailto:"></a>"""),
# non-ascii characters
("""<Üzgür> fahr </langsamer> żżżżżż{""",
u"""<Üzgür> fahr żżżżżż{"""),
# mailto link
("""<a href=mailto:calvin@LocalHost?subject=Hallo&to=michi>1</a>""",
"""<a href="mailto:calvin@LocalHost?subject=Hallo&amp;to=michi">1</a>"""),
# doctype XHTML
("""<!DOCTYPe html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><MeTa a="b"/>""",
"""<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><meta a="b"/>"""),
# meta tag with charset encoding
("""<meta http-equiv="content-type" content>""",
"""<meta content="" http-equiv="content-type"/>"""),
@ -164,22 +112,13 @@ parsetests = [
"""<meta content="text/html; charset=iso8859-1" http-equiv="content-type"/>"""),
("""<meta http-equiv="content-type" content="text/html; charset=hulla">""",
"""<meta content="text/html; charset=hulla" http-equiv="content-type"/>"""),
# CDATA
("""<![CDATA[<a>hallo</a>]]>""", """<![CDATA[<a>hallo</a>]]>"""),
# missing > in end tag
("""</td <td a="b" >""", """"""),
("""</td<td a="b" >""", """"""),
# missing beginning quote
("""<td a=b">""", """<td a="b&quot;"></td>"""),
# stray < before start tag
("""<0.<td a="b" >""", """<0.<td a="b"></td>"""),
# stray < before end tag
("""<0.</td >""", """<0."""),
# missing end quote (XXX TODO)
#("""<td a="b>\n""", """<td a="b">\n"""),
#("""<td a="b></td>\na""", """<td a="b"></td>\na"""),
#("""<a b="c><a b="c>\n""", """<a b="c"><a b="c">\n"""),
#("""<td a="b c="d"></td>\n""", """<td a="b" c="d"></td>\n"""),
("""<0.<td a="b" >""", """<td a="b"></td>"""),
# HTML5 tags
("""<audio src=bla>""", """<audio src="bla"></audio>"""),
("""<button formaction=bla>""", """<button formaction="bla"></button>"""),
@ -192,13 +131,6 @@ parsetests = [
("""<a></a><b></b>""", """<a></a><b></b>"""),
]
flushtests = [
("<", "<"),
("<a", "<a"),
("<!a", "<!a"),
("<?a", "<?a"),
]
class TestParser (unittest.TestCase):
"""
@ -270,15 +202,6 @@ class TestParser (unittest.TestCase):
self.htmlparser2.feed(c)
self.assertEqual(out.getvalue(), out2.getvalue())
@parameterized.expand(flushtests)
def test_flush (self, _in, _out):
# Test parser flushing.
out = StringIO()
handler = HtmlPrettyPrinter(out)
self.htmlparser.handler = handler
self.htmlparser.feed(_in)
self.check_results(self.htmlparser, _in, _out, out)
def test_encoding_detection_utf_content (self):
html = b'<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
self.encoding_test(html, "utf-8")