Use parser.feed_soup() instead of parser.feed()

Markup is not being passed in pieces to the parser, so simplify the
interface and reduce the state further.
This commit is contained in:
Chris Mayo 2020-04-08 20:03:35 +01:00
parent 40f43ae41c
commit 3771dd9136
5 changed files with 15 additions and 111 deletions

View file

@ -17,7 +17,6 @@
HTML parser implemented using Beautiful Soup and html.parser.
"""
from io import BytesIO, StringIO
from warnings import filterwarnings
filterwarnings("ignore",
@ -39,20 +38,11 @@ class Parser(object):
self.handler = handler
self.reset()
def feed(self, feed_text):
    """Append a piece of markup to the internal buffer.

    The buffer is created on the first call; its kind (bytes or text)
    follows the type of that first piece.
    """
    if self.html_doc is None:
        # Lazily pick the buffer type that matches the incoming data.
        self.html_doc = BytesIO() if isinstance(feed_text, bytes) else StringIO()
    self.html_doc.write(feed_text)
def feed_soup(self, soup):
    """Store an already-built soup object on the parser as ``self.soup``."""
    self.soup = soup
def reset(self):
    """Clear the stored soup and the buffered markup."""
    self.soup = self.html_doc = None
def parse_contents(self, contents):
for content in contents:
@ -75,9 +65,6 @@ class Parser(object):
self.handler.end_element(content.name)
def flush(self):
if self.soup is None:
self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser',
multi_valued_attributes=None)
if hasattr(self.soup, 'contents'):
self.parse_contents(self.soup.contents)
self.encoding = self.soup.original_encoding

View file

@ -85,7 +85,7 @@ def search_form(content, cgiuser, cgipassword):
handler = FormFinder()
parser = htmlsax.parser(handler)
# parse
parser.feed(content)
parser.feed_soup(htmlsax.make_soup(content))
parser.flush()
log.debug(LOG_CHECK, "Found forms %s", handler.forms)
cginames = (cgiuser.lower(), cgipassword.lower())

View file

@ -15,50 +15,12 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Default HTML parser handler classes.
HTML parser handler test class.
"""
import sys
class HtmlPrinter:
    """
    Handles all functions by printing the function name and attributes.

    Any attribute that is not otherwise defined resolves to a callable
    which writes the attribute name, followed by ``str()`` of the tuple
    of positional arguments, to the stored file object.
    """

    def __init__(self, fd=sys.stdout):
        """
        Write to given file descriptor.

        @param fd: file like object (default=sys.stdout)
        @type fd: file
        """
        self.fd = fd

    def _print(self, *attrs):
        """
        Print function attributes to stored file descriptor.

        The name written is the one most recently remembered in
        self.mem by __getattr__.

        @param attrs: list of values to print
        @type attrs: tuple
        @return: None
        """
        self.fd.write(self.mem)
        self.fd.write(str(attrs))

    def __getattr__(self, name):
        """
        Return a callable that prints the requested name and its arguments.

        The name is bound to the returned callable itself, so a callable
        obtained earlier keeps printing its own name even after other
        attributes have been looked up. The name is also remembered in
        self.mem for code that still goes through _print.

        @param name: attribute name
        @type name: string
        @return: function which just prints out its arguments
        @rtype: a function object
        @raise AttributeError: for special (dunder) names, so protocol
            probes such as copying are not answered with a printer
        """
        if name.startswith("__") and name.endswith("__"):
            raise AttributeError(name)
        self.mem = name

        def _print_named(*attrs):
            # Use the captured name rather than self.mem, which may have
            # been overwritten by a later attribute lookup.
            self.fd.write(name)
            self.fd.write(str(attrs))

        return _print_named
class HtmlPrettyPrinter:
"""
Print out all parsed HTML data in encoded form.

View file

@ -20,7 +20,7 @@ Test linkparser routines.
import unittest
from linkcheck.htmlutil import linkparse
import linkcheck.HtmlParser.htmlsax
from linkcheck.HtmlParser import htmlsax
class TestLinkparser (unittest.TestCase):
@ -31,9 +31,9 @@ class TestLinkparser (unittest.TestCase):
def _test_one_link (self, content, url):
self.count_url = 0
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
p = linkcheck.HtmlParser.htmlsax.parser(h)
p = htmlsax.parser(h)
try:
p.feed(content)
p.feed_soup(htmlsax.make_soup(content))
p.flush()
except linkparse.StopParse:
pass
@ -50,9 +50,9 @@ class TestLinkparser (unittest.TestCase):
def callback (url, line, column, name, base):
self.assertTrue(False, 'URL %r found' % url)
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
p = linkcheck.HtmlParser.htmlsax.parser(h)
p = htmlsax.parser(h)
try:
p.feed(content)
p.feed_soup(htmlsax.make_soup(content))
p.flush()
except linkparse.StopParse:
pass

View file

@ -18,14 +18,14 @@
Test html parsing.
"""
import linkcheck.HtmlParser.htmlsax
from linkcheck.HtmlParser import htmlsax
from io import StringIO
import unittest
from parameterized import parameterized
from .htmllib import HtmlPrinter, HtmlPrettyPrinter
from .htmllib import HtmlPrettyPrinter
# list of tuples
# (<test pattern>, <expected parse output>)
@ -137,21 +137,14 @@ class TestParser (unittest.TestCase):
Test html parser.
"""
def setUp(self):
    """
    Create the two html parser instances used by the tests.
    """
    make_parser = linkcheck.HtmlParser.htmlsax.parser
    self.htmlparser = make_parser()
    self.htmlparser2 = make_parser()
@parameterized.expand(parsetests)
def test_parse (self, _in, _out):
# Parse all test patterns in one go.
out = StringIO()
handler = HtmlPrettyPrinter(out)
self.htmlparser.handler = handler
self.htmlparser.feed(_in)
self.check_results(self.htmlparser, _in, _out, out)
parser = htmlsax.parser(handler)
parser.feed_soup(htmlsax.make_soup(_in))
self.check_results(_in, _out, out)
def check_results (self, htmlparser, _in, _out, out):
"""
@ -164,44 +157,6 @@ class TestParser (unittest.TestCase):
self.assertEqual(res, _out, msg=msg)
htmlparser.reset()
@parameterized.expand(parsetests)
def test_feed(self, _in, _out):
    """Feed a test pattern to the parser item by item, then check the result."""
    sink = StringIO()
    parser = self.htmlparser
    parser.handler = HtmlPrettyPrinter(sink)
    # Iterating the pattern hands it to the parser one item at a time.
    for piece in _in:
        parser.feed(piece)
    self.check_results(parser, _in, _out, sink)
@parameterized.expand(parsetests)
def test_interwoven(self, _in, _out):
    """Feed one pattern to two parsers in alternation and check both results."""
    sinks = (StringIO(), StringIO())
    parsers = (self.htmlparser, self.htmlparser2)
    for parser, sink in zip(parsers, sinks):
        parser.handler = HtmlPrettyPrinter(sink)
    # Alternate between the parsers for every item of the pattern.
    for piece in _in:
        for parser in parsers:
            parser.feed(piece)
    for parser, sink in zip(parsers, sinks):
        self.check_results(parser, _in, _out, sink)
@parameterized.expand(parsetests)
def test_handler(self, _in, _out):
    """Two parsers using HtmlPrinter handlers must produce identical output."""
    first_sink, second_sink = StringIO(), StringIO()
    self.htmlparser.handler = HtmlPrinter(first_sink)
    self.htmlparser2.handler = HtmlPrinter(second_sink)
    for piece in _in:
        self.htmlparser.feed(piece)
        self.htmlparser2.feed(piece)
    self.assertEqual(first_sink.getvalue(), second_sink.getvalue())
def test_encoding_detection_utf_content(self):
    """A content-type meta tag declaring charset=UTF-8 is expected to yield utf-8."""
    self.encoding_test(
        b'<meta http-equiv="content-type" content="text/html; charset=UTF-8">',
        "utf-8",
    )
@ -227,11 +182,11 @@ class TestParser (unittest.TestCase):
self.encoding_test(html, "ascii")
def encoding_test (self, html, expected):
parser = linkcheck.HtmlParser.htmlsax.parser()
parser = htmlsax.parser()
self.assertEqual(parser.encoding, None)
out = StringIO()
handler = HtmlPrettyPrinter(out)
parser.handler = handler
parser.feed(html)
parser = htmlsax.parser(handler)
parser.feed_soup(htmlsax.make_soup(html))
parser.flush()
self.assertEqual(parser.encoding, expected)