diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py index dac2e19d..768dc047 100644 --- a/linkcheck/HtmlParser/htmlsax.py +++ b/linkcheck/HtmlParser/htmlsax.py @@ -17,7 +17,6 @@ HTML parser implemented using Beautiful Soup and html.parser. """ -from io import BytesIO, StringIO from warnings import filterwarnings filterwarnings("ignore", @@ -39,20 +38,11 @@ class Parser(object): self.handler = handler self.reset() - def feed(self, feed_text): - if not self.html_doc: - if isinstance(feed_text, bytes): - self.html_doc = BytesIO() - else: - self.html_doc = StringIO() - self.html_doc.write(feed_text) - def feed_soup(self, soup): self.soup = soup def reset(self): self.soup = None - self.html_doc = None def parse_contents(self, contents): for content in contents: @@ -75,9 +65,6 @@ class Parser(object): self.handler.end_element(content.name) def flush(self): - if self.soup is None: - self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser', - multi_valued_attributes=None) if hasattr(self.soup, 'contents'): self.parse_contents(self.soup.contents) self.encoding = self.soup.original_encoding diff --git a/linkcheck/htmlutil/formsearch.py b/linkcheck/htmlutil/formsearch.py index e530791c..d54313f2 100644 --- a/linkcheck/htmlutil/formsearch.py +++ b/linkcheck/htmlutil/formsearch.py @@ -85,7 +85,7 @@ def search_form(content, cgiuser, cgipassword): handler = FormFinder() parser = htmlsax.parser(handler) # parse - parser.feed(content) + parser.feed_soup(htmlsax.make_soup(content)) parser.flush() log.debug(LOG_CHECK, "Found forms %s", handler.forms) cginames = (cgiuser.lower(), cgipassword.lower()) diff --git a/tests/htmllib.py b/tests/htmllib.py index a42c4e6e..ab16ac46 100644 --- a/tests/htmllib.py +++ b/tests/htmllib.py @@ -15,50 +15,12 @@ # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. """ -Default HTML parser handler classes. 
+HTML parser handler test class. """ import sys -class HtmlPrinter: - """ - Handles all functions by printing the function name and attributes. - """ - - def __init__ (self, fd=sys.stdout): - """ - Write to given file descriptor. - - @param fd: file like object (default=sys.stdout) - @type fd: file - """ - self.fd = fd - - def _print (self, *attrs): - """ - Print function attributes to stored file descriptor. - - @param attrs: list of values to print - @type attrs: tuple - @return: None - """ - self.fd.write(self.mem) - self.fd.write(str(attrs)) - - def __getattr__ (self, name): - """ - Remember the called method name in self.mem. - - @param name: attribute name - @type name: string - @return: method which just prints out its arguments - @rtype: a bound function object - """ - self.mem = name - return self._print - - class HtmlPrettyPrinter: """ Print out all parsed HTML data in encoded form. diff --git a/tests/test_linkparser.py b/tests/test_linkparser.py index a6d1f9b4..9dcbb0c2 100644 --- a/tests/test_linkparser.py +++ b/tests/test_linkparser.py @@ -20,7 +20,7 @@ Test linkparser routines. 
import unittest from linkcheck.htmlutil import linkparse -import linkcheck.HtmlParser.htmlsax +from linkcheck.HtmlParser import htmlsax class TestLinkparser (unittest.TestCase): @@ -31,9 +31,9 @@ class TestLinkparser (unittest.TestCase): def _test_one_link (self, content, url): self.count_url = 0 h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags) - p = linkcheck.HtmlParser.htmlsax.parser(h) + p = htmlsax.parser(h) try: - p.feed(content) + p.feed_soup(htmlsax.make_soup(content)) p.flush() except linkparse.StopParse: pass @@ -50,9 +50,9 @@ class TestLinkparser (unittest.TestCase): def callback (url, line, column, name, base): self.assertTrue(False, 'URL %r found' % url) h = linkparse.LinkFinder(callback, linkparse.LinkTags) - p = linkcheck.HtmlParser.htmlsax.parser(h) + p = htmlsax.parser(h) try: - p.feed(content) + p.feed_soup(htmlsax.make_soup(content)) p.flush() except linkparse.StopParse: pass diff --git a/tests/test_parser.py b/tests/test_parser.py index fc831361..109f21fa 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -18,14 +18,14 @@ Test html parsing. """ -import linkcheck.HtmlParser.htmlsax +from linkcheck.HtmlParser import htmlsax from io import StringIO import unittest from parameterized import parameterized -from .htmllib import HtmlPrinter, HtmlPrettyPrinter +from .htmllib import HtmlPrettyPrinter # list of tuples # (, ) @@ -137,21 +137,14 @@ class TestParser (unittest.TestCase): Test html parser. """ - def setUp (self): - """ - Initialize two internal html parsers to be used for testing. - """ - self.htmlparser = linkcheck.HtmlParser.htmlsax.parser() - self.htmlparser2 = linkcheck.HtmlParser.htmlsax.parser() - @parameterized.expand(parsetests) def test_parse (self, _in, _out): # Parse all test patterns in one go. 
out = StringIO() handler = HtmlPrettyPrinter(out) - self.htmlparser.handler = handler - self.htmlparser.feed(_in) - self.check_results(self.htmlparser, _in, _out, out) + parser = htmlsax.parser(handler) + parser.feed_soup(htmlsax.make_soup(_in)) + self.check_results(parser, _in, _out, out) def check_results (self, htmlparser, _in, _out, out): """ @@ -164,44 +157,6 @@ class TestParser (unittest.TestCase): self.assertEqual(res, _out, msg=msg) htmlparser.reset() - @parameterized.expand(parsetests) - def test_feed (self, _in, _out): - # Parse all test patterns sequentially. - out = StringIO() - handler = HtmlPrettyPrinter(out) - self.htmlparser.handler = handler - for c in _in: - self.htmlparser.feed(c) - self.check_results(self.htmlparser, _in, _out, out) - - @parameterized.expand(parsetests) - def test_interwoven (self, _in, _out): - # Parse all test patterns on two parsers interwoven. - out = StringIO() - out2 = StringIO() - handler = HtmlPrettyPrinter(out) - self.htmlparser.handler = handler - handler2 = HtmlPrettyPrinter(out2) - self.htmlparser2.handler = handler2 - for c in _in: - self.htmlparser.feed(c) - self.htmlparser2.feed(c) - self.check_results(self.htmlparser, _in, _out, out) - self.check_results(self.htmlparser2, _in, _out, out2) - - @parameterized.expand(parsetests) - def test_handler (self, _in, _out): - out = StringIO() - out2 = StringIO() - handler = HtmlPrinter(out) - self.htmlparser.handler = handler - handler2 = HtmlPrinter(out2) - self.htmlparser2.handler = handler2 - for c in _in: - self.htmlparser.feed(c) - self.htmlparser2.feed(c) - self.assertEqual(out.getvalue(), out2.getvalue()) - def test_encoding_detection_utf_content (self): html = b'' self.encoding_test(html, "utf-8") @@ -227,11 +182,11 @@ class TestParser (unittest.TestCase): self.encoding_test(html, "ascii") def encoding_test (self, html, expected): - parser = linkcheck.HtmlParser.htmlsax.parser() + parser = htmlsax.parser() self.assertEqual(parser.encoding, None) out = StringIO() handler = 
HtmlPrettyPrinter(out) - parser.handler = handler - parser.feed(html) + parser = htmlsax.parser(handler) + parser.feed_soup(htmlsax.make_soup(html)) parser.flush() self.assertEqual(parser.encoding, expected)