diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py
index dac2e19d..768dc047 100644
--- a/linkcheck/HtmlParser/htmlsax.py
+++ b/linkcheck/HtmlParser/htmlsax.py
@@ -17,7 +17,6 @@
HTML parser implemented using Beautiful Soup and html.parser.
"""
-from io import BytesIO, StringIO
from warnings import filterwarnings
filterwarnings("ignore",
@@ -39,20 +38,11 @@ class Parser(object):
self.handler = handler
self.reset()
- def feed(self, feed_text):
- if not self.html_doc:
- if isinstance(feed_text, bytes):
- self.html_doc = BytesIO()
- else:
- self.html_doc = StringIO()
- self.html_doc.write(feed_text)
-
def feed_soup(self, soup):
self.soup = soup
def reset(self):
self.soup = None
- self.html_doc = None
def parse_contents(self, contents):
for content in contents:
@@ -75,9 +65,6 @@ class Parser(object):
self.handler.end_element(content.name)
def flush(self):
- if self.soup is None:
- self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser',
- multi_valued_attributes=None)
if hasattr(self.soup, 'contents'):
self.parse_contents(self.soup.contents)
self.encoding = self.soup.original_encoding
diff --git a/linkcheck/htmlutil/formsearch.py b/linkcheck/htmlutil/formsearch.py
index e530791c..d54313f2 100644
--- a/linkcheck/htmlutil/formsearch.py
+++ b/linkcheck/htmlutil/formsearch.py
@@ -85,7 +85,7 @@ def search_form(content, cgiuser, cgipassword):
handler = FormFinder()
parser = htmlsax.parser(handler)
# parse
- parser.feed(content)
+ parser.feed_soup(htmlsax.make_soup(content))
parser.flush()
log.debug(LOG_CHECK, "Found forms %s", handler.forms)
cginames = (cgiuser.lower(), cgipassword.lower())
diff --git a/tests/htmllib.py b/tests/htmllib.py
index a42c4e6e..ab16ac46 100644
--- a/tests/htmllib.py
+++ b/tests/htmllib.py
@@ -15,50 +15,12 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
-Default HTML parser handler classes.
+HTML parser handler test class.
"""
import sys
-class HtmlPrinter:
- """
- Handles all functions by printing the function name and attributes.
- """
-
- def __init__ (self, fd=sys.stdout):
- """
- Write to given file descriptor.
-
- @param fd: file like object (default=sys.stdout)
- @type fd: file
- """
- self.fd = fd
-
- def _print (self, *attrs):
- """
- Print function attributes to stored file descriptor.
-
- @param attrs: list of values to print
- @type attrs: tuple
- @return: None
- """
- self.fd.write(self.mem)
- self.fd.write(str(attrs))
-
- def __getattr__ (self, name):
- """
- Remember the called method name in self.mem.
-
- @param name: attribute name
- @type name: string
- @return: method which just prints out its arguments
- @rtype: a bound function object
- """
- self.mem = name
- return self._print
-
-
class HtmlPrettyPrinter:
"""
Print out all parsed HTML data in encoded form.
diff --git a/tests/test_linkparser.py b/tests/test_linkparser.py
index a6d1f9b4..9dcbb0c2 100644
--- a/tests/test_linkparser.py
+++ b/tests/test_linkparser.py
@@ -20,7 +20,7 @@ Test linkparser routines.
import unittest
from linkcheck.htmlutil import linkparse
-import linkcheck.HtmlParser.htmlsax
+from linkcheck.HtmlParser import htmlsax
class TestLinkparser (unittest.TestCase):
@@ -31,9 +31,9 @@ class TestLinkparser (unittest.TestCase):
def _test_one_link (self, content, url):
self.count_url = 0
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
- p = linkcheck.HtmlParser.htmlsax.parser(h)
+ p = htmlsax.parser(h)
try:
- p.feed(content)
+ p.feed_soup(htmlsax.make_soup(content))
p.flush()
except linkparse.StopParse:
pass
@@ -50,9 +50,9 @@ class TestLinkparser (unittest.TestCase):
def callback (url, line, column, name, base):
self.assertTrue(False, 'URL %r found' % url)
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
- p = linkcheck.HtmlParser.htmlsax.parser(h)
+ p = htmlsax.parser(h)
try:
- p.feed(content)
+ p.feed_soup(htmlsax.make_soup(content))
p.flush()
except linkparse.StopParse:
pass
diff --git a/tests/test_parser.py b/tests/test_parser.py
index fc831361..109f21fa 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -18,14 +18,14 @@
Test html parsing.
"""
-import linkcheck.HtmlParser.htmlsax
+from linkcheck.HtmlParser import htmlsax
from io import StringIO
import unittest
from parameterized import parameterized
-from .htmllib import HtmlPrinter, HtmlPrettyPrinter
+from .htmllib import HtmlPrettyPrinter
# list of tuples
# (<test pattern>, <expected parse output>)
@@ -137,21 +137,14 @@ class TestParser (unittest.TestCase):
Test html parser.
"""
- def setUp (self):
- """
- Initialize two internal html parsers to be used for testing.
- """
- self.htmlparser = linkcheck.HtmlParser.htmlsax.parser()
- self.htmlparser2 = linkcheck.HtmlParser.htmlsax.parser()
-
@parameterized.expand(parsetests)
def test_parse (self, _in, _out):
# Parse all test patterns in one go.
out = StringIO()
handler = HtmlPrettyPrinter(out)
- self.htmlparser.handler = handler
- self.htmlparser.feed(_in)
- self.check_results(self.htmlparser, _in, _out, out)
+ parser = htmlsax.parser(handler)
+ parser.feed_soup(htmlsax.make_soup(_in))
+ self.check_results(parser, _in, _out, out)
def check_results (self, htmlparser, _in, _out, out):
"""
@@ -164,44 +157,6 @@ class TestParser (unittest.TestCase):
self.assertEqual(res, _out, msg=msg)
htmlparser.reset()
- @parameterized.expand(parsetests)
- def test_feed (self, _in, _out):
- # Parse all test patterns sequentially.
- out = StringIO()
- handler = HtmlPrettyPrinter(out)
- self.htmlparser.handler = handler
- for c in _in:
- self.htmlparser.feed(c)
- self.check_results(self.htmlparser, _in, _out, out)
-
- @parameterized.expand(parsetests)
- def test_interwoven (self, _in, _out):
- # Parse all test patterns on two parsers interwoven.
- out = StringIO()
- out2 = StringIO()
- handler = HtmlPrettyPrinter(out)
- self.htmlparser.handler = handler
- handler2 = HtmlPrettyPrinter(out2)
- self.htmlparser2.handler = handler2
- for c in _in:
- self.htmlparser.feed(c)
- self.htmlparser2.feed(c)
- self.check_results(self.htmlparser, _in, _out, out)
- self.check_results(self.htmlparser2, _in, _out, out2)
-
- @parameterized.expand(parsetests)
- def test_handler (self, _in, _out):
- out = StringIO()
- out2 = StringIO()
- handler = HtmlPrinter(out)
- self.htmlparser.handler = handler
- handler2 = HtmlPrinter(out2)
- self.htmlparser2.handler = handler2
- for c in _in:
- self.htmlparser.feed(c)
- self.htmlparser2.feed(c)
- self.assertEqual(out.getvalue(), out2.getvalue())
-
def test_encoding_detection_utf_content (self):
html = b''
self.encoding_test(html, "utf-8")
@@ -227,11 +182,11 @@ class TestParser (unittest.TestCase):
self.encoding_test(html, "ascii")
def encoding_test (self, html, expected):
- parser = linkcheck.HtmlParser.htmlsax.parser()
+ parser = htmlsax.parser()
self.assertEqual(parser.encoding, None)
out = StringIO()
handler = HtmlPrettyPrinter(out)
- parser.handler = handler
- parser.feed(html)
+ parser = htmlsax.parser(handler)
+ parser.feed_soup(htmlsax.make_soup(html))
parser.flush()
self.assertEqual(parser.encoding, expected)