From 9d8d251d06f6251016fc743e8a9a960416c52d76 Mon Sep 17 00:00:00 2001
From: Chris Mayo <aklhfex@gmail.com>
Date: Wed, 8 Apr 2020 20:03:35 +0100
Subject: [PATCH 1/5] Replace Parser lineno() and column() methods

Pass the tag's line number and column to the handler methods as
arguments instead of storing them in Parser object state.
---
 linkcheck/HtmlParser/htmlsax.py  | 13 +++---------
 linkcheck/checker/httpurl.py     |  4 ----
 linkcheck/htmlutil/formsearch.py | 13 +++---------
 linkcheck/htmlutil/linkparse.py  | 36 ++++++++++++++------------------
 linkcheck/parser/__init__.py     |  4 ----
 tests/htmllib.py                 |  6 +++---
 tests/test_linkparser.py         |  6 ------
 7 files changed, 25 insertions(+), 57 deletions(-)

diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py
index 7975b6e7..5a9f7282 100644
--- a/linkcheck/HtmlParser/htmlsax.py
+++ b/linkcheck/HtmlParser/htmlsax.py
@@ -49,22 +49,21 @@ class Parser(object):
     def reset(self):
         self.soup = None
         self.html_doc = None
-        self.tag_lineno = None
-        self.tag_column = None
 
     def parse_contents(self, contents):
         for content in contents:
             if isinstance(content, Tag):
-                self.tag_lineno = content.sourceline
-                self.tag_column = None if content.sourcepos is None \
+                tag_column = None if content.sourcepos is None \
                     else content.sourcepos + 1
                 if content.is_empty_element:
                     self.handler.start_end_element(
                         content.name, content.attrs, content.text.strip(),
+                        content.sourceline, tag_column
                     )
                 else:
                     self.handler.start_element(
                         content.name, content.attrs, content.text.strip(),
+                        content.sourceline, tag_column
                     )
                     if hasattr(content, 'contents'):  # recursion
                         self.parse_contents(content.contents)
@@ -79,12 +78,6 @@ class Parser(object):
             self.parse_contents(self.soup.contents)
         self.encoding = self.soup.original_encoding
 
-    def lineno(self):
-        return self.tag_lineno
-
-    def column(self):
-        return self.tag_column
-
 
 def parser(handler=None):
     return Parser(handler)
diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py
index 9e6459ef..d46e814b 100644
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -83,7 +83,6 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         # construct parser object
         handler = linkparse.MetaRobotsFinder()
         parser = htmlsax.parser(handler)
-        handler.parser = parser
         # parse
         try:
             parser.feed_soup(self.get_soup())
@@ -91,9 +90,6 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         except linkparse.StopParse as msg:
             log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
             pass
-        # break cyclic dependencies
-        handler.parser = None
-        parser.handler = None
         return handler.follow
 
     def add_size_info (self):
diff --git a/linkcheck/htmlutil/formsearch.py b/linkcheck/htmlutil/formsearch.py
index 9419a6c4..e530791c 100644
--- a/linkcheck/htmlutil/formsearch.py
+++ b/linkcheck/htmlutil/formsearch.py
@@ -44,13 +44,10 @@ class FormFinder(object):
     def __init__(self):
         """Initialize local variables."""
         super(FormFinder, self).__init__()
-        # parser object will be initialized when it is used as
-        # a handler object
-        self.parser = None
         self.forms = []
         self.form = None
 
-    def start_element(self, tag, attrs, element_text=None):
+    def start_element(self, tag, attrs, element_text, lineno, column):
         """Does nothing, override in a subclass."""
         if tag == u'form':
             if u'action' in attrs:
@@ -69,10 +66,10 @@ class FormFinder(object):
                 log.warn(LOG_CHECK, "formless input %s" % attrs)
                 pass
 
-    def start_end_element(self, tag, attrs, element_text=None):
+    def start_end_element(self, tag, attrs, element_text, lineno, column):
         """Delegate a combined start/end element (eg. <input .../>) to
         the start_element method. Ignore the end element part."""
-        self.start_element(tag, attrs, element_text)
+        self.start_element(tag, attrs, element_text, lineno, column)
 
     def end_element(self, tag):
         """search for ending form values."""
@@ -87,13 +84,9 @@ def search_form(content, cgiuser, cgipassword):
     """
     handler = FormFinder()
     parser = htmlsax.parser(handler)
-    handler.parser = parser
     # parse
     parser.feed(content)
     parser.flush()
-    # break cyclic dependencies
-    handler.parser = None
-    parser.handler = None
     log.debug(LOG_CHECK, "Found forms %s", handler.forms)
     cginames = (cgiuser.lower(), cgipassword.lower())
     for form in handler.forms:
diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py
index e5295817..b2ed61e6 100644
--- a/linkcheck/htmlutil/linkparse.py
+++ b/linkcheck/htmlutil/linkparse.py
@@ -104,18 +104,15 @@ class TagFinder (object):
     def __init__ (self):
         """Initialize local variables."""
         super(TagFinder, self).__init__()
-        # parser object will be initialized when it is used as
-        # a handler object
-        self.parser = None
 
-    def start_element (self, tag, attrs):
+    def start_element (self, tag, attrs, element_text, lineno, column):
         """Does nothing, override in a subclass."""
         pass
 
-    def start_end_element (self, tag, attrs, element_text=None):
+    def start_end_element (self, tag, attrs, element_text, lineno, column):
         """Delegate a combined start/end element (eg. <br/>) to
         the start_element method. Ignore the end element part."""
-        self.start_element(tag, attrs, element_text)
+        self.start_element(tag, attrs, element_text, lineno, column)
 
 
 class MetaRobotsFinder (TagFinder):
@@ -127,7 +124,7 @@ class MetaRobotsFinder (TagFinder):
         log.debug(LOG_CHECK, "meta robots finder")
         self.follow = self.index = True
 
-    def start_element (self, tag, attrs, element_text=None):
+    def start_element (self, tag, attrs, element_text, lineno, column):
         """Search for meta robots.txt "nofollow" and "noindex" flags."""
         if tag == 'meta' and attrs.get('name') == 'robots':
             val = attrs.get('content', u'').lower().split(u',')
@@ -177,10 +174,10 @@ class LinkFinder (TagFinder):
             self.tags[tag].update(self.universal_attrs)
         self.base_ref = u''
 
-    def start_element (self, tag, attrs, element_text=None):
+    def start_element (self, tag, attrs, element_text, lineno, column):
         """Search for links and store found URLs in a list."""
         log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
-        log.debug(LOG_CHECK, "line %d col %d", self.parser.lineno(), self.parser.column())
+        log.debug(LOG_CHECK, "line %d col %d", lineno, column)
         if tag == "base" and not self.base_ref:
             self.base_ref = attrs.get("href", u'')
         tagattrs = self.tags.get(tag, self.universal_attrs)
@@ -205,7 +202,7 @@ class LinkFinder (TagFinder):
                     value = value.split(':', 1)[1]
                 value = 'dns:' + value.rstrip('/')
             # parse tag for URLs
-            self.parse_tag(tag, attr, value, name, base)
+            self.parse_tag(tag, attr, value, name, base, lineno, column)
         log.debug(LOG_CHECK, "LinkFinder finished tag %s", tag)
 
     def get_link_name (self, tag, attrs, attr, name=None):
@@ -221,7 +218,7 @@ class LinkFinder (TagFinder):
             name = u""
         return name
 
-    def parse_tag (self, tag, attr, value, name, base):
+    def parse_tag (self, tag, attr, value, name, base, lineno, column):
         """Add given url data to url list."""
         assert isinstance(tag, str_text), repr(tag)
         assert isinstance(attr, str_text), repr(attr)
@@ -232,25 +229,24 @@ class LinkFinder (TagFinder):
         if tag == u'meta' and value:
             mo = refresh_re.match(value)
             if mo:
-                self.found_url(mo.group("url"), name, base)
+                self.found_url(mo.group("url"), name, base, lineno, column)
             elif attr != 'content':
-                self.found_url(value, name, base)
+                self.found_url(value, name, base, lineno, column)
         elif attr == u'style' and value:
             for mo in css_url_re.finditer(value):
                 url = unquote(mo.group("url"), matching=True)
-                self.found_url(url, name, base)
+                self.found_url(url, name, base, lineno, column)
         elif attr == u'archive':
             for url in value.split(u','):
-                self.found_url(url, name, base)
+                self.found_url(url, name, base, lineno, column)
         elif attr == u'srcset':
             for img_candidate in value.split(u','):
                 url = img_candidate.split()[0]
-                self.found_url(url, name, base)
+                self.found_url(url, name, base, lineno, column)
         else:
-            self.found_url(value, name, base)
+            self.found_url(value, name, base, lineno, column)
 
-    def found_url(self, url, name, base):
+    def found_url(self, url, name, base, lineno, column):
         """Add newly found URL to queue."""
         assert isinstance(url, str_text) or url is None, repr(url)
-        self.callback(url, line=self.parser.lineno(),
-                      column=self.parser.column(), name=name, base=base)
+        self.callback(url, line=lineno, column=column, name=name, base=base)
diff --git a/linkcheck/parser/__init__.py b/linkcheck/parser/__init__.py
index dc3494fb..c9bf471f 100644
--- a/linkcheck/parser/__init__.py
+++ b/linkcheck/parser/__init__.py
@@ -120,7 +120,6 @@ def find_links (url_data, callback, tags):
     # construct parser object
     handler = linkparse.LinkFinder(callback, tags)
     parser = htmlsax.parser(handler)
-    handler.parser = parser
     # parse
     try:
         soup = url_data.get_soup()
@@ -129,9 +128,6 @@ def find_links (url_data, callback, tags):
     except linkparse.StopParse as msg:
         log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
         pass
-    # break cyclic dependencies
-    handler.parser = None
-    parser.handler = None
 
 
 def parse_firefox (url_data):
diff --git a/tests/htmllib.py b/tests/htmllib.py
index 6f1c5b19..a42c4e6e 100644
--- a/tests/htmllib.py
+++ b/tests/htmllib.py
@@ -77,7 +77,7 @@ class HtmlPrettyPrinter:
         self.fd = fd
         self.encoding = encoding
 
-    def start_element (self, tag, attrs, element_text=None):
+    def start_element (self, tag, attrs, element_text, lineno, column):
         """
         Print HTML start element.
 
@@ -89,7 +89,7 @@ class HtmlPrettyPrinter:
         """
         self._start_element(tag, attrs, ">", element_text)
 
-    def start_end_element (self, tag, attrs, element_text=None):
+    def start_end_element (self, tag, attrs, element_text, lineno, column):
         """
         Print HTML start-end element.
 
@@ -101,7 +101,7 @@ class HtmlPrettyPrinter:
         """
         self._start_element(tag, attrs, "/>", element_text)
 
-    def _start_element (self, tag, attrs, end, element_text=None):
+    def _start_element (self, tag, attrs, end, element_text):
         """
         Print HTML element with end string.
 
diff --git a/tests/test_linkparser.py b/tests/test_linkparser.py
index 0b965af2..a6d1f9b4 100644
--- a/tests/test_linkparser.py
+++ b/tests/test_linkparser.py
@@ -32,14 +32,11 @@ class TestLinkparser (unittest.TestCase):
         self.count_url = 0
         h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
         p = linkcheck.HtmlParser.htmlsax.parser(h)
-        h.parser = p
         try:
             p.feed(content)
             p.flush()
         except linkparse.StopParse:
             pass
-        h.parser = None
-        p.handler = None
         self.assertEqual(self.count_url, 1)
 
     def _test_one_url (self, origurl):
@@ -54,14 +51,11 @@ class TestLinkparser (unittest.TestCase):
             self.assertTrue(False, 'URL %r found' % url)
         h = linkparse.LinkFinder(callback, linkparse.LinkTags)
         p = linkcheck.HtmlParser.htmlsax.parser(h)
-        h.parser = p
         try:
             p.feed(content)
             p.flush()
         except linkparse.StopParse:
             pass
-        h.parser = None
-        p.handler = None
 
     def test_href_parsing (self):
         # Test <a href> parsing.

From 40f43ae41cfd03bbb8df566e4c275bee16b209b7 Mon Sep 17 00:00:00 2001
From: Chris Mayo <aklhfex@gmail.com>
Date: Wed, 8 Apr 2020 20:03:35 +0100
Subject: [PATCH 2/5] Create one function to make soup objects

---
 linkcheck/HtmlParser/htmlsax.py |  4 ++++
 linkcheck/checker/httpurl.py    |  5 +----
 linkcheck/checker/urlbase.py    | 11 ++---------
 3 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py
index 5a9f7282..dac2e19d 100644
--- a/linkcheck/HtmlParser/htmlsax.py
+++ b/linkcheck/HtmlParser/htmlsax.py
@@ -27,6 +27,10 @@ filterwarnings("ignore",
 from bs4 import BeautifulSoup, Tag
 
 
+def make_soup(markup, from_encoding=None):
+    return BeautifulSoup(markup, "html.parser", from_encoding=from_encoding,
+                         multi_valued_attributes=None)
+
 class Parser(object):
     handler = None
     encoding = None
diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py
index d46e814b..53cce694 100644
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -17,7 +17,6 @@
 """
 Handle http links.
 """
-from bs4 import BeautifulSoup
 import requests
 # The validity of SSL certs is ignored to be able
 # the check the URL and recurse into it.
@@ -305,9 +304,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
     def get_content(self):
         if self.text is None:
             self.get_raw_content()
-            self.soup = BeautifulSoup(self.data, "html.parser",
-                                      multi_valued_attributes=None,
-                                      from_encoding=self.encoding)
+            self.soup = htmlsax.make_soup(self.data, self.encoding)
             self.text = self.data.decode(self.soup.original_encoding)
         return self.text
 
diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py
index ca924ad3..bb7debef 100644
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@@ -41,17 +41,11 @@ import select
 from io import BytesIO
 from builtins import str as str_text
 from future.utils import python_2_unicode_compatible
-from warnings import filterwarnings
-
-filterwarnings("ignore",
-    message="The soupsieve package is not installed. CSS selectors cannot be used.",
-    category=UserWarning, module="bs4")
-
-from bs4 import BeautifulSoup
 
 from . import absolute_url, get_url_from
 from .. import (log, LOG_CHECK,
   strformat, LinkCheckerError, url as urlutil, trace, get_link_pat)
+from ..HtmlParser import htmlsax
 from ..network import iputil
 from .const import (WARN_URL_EFFECTIVE_URL,
     WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
@@ -657,8 +651,7 @@ class UrlBase (object):
     def get_content (self):
         if self.text is None:
             self.get_raw_content()
-            self.soup = BeautifulSoup(self.data, "html.parser",
-                                      multi_valued_attributes=None)
+            self.soup = htmlsax.make_soup(self.data)
             self.text = self.data.decode(self.soup.original_encoding)
             self.encoding = self.soup.original_encoding
         return self.text

From 3771dd913671b2fceb5b36b19342dacaa4bbf1ea Mon Sep 17 00:00:00 2001
From: Chris Mayo <aklhfex@gmail.com>
Date: Wed, 8 Apr 2020 20:03:35 +0100
Subject: [PATCH 3/5] Use parser.feed_soup() instead of parser.feed()

Markup is not being passed in pieces to the parser, so simplify the
interface and reduce the state further.
---
 linkcheck/HtmlParser/htmlsax.py  | 13 -------
 linkcheck/htmlutil/formsearch.py |  2 +-
 tests/htmllib.py                 | 40 +--------------------
 tests/test_linkparser.py         | 10 +++---
 tests/test_parser.py             | 61 +++++---------------------------
 5 files changed, 15 insertions(+), 111 deletions(-)

diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py
index dac2e19d..768dc047 100644
--- a/linkcheck/HtmlParser/htmlsax.py
+++ b/linkcheck/HtmlParser/htmlsax.py
@@ -17,7 +17,6 @@
 HTML parser implemented using Beautiful Soup and html.parser.
 """
 
-from io import BytesIO, StringIO
 from warnings import filterwarnings
 
 filterwarnings("ignore",
@@ -39,20 +38,11 @@ class Parser(object):
         self.handler = handler
         self.reset()
 
-    def feed(self, feed_text):
-        if not self.html_doc:
-            if isinstance(feed_text, bytes):
-                self.html_doc = BytesIO()
-            else:
-                self.html_doc = StringIO()
-        self.html_doc.write(feed_text)
-
     def feed_soup(self, soup):
         self.soup = soup
 
     def reset(self):
         self.soup = None
-        self.html_doc = None
 
     def parse_contents(self, contents):
         for content in contents:
@@ -75,9 +65,6 @@ class Parser(object):
                         self.handler.end_element(content.name)
 
     def flush(self):
-        if self.soup is None:
-            self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser',
-                                      multi_valued_attributes=None)
         if hasattr(self.soup, 'contents'):
             self.parse_contents(self.soup.contents)
         self.encoding = self.soup.original_encoding
diff --git a/linkcheck/htmlutil/formsearch.py b/linkcheck/htmlutil/formsearch.py
index e530791c..d54313f2 100644
--- a/linkcheck/htmlutil/formsearch.py
+++ b/linkcheck/htmlutil/formsearch.py
@@ -85,7 +85,7 @@ def search_form(content, cgiuser, cgipassword):
     handler = FormFinder()
     parser = htmlsax.parser(handler)
     # parse
-    parser.feed(content)
+    parser.feed_soup(htmlsax.make_soup(content))
     parser.flush()
     log.debug(LOG_CHECK, "Found forms %s", handler.forms)
     cginames = (cgiuser.lower(), cgipassword.lower())
diff --git a/tests/htmllib.py b/tests/htmllib.py
index a42c4e6e..ab16ac46 100644
--- a/tests/htmllib.py
+++ b/tests/htmllib.py
@@ -15,50 +15,12 @@
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 """
-Default HTML parser handler classes.
+HTML parser handler test class.
 """
 
 import sys
 
 
-class HtmlPrinter:
-    """
-    Handles all functions by printing the function name and attributes.
-    """
-
-    def __init__ (self, fd=sys.stdout):
-        """
-        Write to given file descriptor.
-
-        @param fd: file like object (default=sys.stdout)
-        @type fd: file
-        """
-        self.fd = fd
-
-    def _print (self, *attrs):
-        """
-        Print function attributes to stored file descriptor.
-
-        @param attrs: list of values to print
-        @type attrs: tuple
-        @return: None
-        """
-        self.fd.write(self.mem)
-        self.fd.write(str(attrs))
-
-    def __getattr__ (self, name):
-        """
-        Remember the called method name in self.mem.
-
-        @param name: attribute name
-        @type name: string
-        @return: method which just prints out its arguments
-        @rtype: a bound function object
-        """
-        self.mem = name
-        return self._print
-
-
 class HtmlPrettyPrinter:
     """
     Print out all parsed HTML data in encoded form.
diff --git a/tests/test_linkparser.py b/tests/test_linkparser.py
index a6d1f9b4..9dcbb0c2 100644
--- a/tests/test_linkparser.py
+++ b/tests/test_linkparser.py
@@ -20,7 +20,7 @@ Test linkparser routines.
 
 import unittest
 from linkcheck.htmlutil import linkparse
-import linkcheck.HtmlParser.htmlsax
+from linkcheck.HtmlParser import htmlsax
 
 
 class TestLinkparser (unittest.TestCase):
@@ -31,9 +31,9 @@ class TestLinkparser (unittest.TestCase):
     def _test_one_link (self, content, url):
         self.count_url = 0
         h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
-        p = linkcheck.HtmlParser.htmlsax.parser(h)
+        p = htmlsax.parser(h)
         try:
-            p.feed(content)
+            p.feed_soup(htmlsax.make_soup(content))
             p.flush()
         except linkparse.StopParse:
             pass
@@ -50,9 +50,9 @@ class TestLinkparser (unittest.TestCase):
         def callback (url, line, column, name, base):
             self.assertTrue(False, 'URL %r found' % url)
         h = linkparse.LinkFinder(callback, linkparse.LinkTags)
-        p = linkcheck.HtmlParser.htmlsax.parser(h)
+        p = htmlsax.parser(h)
         try:
-            p.feed(content)
+            p.feed_soup(htmlsax.make_soup(content))
             p.flush()
         except linkparse.StopParse:
             pass
diff --git a/tests/test_parser.py b/tests/test_parser.py
index fc831361..109f21fa 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -18,14 +18,14 @@
 Test html parsing.
 """
 
-import linkcheck.HtmlParser.htmlsax
+from linkcheck.HtmlParser import htmlsax
 
 from io import StringIO
 import unittest
 
 from parameterized import parameterized
 
-from .htmllib import HtmlPrinter, HtmlPrettyPrinter
+from .htmllib import HtmlPrettyPrinter
 
 # list of tuples
 # (<test pattern>, <expected parse output>)
@@ -137,21 +137,14 @@ class TestParser (unittest.TestCase):
     Test html parser.
     """
 
-    def setUp (self):
-        """
-        Initialize two internal html parsers to be used for testing.
-        """
-        self.htmlparser = linkcheck.HtmlParser.htmlsax.parser()
-        self.htmlparser2 = linkcheck.HtmlParser.htmlsax.parser()
-
     @parameterized.expand(parsetests)
     def test_parse (self, _in, _out):
         # Parse all test patterns in one go.
         out = StringIO()
         handler = HtmlPrettyPrinter(out)
-        self.htmlparser.handler = handler
-        self.htmlparser.feed(_in)
-        self.check_results(self.htmlparser, _in, _out, out)
+        parser = htmlsax.parser(handler)
+        parser.feed_soup(htmlsax.make_soup(_in))
+        self.check_results(_in, _out, out)
 
     def check_results (self, htmlparser, _in, _out, out):
         """
@@ -164,44 +157,6 @@ class TestParser (unittest.TestCase):
         self.assertEqual(res, _out, msg=msg)
         htmlparser.reset()
 
-    @parameterized.expand(parsetests)
-    def test_feed (self, _in, _out):
-        # Parse all test patterns sequentially.
-        out = StringIO()
-        handler = HtmlPrettyPrinter(out)
-        self.htmlparser.handler = handler
-        for c in _in:
-            self.htmlparser.feed(c)
-        self.check_results(self.htmlparser, _in, _out, out)
-
-    @parameterized.expand(parsetests)
-    def test_interwoven (self, _in, _out):
-        # Parse all test patterns on two parsers interwoven.
-        out = StringIO()
-        out2 = StringIO()
-        handler = HtmlPrettyPrinter(out)
-        self.htmlparser.handler = handler
-        handler2 = HtmlPrettyPrinter(out2)
-        self.htmlparser2.handler = handler2
-        for c in _in:
-            self.htmlparser.feed(c)
-            self.htmlparser2.feed(c)
-        self.check_results(self.htmlparser, _in, _out, out)
-        self.check_results(self.htmlparser2, _in, _out, out2)
-
-    @parameterized.expand(parsetests)
-    def test_handler (self, _in, _out):
-        out = StringIO()
-        out2 = StringIO()
-        handler = HtmlPrinter(out)
-        self.htmlparser.handler = handler
-        handler2 = HtmlPrinter(out2)
-        self.htmlparser2.handler = handler2
-        for c in _in:
-            self.htmlparser.feed(c)
-            self.htmlparser2.feed(c)
-        self.assertEqual(out.getvalue(), out2.getvalue())
-
     def test_encoding_detection_utf_content (self):
         html = b'<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
         self.encoding_test(html, "utf-8")
@@ -227,11 +182,11 @@ class TestParser (unittest.TestCase):
         self.encoding_test(html, "ascii")
 
     def encoding_test (self, html, expected):
-        parser = linkcheck.HtmlParser.htmlsax.parser()
+        parser = htmlsax.parser()
         self.assertEqual(parser.encoding, None)
         out = StringIO()
         handler = HtmlPrettyPrinter(out)
-        parser.handler = handler
-        parser.feed(html)
+        parser = htmlsax.parser(handler)
+        parser.feed_soup(htmlsax.make_soup(html))
         parser.flush()
         self.assertEqual(parser.encoding, expected)

From 02e1c389b2c10ad3124fb2e2a8b021d41267772e Mon Sep 17 00:00:00 2001
From: Chris Mayo <aklhfex@gmail.com>
Date: Wed, 8 Apr 2020 20:03:35 +0100
Subject: [PATCH 4/5] Remove parser flush() and reset()

flush() and reset() are remnants of the feed() interface;
feed_soup() now parses the soup directly.
---
 linkcheck/HtmlParser/htmlsax.py  | 12 ++----------
 linkcheck/checker/httpurl.py     |  1 -
 linkcheck/htmlutil/formsearch.py |  1 -
 linkcheck/parser/__init__.py     |  1 -
 tests/test_linkparser.py         |  2 --
 tests/test_parser.py             |  5 +----
 6 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py
index 768dc047..b4c6f460 100644
--- a/linkcheck/HtmlParser/htmlsax.py
+++ b/linkcheck/HtmlParser/htmlsax.py
@@ -36,13 +36,10 @@ class Parser(object):
 
     def __init__(self, handler):
         self.handler = handler
-        self.reset()
 
     def feed_soup(self, soup):
-        self.soup = soup
-
-    def reset(self):
-        self.soup = None
+        self.parse_contents(soup.contents)
+        self.encoding = soup.original_encoding
 
     def parse_contents(self, contents):
         for content in contents:
@@ -64,11 +61,6 @@ class Parser(object):
                     if hasattr(self.handler, 'end_element'):
                         self.handler.end_element(content.name)
 
-    def flush(self):
-        if hasattr(self.soup, 'contents'):
-            self.parse_contents(self.soup.contents)
-        self.encoding = self.soup.original_encoding
-
 
 def parser(handler=None):
     return Parser(handler)
diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py
index 53cce694..46dab657 100644
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -85,7 +85,6 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         # parse
         try:
             parser.feed_soup(self.get_soup())
-            parser.flush()
         except linkparse.StopParse as msg:
             log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
             pass
diff --git a/linkcheck/htmlutil/formsearch.py b/linkcheck/htmlutil/formsearch.py
index d54313f2..eca99ed6 100644
--- a/linkcheck/htmlutil/formsearch.py
+++ b/linkcheck/htmlutil/formsearch.py
@@ -86,7 +86,6 @@ def search_form(content, cgiuser, cgipassword):
     parser = htmlsax.parser(handler)
     # parse
     parser.feed_soup(htmlsax.make_soup(content))
-    parser.flush()
     log.debug(LOG_CHECK, "Found forms %s", handler.forms)
     cginames = (cgiuser.lower(), cgipassword.lower())
     for form in handler.forms:
diff --git a/linkcheck/parser/__init__.py b/linkcheck/parser/__init__.py
index c9bf471f..b35892a8 100644
--- a/linkcheck/parser/__init__.py
+++ b/linkcheck/parser/__init__.py
@@ -124,7 +124,6 @@ def find_links (url_data, callback, tags):
     try:
         soup = url_data.get_soup()
         parser.feed_soup(soup)
-        parser.flush()
     except linkparse.StopParse as msg:
         log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
         pass
diff --git a/tests/test_linkparser.py b/tests/test_linkparser.py
index 9dcbb0c2..e0962f7b 100644
--- a/tests/test_linkparser.py
+++ b/tests/test_linkparser.py
@@ -34,7 +34,6 @@ class TestLinkparser (unittest.TestCase):
         p = htmlsax.parser(h)
         try:
             p.feed_soup(htmlsax.make_soup(content))
-            p.flush()
         except linkparse.StopParse:
             pass
         self.assertEqual(self.count_url, 1)
@@ -53,7 +52,6 @@ class TestLinkparser (unittest.TestCase):
         p = htmlsax.parser(h)
         try:
             p.feed_soup(htmlsax.make_soup(content))
-            p.flush()
         except linkparse.StopParse:
             pass
 
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 109f21fa..42c048e1 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -146,16 +146,14 @@ class TestParser (unittest.TestCase):
         parser.feed_soup(htmlsax.make_soup(_in))
         self.check_results(_in, _out, out)
 
-    def check_results (self, htmlparser, _in, _out, out):
+    def check_results (self, _in, _out, out):
         """
         Check parse results.
         """
-        htmlparser.flush()
         res = out.getvalue()
         msg = "Test error; in: %r, out: %r, expect: %r" % \
            (_in, res, _out)
         self.assertEqual(res, _out, msg=msg)
-        htmlparser.reset()
 
     def test_encoding_detection_utf_content (self):
         html = b'<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
@@ -188,5 +186,4 @@ class TestParser (unittest.TestCase):
         handler = HtmlPrettyPrinter(out)
         parser = htmlsax.parser(handler)
         parser.feed_soup(htmlsax.make_soup(html))
-        parser.flush()
         self.assertEqual(parser.encoding, expected)

From 974915cc4f733fb314a8b8fd001e8abcf56ac9ef Mon Sep 17 00:00:00 2001
From: Chris Mayo <aklhfex@gmail.com>
Date: Wed, 8 Apr 2020 20:03:35 +0100
Subject: [PATCH 5/5] Remove encoding from Parser

Parser.encoding was only used by the test, and the same value is
available as the original_encoding attribute of the soup object.
---
 linkcheck/HtmlParser/htmlsax.py | 2 --
 tests/test_parser.py            | 7 +++----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py
index b4c6f460..df35d722 100644
--- a/linkcheck/HtmlParser/htmlsax.py
+++ b/linkcheck/HtmlParser/htmlsax.py
@@ -32,14 +32,12 @@ def make_soup(markup, from_encoding=None):
 
 class Parser(object):
     handler = None
-    encoding = None
 
     def __init__(self, handler):
         self.handler = handler
 
     def feed_soup(self, soup):
         self.parse_contents(soup.contents)
-        self.encoding = soup.original_encoding
 
     def parse_contents(self, contents):
         for content in contents:
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 42c048e1..7e087082 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -180,10 +180,9 @@ class TestParser (unittest.TestCase):
         self.encoding_test(html, "ascii")
 
     def encoding_test (self, html, expected):
-        parser = htmlsax.parser()
-        self.assertEqual(parser.encoding, None)
         out = StringIO()
         handler = HtmlPrettyPrinter(out)
         parser = htmlsax.parser(handler)
-        parser.feed_soup(htmlsax.make_soup(html))
-        self.assertEqual(parser.encoding, expected)
+        soup = htmlsax.make_soup(html)
+        parser.feed_soup(soup)
+        self.assertEqual(soup.original_encoding, expected)