Stop using HTML handlers

LinkFinder is the only remaining HTML handler, so there is no need for
htmlsoup.process_soup() as an independent function or for TagFinder as a
base class.
This commit is contained in:
Chris Mayo 2020-04-29 20:07:00 +01:00
parent a1433767e5
commit 9eed070a73
5 changed files with 23 additions and 69 deletions

View file

@@ -15,35 +15,6 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
HTML parser implemented using Beautiful Soup and html.parser.
USAGE
Two functions are provided, one to make a BeautifulSoup object from markup and
another to call a handler's callbacks for each element in a BeautifulSoup
object it can process.
The callback used by a handler is:
- Start tag: <tag {attr1:value1, attr2:value2, ..}>
def start_element (tag, attrs, text, line, column)
@param tag: tag name
@type tag: string
@param attrs: tag attributes
@type attrs: dict
@param text: element text
@type text: string
@param line: tag line number
@type line: integer
@param column: tag column number
@type column: integer
EXAMPLE
# Create a new BeautifulSoup object.
soup = htmlutil.htmlsoup.make_soup("<html><body>Blubb</body></html>")
# Process the soup with the chosen handler as a parameter.
htmlutil.htmlsoup.process_soup(handler, soup)
"""
from warnings import filterwarnings
@@ -58,10 +29,3 @@ from bs4 import BeautifulSoup
def make_soup(markup, from_encoding=None):
return BeautifulSoup(markup, "html.parser", from_encoding=from_encoding,
multi_valued_attributes=None)
def process_soup(handler, soup):
for element in soup.find_all(True):
handler.start_element(
element.name, element.attrs, element.text.strip(),
element.sourceline,
None if element.sourcepos is None else element.sourcepos + 1)

View file

@@ -98,19 +98,6 @@ def strip_c_comments (text):
return c_comment_re.sub('', text)
class TagFinder (object):
"""Base class handling HTML start elements.
TagFinder instances are used as HTML parser handlers."""
def __init__ (self):
"""Initialize local variables."""
super(TagFinder, self).__init__()
def start_element (self, tag, attrs, element_text, lineno, column):
"""Does nothing, override in a subclass."""
pass
def is_meta_url (attr, attrs):
"""Check if the meta attributes contain a URL."""
res = False
@@ -133,13 +120,12 @@ def is_form_get(attr, attrs):
return res
class LinkFinder (TagFinder):
class LinkFinder:
"""Find HTML links, and apply them to the callback function with the
format (url, lineno, column, name, codebase)."""
def __init__ (self, callback, tags):
"""Store content in buffer and initialize URL list."""
super(LinkFinder, self).__init__()
self.callback = callback
# set universal tag attributes using tagname None
self.universal_attrs = set(tags.get(None, []))
@@ -150,7 +136,7 @@ class LinkFinder (TagFinder):
self.tags[tag].update(self.universal_attrs)
self.base_ref = u''
def start_element (self, tag, attrs, element_text, lineno, column):
def html_element (self, tag, attrs, element_text, lineno, column):
"""Search for links and store found URLs in a list."""
log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
log.debug(LOG_CHECK, "line %d col %d", lineno, column)
@@ -226,3 +212,15 @@ class LinkFinder (TagFinder):
"""Add newly found URL to queue."""
assert isinstance(url, str_text) or url is None, repr(url)
self.callback(url, line=lineno, column=column, name=name, base=base)
def find_links(soup, callback, tags):
"""Parse into content and search for URLs to check.
When a URL is found it is passed to the supplied callback.
"""
lf = LinkFinder(callback, tags)
for element in soup.find_all(True):
lf.html_element(
element.name, element.attrs, element.text.strip(),
element.sourceline,
None if element.sourcepos is None else element.sourcepos + 1)

View file

@@ -18,7 +18,7 @@
Main functions for link parsing
"""
from .. import log, LOG_CHECK, strformat, url as urlutil
from ..htmlutil import htmlsoup, linkparse
from ..htmlutil import linkparse
from ..bookmarks import firefox
@@ -46,7 +46,7 @@ def parse_html (url_data):
"""Parse into HTML content and search for URLs to check.
Found URLs are added to the URL queue.
"""
find_links(url_data, url_data.add_url, linkparse.LinkTags)
linkparse.find_links(url_data.get_soup(), url_data.add_url, linkparse.LinkTags)
def parse_opera (url_data):
@@ -112,15 +112,7 @@ def parse_wml (url_data):
"""Parse into WML content and search for URLs to check.
Found URLs are added to the URL queue.
"""
find_links(url_data, url_data.add_url, linkparse.WmlTags)
def find_links (url_data, callback, tags):
"""Parse into content and search for URLs to check.
Found URLs are added to the URL queue.
"""
handler = linkparse.LinkFinder(callback, tags)
htmlsoup.process_soup(handler, url_data.get_soup())
linkparse.find_links(url_data.get_soup(), url_data.add_url, linkparse.WmlTags)
def parse_firefox (url_data):

View file

@@ -22,7 +22,6 @@ from urllib import parse
from . import _ContentPlugin
from .. import log, LOG_PLUGIN
from ..htmlutil import linkparse
from ..parser import find_links
class AnchorCheck(_ContentPlugin):
@@ -37,7 +36,8 @@ class AnchorCheck(_ContentPlugin):
log.debug(LOG_PLUGIN, "checking content for invalid anchors")
# list of parsed anchors
self.anchors = []
find_links(url_data, self.add_anchor, linkparse.AnchorTags)
linkparse.find_links(url_data.get_soup(), self.add_anchor,
linkparse.AnchorTags)
self.check_anchor(url_data)
def add_anchor (self, url, line, column, name, base):

View file

@@ -29,8 +29,8 @@ class TestLinkparser (unittest.TestCase):
def _test_one_link (self, content, url):
self.count_url = 0
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
htmlsoup.process_soup(h, htmlsoup.make_soup(content))
linkparse.find_links(htmlsoup.make_soup(content),
self._test_one_url(url), linkparse.LinkTags)
self.assertEqual(self.count_url, 1)
def _test_one_url (self, origurl):
@@ -43,8 +43,8 @@ class TestLinkparser (unittest.TestCase):
def _test_no_link (self, content):
def callback (url, line, column, name, base):
self.assertTrue(False, 'URL %r found' % url)
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
htmlsoup.process_soup(h, htmlsoup.make_soup(content))
linkparse.find_links(htmlsoup.make_soup(content), callback,
linkparse.LinkTags)
def test_href_parsing (self):
# Test <a href> parsing.