Replace Parser class using BeautifulSoup.find_all()

This commit is contained in:
Chris Mayo 2020-04-09 20:15:15 +01:00
parent eb3cf28baa
commit 0795e3c1b4
6 changed files with 33 additions and 71 deletions

View file

@ -19,44 +19,31 @@ HTML parser module.
USAGE
First make a HTML SAX handler object. Missing callback functions are
ignored. The object returned from callbacks is also ignored.
Note that a missing attribute value is stored as the value None
in the ListDict (ie. "<a href>" will lead to a {href: None} dict entry).
Used callbacks of a handler are:
Two functions are provided, one to make a BeautifulSoup object from markup and
another to call a handler's callback for each element in a BeautifulSoup
object it can process.
The used callback of a handler is:
- Start tag: <tag {attr1:value1, attr2:value2, ..}>
def start_element (tag, attrs)
def start_element (tag, attrs, text, line, column)
@param tag: tag name
@type tag: Unicode string
@type tag: string
@param attrs: tag attributes
@type attrs: ListDict
Additionally, there are error and warning callbacks:
- Parser warning.
def warning (msg)
@param msg: warning message
@type msg: Unicode string
- Parser error.
def error (msg)
@param msg: error message
@type msg: Unicode string
- Fatal parser error
def fatal_error (msg)
@param msg: error message
@type msg: Unicode string
@type attrs: dict
@param text: element text
@type text: string
@param line: tag line number
@type line: integer
@param column: tag column number
@type column: integer
EXAMPLE
# Create a new HTML parser object with the handler as parameter.
parser = HtmlParser.htmlsax.parser(handler)
# Feed data.
parser.feed("<html><body>Blubb</body></html>")
# Flush for finishing things up.
parser.flush()
# Create a new BeautifulSoup object.
soup = HtmlParser.htmlsax.make_soup("<html><body>Blubb</body></html>")
# Process the soup with the chosen handler as a parameter.
HtmlParser.htmlsax.process_soup(handler, soup)
"""

View file

@ -23,34 +23,16 @@ filterwarnings("ignore",
message="The soupsieve package is not installed. CSS selectors cannot be used.",
category=UserWarning, module="bs4")
from bs4 import BeautifulSoup, Tag
from bs4 import BeautifulSoup
def make_soup(markup, from_encoding=None):
return BeautifulSoup(markup, "html.parser", from_encoding=from_encoding,
multi_valued_attributes=None)
class Parser(object):
handler = None
def __init__(self, handler):
self.handler = handler
def feed_soup(self, soup):
self.parse_contents(soup.contents)
def parse_contents(self, contents):
for content in contents:
if isinstance(content, Tag):
self.handler.start_element(
content.name, content.attrs, content.text.strip(),
content.sourceline,
None if content.sourcepos is None
else content.sourcepos + 1
)
if hasattr(content, 'contents'): # recursion
self.parse_contents(content.contents)
def parser(handler=None):
return Parser(handler)
def process_soup(handler, soup):
for element in soup.find_all(True):
handler.start_element(
element.name, element.attrs, element.text.strip(),
element.sourceline,
None if element.sourcepos is None else element.sourcepos + 1)

View file

@ -79,12 +79,11 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
if not self.is_html():
return True
# construct parser object
# construct handler object
handler = linkparse.MetaRobotsFinder()
parser = htmlsax.parser(handler)
# parse
try:
parser.feed_soup(self.get_soup())
htmlsax.process_soup(handler, self.get_soup())
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
pass

View file

@ -117,13 +117,11 @@ def find_links (url_data, callback, tags):
"""Parse into content and search for URLs to check.
Found URLs are added to the URL queue.
"""
# construct parser object
# construct handler object
handler = linkparse.LinkFinder(callback, tags)
parser = htmlsax.parser(handler)
# parse
try:
soup = url_data.get_soup()
parser.feed_soup(soup)
htmlsax.process_soup(handler, url_data.get_soup())
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
pass

View file

@ -31,9 +31,8 @@ class TestLinkparser (unittest.TestCase):
def _test_one_link (self, content, url):
self.count_url = 0
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
p = htmlsax.parser(h)
try:
p.feed_soup(htmlsax.make_soup(content))
htmlsax.process_soup(h, htmlsax.make_soup(content))
except linkparse.StopParse:
pass
self.assertEqual(self.count_url, 1)
@ -49,9 +48,8 @@ class TestLinkparser (unittest.TestCase):
def callback (url, line, column, name, base):
self.assertTrue(False, 'URL %r found' % url)
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
p = htmlsax.parser(h)
try:
p.feed_soup(htmlsax.make_soup(content))
htmlsax.process_soup(h, htmlsax.make_soup(content))
except linkparse.StopParse:
pass

View file

@ -143,8 +143,7 @@ class TestParser (unittest.TestCase):
# Parse all test patterns in one go.
out = StringIO()
handler = HtmlPrettyPrinter(out)
parser = htmlsax.parser(handler)
parser.feed_soup(htmlsax.make_soup(_in))
htmlsax.process_soup(handler, htmlsax.make_soup(_in))
self.check_results(_in, _out, out)
def check_results (self, _in, _out, out):
@ -183,7 +182,6 @@ class TestParser (unittest.TestCase):
def encoding_test (self, html, expected):
out = StringIO()
handler = HtmlPrettyPrinter(out)
parser = htmlsax.parser(handler)
soup = htmlsax.make_soup(html)
parser.feed_soup(soup)
htmlsax.process_soup(handler, soup)
self.assertEqual(soup.original_encoding, expected)