Move HtmlParser/htmlsax.py to htmlutil/htmlsoup.py

Remove the HtmlParser subpackage, and consolidate import lines in files
where htmlutil.linkparse is also used.
This commit is contained in:
Chris Mayo 2020-04-10 16:19:33 +01:00
parent 0f18c9b8f0
commit ee6628a831
10 changed files with 46 additions and 70 deletions

View file

@ -1,49 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
HTML parser module.
USAGE
Two functions are provided, one to make a BeautifulSoup object from markup and
another to call a handler's callback for each element in a BeautifulSoup
object it can process.
The used callback of a handler is:
- Start tag: <tag {attr1:value1, attr2:value2, ..}>
def start_element (tag, attrs, text, line, column)
@param tag: tag name
@type tag: string
@param attrs: tag attributes
@type attrs: dict
@param text: element text
@type tag: string
@param line: tag line number
@type tag: integer
@param column: tag column number
@type tag: integer
EXAMPLE
# Create a new BeautifulSoup object.
soup = HtmlParser.htmlsax.make_soup("<html><body>Blubb</body></html>")
# Process the soup with the chosen handler as a parameter.
HtmlParser.htmlsax.proces_soup(handler, soup)
"""

View file

@ -30,8 +30,7 @@ from io import BytesIO
from .. import (log, LOG_CHECK, strformat, mimeutil,
url as urlutil, LinkCheckerError, httputil)
from . import (internpaturl, proxysupport)
from ..HtmlParser import htmlsax
from ..htmlutil import linkparse
from ..htmlutil import htmlsoup, linkparse
# import warnings
from .const import WARN_HTTP_EMPTY_CONTENT
from requests.sessions import REDIRECT_STATI
@ -83,7 +82,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
handler = linkparse.MetaRobotsFinder()
# parse
try:
htmlsax.process_soup(handler, self.get_soup())
htmlsoup.process_soup(handler, self.get_soup())
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
pass
@ -302,7 +301,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
def get_content(self):
if self.text is None:
self.get_raw_content()
self.soup = htmlsax.make_soup(self.data, self.encoding)
self.soup = htmlsoup.make_soup(self.data, self.encoding)
self.text = self.data.decode(self.soup.original_encoding)
return self.text

View file

@ -45,7 +45,7 @@ from future.utils import python_2_unicode_compatible
from . import absolute_url, get_url_from
from .. import (log, LOG_CHECK,
strformat, LinkCheckerError, url as urlutil, trace, get_link_pat)
from ..HtmlParser import htmlsax
from ..htmlutil import htmlsoup
from ..network import iputil
from .const import (WARN_URL_EFFECTIVE_URL,
WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
@ -651,7 +651,7 @@ class UrlBase (object):
def get_content (self):
if self.text is None:
self.get_raw_content()
self.soup = htmlsax.make_soup(self.data)
self.soup = htmlsoup.make_soup(self.data)
self.text = self.data.decode(self.soup.original_encoding)
self.encoding = self.soup.original_encoding
return self.text

View file

@ -17,7 +17,7 @@
"""
HTML form utils
"""
from ..HtmlParser import htmlsax
from ..htmlutil import htmlsoup
from .. import log, LOG_CHECK
class Form(object):
@ -41,7 +41,7 @@ def search_form(content, cgiuser, cgipassword):
"""Search for a HTML form in the given HTML content that has the given
CGI fields. If no form is found return None.
"""
soup = htmlsax.make_soup(content)
soup = htmlsoup.make_soup(content)
# The value of the name attribute is case-insensitive
# https://www.w3.org/TR/html401/interact/forms.html#adef-name-INPUT
cginames = {cgiuser.lower(), cgipassword.lower()}

View file

@ -15,6 +15,35 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
HTML parser implemented using Beautiful Soup and html.parser.
USAGE
Two functions are provided, one to make a BeautifulSoup object from markup and
another to call a handler's callbacks for each element in a BeautifulSoup
object it can process.
The used callback of a handler is:
- Start tag: <tag {attr1:value1, attr2:value2, ..}>
def start_element (tag, attrs, text, line, column)
@param tag: tag name
@type tag: string
@param attrs: tag attributes
@type attrs: dict
@param text: element text
@type text: string
@param line: tag line number
@type line: integer
@param column: tag column number
@type column: integer
EXAMPLE
# Create a new BeautifulSoup object.
soup = htmlutil.htmlsoup.make_soup("<html><body>Blubb</body></html>")
# Process the soup with the chosen handler as a parameter.
htmlutil.htmlsoup.process_soup(handler, soup)
"""
from warnings import filterwarnings

View file

@ -99,7 +99,7 @@ class StopParse(Exception):
class TagFinder (object):
"""Base class handling HTML start elements.
TagFinder instances are used as HtmlParser handlers."""
TagFinder instances are used as HTML parser handlers."""
def __init__ (self):
"""Initialize local variables."""

View file

@ -18,8 +18,7 @@
Main functions for link parsing
"""
from .. import log, LOG_CHECK, strformat, url as urlutil
from ..htmlutil import linkparse
from ..HtmlParser import htmlsax
from ..htmlutil import htmlsoup, linkparse
from ..bookmarks import firefox
@ -121,7 +120,7 @@ def find_links (url_data, callback, tags):
handler = linkparse.LinkFinder(callback, tags)
# parse
try:
htmlsax.process_soup(handler, url_data.get_soup())
htmlsoup.process_soup(handler, url_data.get_soup())
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
pass

View file

@ -382,7 +382,6 @@ setup(
'linkcheck.configuration',
'linkcheck.director',
'linkcheck.htmlutil',
'linkcheck.HtmlParser',
'linkcheck.logger',
'linkcheck.network',
'linkcheck.parser',

View file

@ -19,8 +19,7 @@ Test linkparser routines.
"""
import unittest
from linkcheck.htmlutil import linkparse
from linkcheck.HtmlParser import htmlsax
from linkcheck.htmlutil import htmlsoup, linkparse
class TestLinkparser (unittest.TestCase):
@ -32,7 +31,7 @@ class TestLinkparser (unittest.TestCase):
self.count_url = 0
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
try:
htmlsax.process_soup(h, htmlsax.make_soup(content))
htmlsoup.process_soup(h, htmlsoup.make_soup(content))
except linkparse.StopParse:
pass
self.assertEqual(self.count_url, 1)
@ -49,7 +48,7 @@ class TestLinkparser (unittest.TestCase):
self.assertTrue(False, 'URL %r found' % url)
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
try:
htmlsax.process_soup(h, htmlsax.make_soup(content))
htmlsoup.process_soup(h, htmlsoup.make_soup(content))
except linkparse.StopParse:
pass

View file

@ -18,7 +18,7 @@
Test html parsing.
"""
from linkcheck.HtmlParser import htmlsax
from linkcheck.htmlutil import htmlsoup
from io import StringIO
import unittest
@ -143,7 +143,7 @@ class TestParser (unittest.TestCase):
# Parse all test patterns in one go.
out = StringIO()
handler = HtmlPrettyPrinter(out)
htmlsax.process_soup(handler, htmlsax.make_soup(_in))
htmlsoup.process_soup(handler, htmlsoup.make_soup(_in))
self.check_results(_in, _out, out)
def check_results (self, _in, _out, out):
@ -182,6 +182,6 @@ class TestParser (unittest.TestCase):
def encoding_test (self, html, expected):
out = StringIO()
handler = HtmlPrettyPrinter(out)
soup = htmlsax.make_soup(html)
htmlsax.process_soup(handler, soup)
soup = htmlsoup.make_soup(html)
htmlsoup.process_soup(handler, soup)
self.assertEqual(soup.original_encoding, expected)