Move HtmlParser/htmlsax.py to htmlutil/htmlsoup.py

Remove the HtmlParser subpackage, and consolidate import lines in files
where htmlutil.linkparse is also used.
This commit is contained in:
Chris Mayo 2020-04-10 16:19:33 +01:00
parent 0f18c9b8f0
commit ee6628a831
10 changed files with 46 additions and 70 deletions

View file

@ -1,49 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
HTML parser module.
USAGE
Two functions are provided, one to make a BeautifulSoup object from markup and
another to call a handler's callback for each element in a BeautifulSoup
object it can process.
The used callback of a handler is:
- Start tag: <tag {attr1:value1, attr2:value2, ..}>
def start_element (tag, attrs, text, line, column)
@param tag: tag name
@type tag: string
@param attrs: tag attributes
@type attrs: dict
@param text: element text
@type tag: string
@param line: tag line number
@type tag: integer
@param column: tag column number
@type tag: integer
EXAMPLE
# Create a new BeautifulSoup object.
soup = HtmlParser.htmlsax.make_soup("<html><body>Blubb</body></html>")
# Process the soup with the chosen handler as a parameter.
HtmlParser.htmlsax.proces_soup(handler, soup)
"""

View file

@ -30,8 +30,7 @@ from io import BytesIO
from .. import (log, LOG_CHECK, strformat, mimeutil,
url as urlutil, LinkCheckerError, httputil)
from . import (internpaturl, proxysupport)
from ..HtmlParser import htmlsax
from ..htmlutil import linkparse
from ..htmlutil import htmlsoup, linkparse
# import warnings
from .const import WARN_HTTP_EMPTY_CONTENT
from requests.sessions import REDIRECT_STATI
@ -83,7 +82,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
handler = linkparse.MetaRobotsFinder()
# parse
try:
htmlsax.process_soup(handler, self.get_soup())
htmlsoup.process_soup(handler, self.get_soup())
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
pass
@ -302,7 +301,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
def get_content(self):
if self.text is None:
self.get_raw_content()
self.soup = htmlsax.make_soup(self.data, self.encoding)
self.soup = htmlsoup.make_soup(self.data, self.encoding)
self.text = self.data.decode(self.soup.original_encoding)
return self.text

View file

@ -45,7 +45,7 @@ from future.utils import python_2_unicode_compatible
from . import absolute_url, get_url_from
from .. import (log, LOG_CHECK,
strformat, LinkCheckerError, url as urlutil, trace, get_link_pat)
from ..HtmlParser import htmlsax
from ..htmlutil import htmlsoup
from ..network import iputil
from .const import (WARN_URL_EFFECTIVE_URL,
WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
@ -651,7 +651,7 @@ class UrlBase (object):
def get_content (self):
if self.text is None:
self.get_raw_content()
self.soup = htmlsax.make_soup(self.data)
self.soup = htmlsoup.make_soup(self.data)
self.text = self.data.decode(self.soup.original_encoding)
self.encoding = self.soup.original_encoding
return self.text

View file

@ -17,7 +17,7 @@
"""
HTML form utils
"""
from ..HtmlParser import htmlsax
from ..htmlutil import htmlsoup
from .. import log, LOG_CHECK
class Form(object):
@ -41,7 +41,7 @@ def search_form(content, cgiuser, cgipassword):
"""Search for a HTML form in the given HTML content that has the given
CGI fields. If no form is found return None.
"""
soup = htmlsax.make_soup(content)
soup = htmlsoup.make_soup(content)
# The value of the name attribute is case-insensitive
# https://www.w3.org/TR/html401/interact/forms.html#adef-name-INPUT
cginames = {cgiuser.lower(), cgipassword.lower()}

View file

@ -15,6 +15,35 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
HTML parser implemented using Beautiful Soup and html.parser.
USAGE
Two functions are provided, one to make a BeautifulSoup object from markup and
another to call a handler's callbacks for each element in a BeautifulSoup
object it can process.
The used callback of a handler is:
- Start tag: <tag {attr1:value1, attr2:value2, ..}>
def start_element (tag, attrs, text, line, column)
@param tag: tag name
@type tag: string
@param attrs: tag attributes
@type attrs: dict
@param text: element text
@type text: string
@param line: tag line number
@type line: integer
@param column: tag column number
@type column: integer
EXAMPLE
# Create a new BeautifulSoup object.
soup = htmlutil.htmlsoup.make_soup("<html><body>Blubb</body></html>")
# Process the soup with the chosen handler as a parameter.
htmlutil.htmlsoup.process_soup(handler, soup)
"""
from warnings import filterwarnings

View file

@ -99,7 +99,7 @@ class StopParse(Exception):
class TagFinder (object):
"""Base class handling HTML start elements.
TagFinder instances are used as HtmlParser handlers."""
TagFinder instances are used as HTML parser handlers."""
def __init__ (self):
"""Initialize local variables."""

View file

@ -18,8 +18,7 @@
Main functions for link parsing
"""
from .. import log, LOG_CHECK, strformat, url as urlutil
from ..htmlutil import linkparse
from ..HtmlParser import htmlsax
from ..htmlutil import htmlsoup, linkparse
from ..bookmarks import firefox
@ -121,7 +120,7 @@ def find_links (url_data, callback, tags):
handler = linkparse.LinkFinder(callback, tags)
# parse
try:
htmlsax.process_soup(handler, url_data.get_soup())
htmlsoup.process_soup(handler, url_data.get_soup())
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
pass

View file

@ -382,7 +382,6 @@ setup(
'linkcheck.configuration',
'linkcheck.director',
'linkcheck.htmlutil',
'linkcheck.HtmlParser',
'linkcheck.logger',
'linkcheck.network',
'linkcheck.parser',

View file

@ -19,8 +19,7 @@ Test linkparser routines.
"""
import unittest
from linkcheck.htmlutil import linkparse
from linkcheck.HtmlParser import htmlsax
from linkcheck.htmlutil import htmlsoup, linkparse
class TestLinkparser (unittest.TestCase):
@ -32,7 +31,7 @@ class TestLinkparser (unittest.TestCase):
self.count_url = 0
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
try:
htmlsax.process_soup(h, htmlsax.make_soup(content))
htmlsoup.process_soup(h, htmlsoup.make_soup(content))
except linkparse.StopParse:
pass
self.assertEqual(self.count_url, 1)
@ -49,7 +48,7 @@ class TestLinkparser (unittest.TestCase):
self.assertTrue(False, 'URL %r found' % url)
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
try:
htmlsax.process_soup(h, htmlsax.make_soup(content))
htmlsoup.process_soup(h, htmlsoup.make_soup(content))
except linkparse.StopParse:
pass

View file

@ -18,7 +18,7 @@
Test html parsing.
"""
from linkcheck.HtmlParser import htmlsax
from linkcheck.htmlutil import htmlsoup
from io import StringIO
import unittest
@ -143,7 +143,7 @@ class TestParser (unittest.TestCase):
# Parse all test patterns in one go.
out = StringIO()
handler = HtmlPrettyPrinter(out)
htmlsax.process_soup(handler, htmlsax.make_soup(_in))
htmlsoup.process_soup(handler, htmlsoup.make_soup(_in))
self.check_results(_in, _out, out)
def check_results (self, _in, _out, out):
@ -182,6 +182,6 @@ class TestParser (unittest.TestCase):
def encoding_test (self, html, expected):
out = StringIO()
handler = HtmlPrettyPrinter(out)
soup = htmlsax.make_soup(html)
htmlsax.process_soup(handler, soup)
soup = htmlsoup.make_soup(html)
htmlsoup.process_soup(handler, soup)
self.assertEqual(soup.original_encoding, expected)