Stop using HTML handlers

LinkFinder is the only remaining HTML handler, so there is no need for
htmlsoup.process_soup() as an independent function or for TagFinder as a
base class.
This commit is contained in:
Chris Mayo 2020-04-29 20:07:00 +01:00
parent a1433767e5
commit 9eed070a73
5 changed files with 23 additions and 69 deletions

View file

@@ -15,35 +15,6 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
HTML parser implemented using Beautiful Soup and html.parser.
USAGE
Two functions are provided, one to make a BeautifulSoup object from markup and
another to call a handler's callbacks for each element in a BeautifulSoup
object it can process.
The callback used by a handler is:
- Start tag: <tag {attr1:value1, attr2:value2, ..}>
def start_element (tag, attrs, text, line, column)
@param tag: tag name
@type tag: string
@param attrs: tag attributes
@type attrs: dict
@param text: element text
@type text: string
@param line: tag line number
@type line: integer
@param column: tag column number
@type column: integer
EXAMPLE
# Create a new BeautifulSoup object.
soup = htmlutil.htmlsoup.make_soup("<html><body>Blubb</body></html>")
# Process the soup with the chosen handler as a parameter.
htmlutil.htmlsoup.process_soup(handler, soup)
"""
from warnings import filterwarnings
@@ -58,10 +29,3 @@ from bs4 import BeautifulSoup
def make_soup(markup, from_encoding=None):
return BeautifulSoup(markup, "html.parser", from_encoding=from_encoding,
multi_valued_attributes=None)
def process_soup(handler, soup):
for element in soup.find_all(True):
handler.start_element(
element.name, element.attrs, element.text.strip(),
element.sourceline,
None if element.sourcepos is None else element.sourcepos + 1)

View file

@@ -98,19 +98,6 @@ def strip_c_comments (text):
return c_comment_re.sub('', text)
class TagFinder (object):
"""Base class handling HTML start elements.
TagFinder instances are used as HTML parser handlers."""
def __init__ (self):
"""Initialize local variables."""
super(TagFinder, self).__init__()
def start_element (self, tag, attrs, element_text, lineno, column):
"""Does nothing, override in a subclass."""
pass
def is_meta_url (attr, attrs):
"""Check if the meta attributes contain a URL."""
res = False
@@ -133,13 +120,12 @@ def is_form_get(attr, attrs):
return res
class LinkFinder (TagFinder):
class LinkFinder:
"""Find HTML links, and apply them to the callback function with the
format (url, lineno, column, name, codebase)."""
def __init__ (self, callback, tags):
"""Store content in buffer and initialize URL list."""
super(LinkFinder, self).__init__()
self.callback = callback
# set universal tag attributes using tagname None
self.universal_attrs = set(tags.get(None, []))
@@ -150,7 +136,7 @@ class LinkFinder (TagFinder):
self.tags[tag].update(self.universal_attrs)
self.base_ref = u''
def start_element (self, tag, attrs, element_text, lineno, column):
def html_element (self, tag, attrs, element_text, lineno, column):
"""Search for links and store found URLs in a list."""
log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
log.debug(LOG_CHECK, "line %d col %d", lineno, column)
@@ -226,3 +212,15 @@ class LinkFinder (TagFinder):
"""Add newly found URL to queue."""
assert isinstance(url, str_text) or url is None, repr(url)
self.callback(url, line=lineno, column=column, name=name, base=base)
def find_links(soup, callback, tags):
"""Parse into content and search for URLs to check.
When a URL is found it is passed to the supplied callback.
"""
lf = LinkFinder(callback, tags)
for element in soup.find_all(True):
lf.html_element(
element.name, element.attrs, element.text.strip(),
element.sourceline,
None if element.sourcepos is None else element.sourcepos + 1)

View file

@@ -18,7 +18,7 @@
Main functions for link parsing
"""
from .. import log, LOG_CHECK, strformat, url as urlutil
from ..htmlutil import htmlsoup, linkparse
from ..htmlutil import linkparse
from ..bookmarks import firefox
@@ -46,7 +46,7 @@ def parse_html (url_data):
"""Parse into HTML content and search for URLs to check.
Found URLs are added to the URL queue.
"""
find_links(url_data, url_data.add_url, linkparse.LinkTags)
linkparse.find_links(url_data.get_soup(), url_data.add_url, linkparse.LinkTags)
def parse_opera (url_data):
@@ -112,15 +112,7 @@ def parse_wml (url_data):
"""Parse into WML content and search for URLs to check.
Found URLs are added to the URL queue.
"""
find_links(url_data, url_data.add_url, linkparse.WmlTags)
def find_links (url_data, callback, tags):
"""Parse into content and search for URLs to check.
Found URLs are added to the URL queue.
"""
handler = linkparse.LinkFinder(callback, tags)
htmlsoup.process_soup(handler, url_data.get_soup())
linkparse.find_links(url_data.get_soup(), url_data.add_url, linkparse.WmlTags)
def parse_firefox (url_data):

View file

@@ -22,7 +22,6 @@ from urllib import parse
from . import _ContentPlugin
from .. import log, LOG_PLUGIN
from ..htmlutil import linkparse
from ..parser import find_links
class AnchorCheck(_ContentPlugin):
@@ -37,7 +36,8 @@ class AnchorCheck(_ContentPlugin):
log.debug(LOG_PLUGIN, "checking content for invalid anchors")
# list of parsed anchors
self.anchors = []
find_links(url_data, self.add_anchor, linkparse.AnchorTags)
linkparse.find_links(url_data.get_soup(), self.add_anchor,
linkparse.AnchorTags)
self.check_anchor(url_data)
def add_anchor (self, url, line, column, name, base):

View file

@@ -29,8 +29,8 @@ class TestLinkparser (unittest.TestCase):
def _test_one_link (self, content, url):
self.count_url = 0
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
htmlsoup.process_soup(h, htmlsoup.make_soup(content))
linkparse.find_links(htmlsoup.make_soup(content),
self._test_one_url(url), linkparse.LinkTags)
self.assertEqual(self.count_url, 1)
def _test_one_url (self, origurl):
@@ -43,8 +43,8 @@ class TestLinkparser (unittest.TestCase):
def _test_no_link (self, content):
def callback (url, line, column, name, base):
self.assertTrue(False, 'URL %r found' % url)
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
htmlsoup.process_soup(h, htmlsoup.make_soup(content))
linkparse.find_links(htmlsoup.make_soup(content), callback,
linkparse.LinkTags)
def test_href_parsing (self):
# Test <a href> parsing.