Replace Parser class using BeautifulSoup.find_all()

This commit is contained in:
Chris Mayo 2020-04-09 20:15:15 +01:00
parent eb3cf28baa
commit 0795e3c1b4
6 changed files with 33 additions and 71 deletions

View file

@ -19,44 +19,31 @@ HTML parser module.
USAGE
First make a HTML SAX handler object. Missing callback functions are
ignored. The object returned from callbacks is also ignored.
Note that a missing attribute value is stored as the value None
in the ListDict (ie. "<a href>" will lead to a {href: None} dict entry).
Used callbacks of a handler are:
Two functions are provided, one to make a BeautifulSoup object from markup and
another to call a handler's callback for each element in a BeautifulSoup
object it can process.
The used callback of a handler is:
- Start tag: <tag {attr1:value1, attr2:value2, ..}>
def start_element (tag, attrs)
def start_element (tag, attrs, text, line, column)
@param tag: tag name
@type tag: Unicode string
@type tag: string
@param attrs: tag attributes
@type attrs: ListDict
Additionally, there are error and warning callbacks:
- Parser warning.
def warning (msg)
@param msg: warning message
@type msg: Unicode string
- Parser error.
def error (msg)
@param msg: error message
@type msg: Unicode string
- Fatal parser error
def fatal_error (msg)
@param msg: error message
@type msg: Unicode string
@type attrs: dict
@param text: element text
@type text: string
@param line: tag line number
@type line: integer
@param column: tag column number
@type column: integer
EXAMPLE
# Create a new HTML parser object with the handler as parameter.
parser = HtmlParser.htmlsax.parser(handler)
# Feed data.
parser.feed("<html><body>Blubb</body></html>")
# Flush for finishing things up.
parser.flush()
# Create a new BeautifulSoup object.
soup = HtmlParser.htmlsax.make_soup("<html><body>Blubb</body></html>")
# Process the soup with the chosen handler as a parameter.
HtmlParser.htmlsax.process_soup(handler, soup)
"""

View file

@ -23,34 +23,16 @@ filterwarnings("ignore",
message="The soupsieve package is not installed. CSS selectors cannot be used.",
category=UserWarning, module="bs4")
from bs4 import BeautifulSoup, Tag
from bs4 import BeautifulSoup
def make_soup(markup, from_encoding=None):
return BeautifulSoup(markup, "html.parser", from_encoding=from_encoding,
multi_valued_attributes=None)
class Parser(object):
handler = None
def __init__(self, handler):
self.handler = handler
def feed_soup(self, soup):
self.parse_contents(soup.contents)
def parse_contents(self, contents):
for content in contents:
if isinstance(content, Tag):
self.handler.start_element(
content.name, content.attrs, content.text.strip(),
content.sourceline,
None if content.sourcepos is None
else content.sourcepos + 1
)
if hasattr(content, 'contents'): # recursion
self.parse_contents(content.contents)
def parser(handler=None):
return Parser(handler)
def process_soup(handler, soup):
for element in soup.find_all(True):
handler.start_element(
element.name, element.attrs, element.text.strip(),
element.sourceline,
None if element.sourcepos is None else element.sourcepos + 1)

View file

@ -79,12 +79,11 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
if not self.is_html():
return True
# construct parser object
# construct handler object
handler = linkparse.MetaRobotsFinder()
parser = htmlsax.parser(handler)
# parse
try:
parser.feed_soup(self.get_soup())
htmlsax.process_soup(handler, self.get_soup())
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
pass

View file

@ -117,13 +117,11 @@ def find_links (url_data, callback, tags):
"""Parse into content and search for URLs to check.
Found URLs are added to the URL queue.
"""
# construct parser object
# construct handler object
handler = linkparse.LinkFinder(callback, tags)
parser = htmlsax.parser(handler)
# parse
try:
soup = url_data.get_soup()
parser.feed_soup(soup)
htmlsax.process_soup(handler, url_data.get_soup())
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
pass

View file

@ -31,9 +31,8 @@ class TestLinkparser (unittest.TestCase):
def _test_one_link (self, content, url):
self.count_url = 0
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
p = htmlsax.parser(h)
try:
p.feed_soup(htmlsax.make_soup(content))
htmlsax.process_soup(h, htmlsax.make_soup(content))
except linkparse.StopParse:
pass
self.assertEqual(self.count_url, 1)
@ -49,9 +48,8 @@ class TestLinkparser (unittest.TestCase):
def callback (url, line, column, name, base):
self.assertTrue(False, 'URL %r found' % url)
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
p = htmlsax.parser(h)
try:
p.feed_soup(htmlsax.make_soup(content))
htmlsax.process_soup(h, htmlsax.make_soup(content))
except linkparse.StopParse:
pass

View file

@ -143,8 +143,7 @@ class TestParser (unittest.TestCase):
# Parse all test patterns in one go.
out = StringIO()
handler = HtmlPrettyPrinter(out)
parser = htmlsax.parser(handler)
parser.feed_soup(htmlsax.make_soup(_in))
htmlsax.process_soup(handler, htmlsax.make_soup(_in))
self.check_results(_in, _out, out)
def check_results (self, _in, _out, out):
@ -183,7 +182,6 @@ class TestParser (unittest.TestCase):
def encoding_test (self, html, expected):
out = StringIO()
handler = HtmlPrettyPrinter(out)
parser = htmlsax.parser(handler)
soup = htmlsax.make_soup(html)
parser.feed_soup(soup)
htmlsax.process_soup(handler, soup)
self.assertEqual(soup.original_encoding, expected)