mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-25 08:34:43 +00:00
Replace Parser class using BeautifulSoup.find_all()
This commit is contained in:
parent
eb3cf28baa
commit
0795e3c1b4
6 changed files with 33 additions and 71 deletions
|
|
@ -19,44 +19,31 @@ HTML parser module.
|
|||
|
||||
USAGE
|
||||
|
||||
First make a HTML SAX handler object. Missing callback functions are
|
||||
ignored. The object returned from callbacks is also ignored.
|
||||
Note that a missing attribute value is stored as the value None
|
||||
in the ListDict (i.e. "<a href>" will lead to a {href: None} dict entry).
|
||||
|
||||
Used callbacks of a handler are:
|
||||
Two functions are provided, one to make a BeautifulSoup object from markup and
|
||||
another to call a handler's callback for each element in a BeautifulSoup
|
||||
object it can process.
|
||||
|
||||
The used callback of a handler is:
|
||||
|
||||
- Start tag: <tag {attr1:value1, attr2:value2, ..}>
|
||||
def start_element (tag, attrs)
|
||||
def start_element (tag, attrs, text, line, column)
|
||||
@param tag: tag name
|
||||
@type tag: Unicode string
|
||||
@type tag: string
|
||||
@param attrs: tag attributes
|
||||
@type attrs: ListDict
|
||||
|
||||
Additionally, there are error and warning callbacks:
|
||||
|
||||
- Parser warning.
|
||||
def warning (msg)
|
||||
@param msg: warning message
|
||||
@type msg: Unicode string
|
||||
|
||||
- Parser error.
|
||||
def error (msg)
|
||||
@param msg: error message
|
||||
@type msg: Unicode string
|
||||
|
||||
- Fatal parser error
|
||||
def fatal_error (msg)
|
||||
@param msg: error message
|
||||
@type msg: Unicode string
|
||||
@type attrs: dict
|
||||
@param text: element text
|
||||
@type text: string
|
||||
@param line: tag line number
|
||||
@type line: integer
|
||||
@param column: tag column number
|
||||
@type column: integer
|
||||
|
||||
EXAMPLE
|
||||
|
||||
# Create a new HTML parser object with the handler as parameter.
|
||||
parser = HtmlParser.htmlsax.parser(handler)
|
||||
# Feed data.
|
||||
parser.feed("<html><body>Blubb</body></html>")
|
||||
# Flush for finishing things up.
|
||||
parser.flush()
|
||||
# Create a new BeautifulSoup object.
|
||||
soup = HtmlParser.htmlsax.make_soup("<html><body>Blubb</body></html>")
|
||||
# Process the soup with the chosen handler as a parameter.
|
||||
HtmlParser.htmlsax.process_soup(handler, soup)
|
||||
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -23,34 +23,16 @@ filterwarnings("ignore",
|
|||
message="The soupsieve package is not installed. CSS selectors cannot be used.",
|
||||
category=UserWarning, module="bs4")
|
||||
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def make_soup(markup, from_encoding=None):
|
||||
return BeautifulSoup(markup, "html.parser", from_encoding=from_encoding,
|
||||
multi_valued_attributes=None)
|
||||
|
||||
class Parser(object):
|
||||
handler = None
|
||||
|
||||
def __init__(self, handler):
|
||||
self.handler = handler
|
||||
|
||||
def feed_soup(self, soup):
|
||||
self.parse_contents(soup.contents)
|
||||
|
||||
def parse_contents(self, contents):
|
||||
for content in contents:
|
||||
if isinstance(content, Tag):
|
||||
self.handler.start_element(
|
||||
content.name, content.attrs, content.text.strip(),
|
||||
content.sourceline,
|
||||
None if content.sourcepos is None
|
||||
else content.sourcepos + 1
|
||||
)
|
||||
if hasattr(content, 'contents'): # recursion
|
||||
self.parse_contents(content.contents)
|
||||
|
||||
|
||||
def parser(handler=None):
|
||||
return Parser(handler)
|
||||
def process_soup(handler, soup):
|
||||
for element in soup.find_all(True):
|
||||
handler.start_element(
|
||||
element.name, element.attrs, element.text.strip(),
|
||||
element.sourceline,
|
||||
None if element.sourcepos is None else element.sourcepos + 1)
|
||||
|
|
|
|||
|
|
@ -79,12 +79,11 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
"""
|
||||
if not self.is_html():
|
||||
return True
|
||||
# construct parser object
|
||||
# construct handler object
|
||||
handler = linkparse.MetaRobotsFinder()
|
||||
parser = htmlsax.parser(handler)
|
||||
# parse
|
||||
try:
|
||||
parser.feed_soup(self.get_soup())
|
||||
htmlsax.process_soup(handler, self.get_soup())
|
||||
except linkparse.StopParse as msg:
|
||||
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -117,13 +117,11 @@ def find_links (url_data, callback, tags):
|
|||
"""Parse into content and search for URLs to check.
|
||||
Found URLs are added to the URL queue.
|
||||
"""
|
||||
# construct parser object
|
||||
# construct handler object
|
||||
handler = linkparse.LinkFinder(callback, tags)
|
||||
parser = htmlsax.parser(handler)
|
||||
# parse
|
||||
try:
|
||||
soup = url_data.get_soup()
|
||||
parser.feed_soup(soup)
|
||||
htmlsax.process_soup(handler, url_data.get_soup())
|
||||
except linkparse.StopParse as msg:
|
||||
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -31,9 +31,8 @@ class TestLinkparser (unittest.TestCase):
|
|||
def _test_one_link (self, content, url):
|
||||
self.count_url = 0
|
||||
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
|
||||
p = htmlsax.parser(h)
|
||||
try:
|
||||
p.feed_soup(htmlsax.make_soup(content))
|
||||
htmlsax.process_soup(h, htmlsax.make_soup(content))
|
||||
except linkparse.StopParse:
|
||||
pass
|
||||
self.assertEqual(self.count_url, 1)
|
||||
|
|
@ -49,9 +48,8 @@ class TestLinkparser (unittest.TestCase):
|
|||
def callback (url, line, column, name, base):
|
||||
self.assertTrue(False, 'URL %r found' % url)
|
||||
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
|
||||
p = htmlsax.parser(h)
|
||||
try:
|
||||
p.feed_soup(htmlsax.make_soup(content))
|
||||
htmlsax.process_soup(h, htmlsax.make_soup(content))
|
||||
except linkparse.StopParse:
|
||||
pass
|
||||
|
||||
|
|
|
|||
|
|
@ -143,8 +143,7 @@ class TestParser (unittest.TestCase):
|
|||
# Parse all test patterns in one go.
|
||||
out = StringIO()
|
||||
handler = HtmlPrettyPrinter(out)
|
||||
parser = htmlsax.parser(handler)
|
||||
parser.feed_soup(htmlsax.make_soup(_in))
|
||||
htmlsax.process_soup(handler, htmlsax.make_soup(_in))
|
||||
self.check_results(_in, _out, out)
|
||||
|
||||
def check_results (self, _in, _out, out):
|
||||
|
|
@ -183,7 +182,6 @@ class TestParser (unittest.TestCase):
|
|||
def encoding_test (self, html, expected):
|
||||
out = StringIO()
|
||||
handler = HtmlPrettyPrinter(out)
|
||||
parser = htmlsax.parser(handler)
|
||||
soup = htmlsax.make_soup(html)
|
||||
parser.feed_soup(soup)
|
||||
htmlsax.process_soup(handler, soup)
|
||||
self.assertEqual(soup.original_encoding, expected)
|
||||
|
|
|
|||
Loading…
Reference in a new issue