mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-24 01:40:23 +00:00
Use parser.feed_soup() instead of parser.feed()
Markup is not being passed in pieces to the parser, so simplify the interface and reduce the state further.
This commit is contained in:
parent
40f43ae41c
commit
3771dd9136
5 changed files with 15 additions and 111 deletions
|
|
@ -17,7 +17,6 @@
|
|||
HTML parser implemented using Beautiful Soup and html.parser.
|
||||
"""
|
||||
|
||||
from io import BytesIO, StringIO
|
||||
from warnings import filterwarnings
|
||||
|
||||
filterwarnings("ignore",
|
||||
|
|
@ -39,20 +38,11 @@ class Parser(object):
|
|||
self.handler = handler
|
||||
self.reset()
|
||||
|
||||
def feed(self, feed_text):
|
||||
if not self.html_doc:
|
||||
if isinstance(feed_text, bytes):
|
||||
self.html_doc = BytesIO()
|
||||
else:
|
||||
self.html_doc = StringIO()
|
||||
self.html_doc.write(feed_text)
|
||||
|
||||
def feed_soup(self, soup):
|
||||
self.soup = soup
|
||||
|
||||
def reset(self):
|
||||
self.soup = None
|
||||
self.html_doc = None
|
||||
|
||||
def parse_contents(self, contents):
|
||||
for content in contents:
|
||||
|
|
@ -75,9 +65,6 @@ class Parser(object):
|
|||
self.handler.end_element(content.name)
|
||||
|
||||
def flush(self):
|
||||
if self.soup is None:
|
||||
self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser',
|
||||
multi_valued_attributes=None)
|
||||
if hasattr(self.soup, 'contents'):
|
||||
self.parse_contents(self.soup.contents)
|
||||
self.encoding = self.soup.original_encoding
|
||||
|
|
|
|||
|
|
@ -85,7 +85,7 @@ def search_form(content, cgiuser, cgipassword):
|
|||
handler = FormFinder()
|
||||
parser = htmlsax.parser(handler)
|
||||
# parse
|
||||
parser.feed(content)
|
||||
parser.feed_soup(htmlsax.make_soup(content))
|
||||
parser.flush()
|
||||
log.debug(LOG_CHECK, "Found forms %s", handler.forms)
|
||||
cginames = (cgiuser.lower(), cgipassword.lower())
|
||||
|
|
|
|||
|
|
@ -15,50 +15,12 @@
|
|||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
Default HTML parser handler classes.
|
||||
HTML parser handler test class.
|
||||
"""
|
||||
|
||||
import sys
|
||||
|
||||
|
||||
class HtmlPrinter:
|
||||
"""
|
||||
Handles all functions by printing the function name and attributes.
|
||||
"""
|
||||
|
||||
def __init__ (self, fd=sys.stdout):
|
||||
"""
|
||||
Write to given file descriptor.
|
||||
|
||||
@param fd: file like object (default=sys.stdout)
|
||||
@type fd: file
|
||||
"""
|
||||
self.fd = fd
|
||||
|
||||
def _print (self, *attrs):
|
||||
"""
|
||||
Print function attributes to stored file descriptor.
|
||||
|
||||
@param attrs: list of values to print
|
||||
@type attrs: tuple
|
||||
@return: None
|
||||
"""
|
||||
self.fd.write(self.mem)
|
||||
self.fd.write(str(attrs))
|
||||
|
||||
def __getattr__ (self, name):
|
||||
"""
|
||||
Remember the called method name in self.mem.
|
||||
|
||||
@param name: attribute name
|
||||
@type name: string
|
||||
@return: method which just prints out its arguments
|
||||
@rtype: a bound function object
|
||||
"""
|
||||
self.mem = name
|
||||
return self._print
|
||||
|
||||
|
||||
class HtmlPrettyPrinter:
|
||||
"""
|
||||
Print out all parsed HTML data in encoded form.
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ Test linkparser routines.
|
|||
|
||||
import unittest
|
||||
from linkcheck.htmlutil import linkparse
|
||||
import linkcheck.HtmlParser.htmlsax
|
||||
from linkcheck.HtmlParser import htmlsax
|
||||
|
||||
|
||||
class TestLinkparser (unittest.TestCase):
|
||||
|
|
@ -31,9 +31,9 @@ class TestLinkparser (unittest.TestCase):
|
|||
def _test_one_link (self, content, url):
|
||||
self.count_url = 0
|
||||
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
|
||||
p = linkcheck.HtmlParser.htmlsax.parser(h)
|
||||
p = htmlsax.parser(h)
|
||||
try:
|
||||
p.feed(content)
|
||||
p.feed_soup(htmlsax.make_soup(content))
|
||||
p.flush()
|
||||
except linkparse.StopParse:
|
||||
pass
|
||||
|
|
@ -50,9 +50,9 @@ class TestLinkparser (unittest.TestCase):
|
|||
def callback (url, line, column, name, base):
|
||||
self.assertTrue(False, 'URL %r found' % url)
|
||||
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
|
||||
p = linkcheck.HtmlParser.htmlsax.parser(h)
|
||||
p = htmlsax.parser(h)
|
||||
try:
|
||||
p.feed(content)
|
||||
p.feed_soup(htmlsax.make_soup(content))
|
||||
p.flush()
|
||||
except linkparse.StopParse:
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -18,14 +18,14 @@
|
|||
Test html parsing.
|
||||
"""
|
||||
|
||||
import linkcheck.HtmlParser.htmlsax
|
||||
from linkcheck.HtmlParser import htmlsax
|
||||
|
||||
from io import StringIO
|
||||
import unittest
|
||||
|
||||
from parameterized import parameterized
|
||||
|
||||
from .htmllib import HtmlPrinter, HtmlPrettyPrinter
|
||||
from .htmllib import HtmlPrettyPrinter
|
||||
|
||||
# list of tuples
|
||||
# (<test pattern>, <expected parse output>)
|
||||
|
|
@ -137,21 +137,14 @@ class TestParser (unittest.TestCase):
|
|||
Test html parser.
|
||||
"""
|
||||
|
||||
def setUp (self):
|
||||
"""
|
||||
Initialize two internal html parsers to be used for testing.
|
||||
"""
|
||||
self.htmlparser = linkcheck.HtmlParser.htmlsax.parser()
|
||||
self.htmlparser2 = linkcheck.HtmlParser.htmlsax.parser()
|
||||
|
||||
@parameterized.expand(parsetests)
|
||||
def test_parse (self, _in, _out):
|
||||
# Parse all test patterns in one go.
|
||||
out = StringIO()
|
||||
handler = HtmlPrettyPrinter(out)
|
||||
self.htmlparser.handler = handler
|
||||
self.htmlparser.feed(_in)
|
||||
self.check_results(self.htmlparser, _in, _out, out)
|
||||
parser = htmlsax.parser(handler)
|
||||
parser.feed_soup(htmlsax.make_soup(_in))
|
||||
self.check_results(parser, _in, _out, out)
|
||||
|
||||
def check_results (self, htmlparser, _in, _out, out):
|
||||
"""
|
||||
|
|
@ -164,44 +157,6 @@ class TestParser (unittest.TestCase):
|
|||
self.assertEqual(res, _out, msg=msg)
|
||||
htmlparser.reset()
|
||||
|
||||
@parameterized.expand(parsetests)
|
||||
def test_feed (self, _in, _out):
|
||||
# Parse all test patterns sequentially.
|
||||
out = StringIO()
|
||||
handler = HtmlPrettyPrinter(out)
|
||||
self.htmlparser.handler = handler
|
||||
for c in _in:
|
||||
self.htmlparser.feed(c)
|
||||
self.check_results(self.htmlparser, _in, _out, out)
|
||||
|
||||
@parameterized.expand(parsetests)
|
||||
def test_interwoven (self, _in, _out):
|
||||
# Parse all test patterns on two parsers interwoven.
|
||||
out = StringIO()
|
||||
out2 = StringIO()
|
||||
handler = HtmlPrettyPrinter(out)
|
||||
self.htmlparser.handler = handler
|
||||
handler2 = HtmlPrettyPrinter(out2)
|
||||
self.htmlparser2.handler = handler2
|
||||
for c in _in:
|
||||
self.htmlparser.feed(c)
|
||||
self.htmlparser2.feed(c)
|
||||
self.check_results(self.htmlparser, _in, _out, out)
|
||||
self.check_results(self.htmlparser2, _in, _out, out2)
|
||||
|
||||
@parameterized.expand(parsetests)
|
||||
def test_handler (self, _in, _out):
|
||||
out = StringIO()
|
||||
out2 = StringIO()
|
||||
handler = HtmlPrinter(out)
|
||||
self.htmlparser.handler = handler
|
||||
handler2 = HtmlPrinter(out2)
|
||||
self.htmlparser2.handler = handler2
|
||||
for c in _in:
|
||||
self.htmlparser.feed(c)
|
||||
self.htmlparser2.feed(c)
|
||||
self.assertEqual(out.getvalue(), out2.getvalue())
|
||||
|
||||
def test_encoding_detection_utf_content (self):
|
||||
html = b'<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
|
||||
self.encoding_test(html, "utf-8")
|
||||
|
|
@ -227,11 +182,11 @@ class TestParser (unittest.TestCase):
|
|||
self.encoding_test(html, "ascii")
|
||||
|
||||
def encoding_test (self, html, expected):
|
||||
parser = linkcheck.HtmlParser.htmlsax.parser()
|
||||
parser = htmlsax.parser()
|
||||
self.assertEqual(parser.encoding, None)
|
||||
out = StringIO()
|
||||
handler = HtmlPrettyPrinter(out)
|
||||
parser.handler = handler
|
||||
parser.feed(html)
|
||||
parser = htmlsax.parser(handler)
|
||||
parser.feed_soup(htmlsax.make_soup(html))
|
||||
parser.flush()
|
||||
self.assertEqual(parser.encoding, expected)
|
||||
|
|
|
|||
Loading…
Reference in a new issue