Merge pull request #357 from cjmayo/parser2

Simplify the Parser class
This commit is contained in:
anarcat 2020-04-09 15:22:14 -04:00 committed by GitHub
commit d80a075372
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 50 additions and 203 deletions

View file

@ -17,7 +17,6 @@
HTML parser implemented using Beautiful Soup and html.parser.
"""
from io import BytesIO, StringIO
from warnings import filterwarnings
filterwarnings("ignore",
@ -27,64 +26,39 @@ filterwarnings("ignore",
from bs4 import BeautifulSoup, Tag
def make_soup(markup, from_encoding=None):
return BeautifulSoup(markup, "html.parser", from_encoding=from_encoding,
multi_valued_attributes=None)
class Parser(object):
handler = None
encoding = None
def __init__(self, handler):
self.handler = handler
self.reset()
def feed(self, feed_text):
if not self.html_doc:
if isinstance(feed_text, bytes):
self.html_doc = BytesIO()
else:
self.html_doc = StringIO()
self.html_doc.write(feed_text)
def feed_soup(self, soup):
self.soup = soup
def reset(self):
self.soup = None
self.html_doc = None
self.tag_lineno = None
self.tag_column = None
self.parse_contents(soup.contents)
def parse_contents(self, contents):
for content in contents:
if isinstance(content, Tag):
self.tag_lineno = content.sourceline
self.tag_column = None if content.sourcepos is None \
tag_column = None if content.sourcepos is None \
else content.sourcepos + 1
if content.is_empty_element:
self.handler.start_end_element(
content.name, content.attrs, content.text.strip(),
content.sourceline, tag_column
)
else:
self.handler.start_element(
content.name, content.attrs, content.text.strip(),
content.sourceline, tag_column
)
if hasattr(content, 'contents'): # recursion
self.parse_contents(content.contents)
if hasattr(self.handler, 'end_element'):
self.handler.end_element(content.name)
def flush(self):
if self.soup is None:
self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser',
multi_valued_attributes=None)
if hasattr(self.soup, 'contents'):
self.parse_contents(self.soup.contents)
self.encoding = self.soup.original_encoding
def lineno(self):
return self.tag_lineno
def column(self):
return self.tag_column
def parser(handler=None):
return Parser(handler)

View file

@ -17,7 +17,6 @@
"""
Handle http links.
"""
from bs4 import BeautifulSoup
import requests
# The validity of SSL certs is ignored to be able
# to check the URL and recurse into it.
@ -83,17 +82,12 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
# construct parser object
handler = linkparse.MetaRobotsFinder()
parser = htmlsax.parser(handler)
handler.parser = parser
# parse
try:
parser.feed_soup(self.get_soup())
parser.flush()
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
pass
# break cyclic dependencies
handler.parser = None
parser.handler = None
return handler.follow
def add_size_info (self):
@ -309,9 +303,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
def get_content(self):
if self.text is None:
self.get_raw_content()
self.soup = BeautifulSoup(self.data, "html.parser",
multi_valued_attributes=None,
from_encoding=self.encoding)
self.soup = htmlsax.make_soup(self.data, self.encoding)
self.text = self.data.decode(self.soup.original_encoding)
return self.text

View file

@ -41,17 +41,11 @@ import select
from io import BytesIO
from builtins import str as str_text
from future.utils import python_2_unicode_compatible
from warnings import filterwarnings
filterwarnings("ignore",
message="The soupsieve package is not installed. CSS selectors cannot be used.",
category=UserWarning, module="bs4")
from bs4 import BeautifulSoup
from . import absolute_url, get_url_from
from .. import (log, LOG_CHECK,
strformat, LinkCheckerError, url as urlutil, trace, get_link_pat)
from ..HtmlParser import htmlsax
from ..network import iputil
from .const import (WARN_URL_EFFECTIVE_URL,
WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
@ -657,8 +651,7 @@ class UrlBase (object):
def get_content (self):
if self.text is None:
self.get_raw_content()
self.soup = BeautifulSoup(self.data, "html.parser",
multi_valued_attributes=None)
self.soup = htmlsax.make_soup(self.data)
self.text = self.data.decode(self.soup.original_encoding)
self.encoding = self.soup.original_encoding
return self.text

View file

@ -44,13 +44,10 @@ class FormFinder(object):
def __init__(self):
"""Initialize local variables."""
super(FormFinder, self).__init__()
# parser object will be initialized when it is used as
# a handler object
self.parser = None
self.forms = []
self.form = None
def start_element(self, tag, attrs, element_text=None):
def start_element(self, tag, attrs, element_text, lineno, column):
"""Does nothing, override in a subclass."""
if tag == u'form':
if u'action' in attrs:
@ -69,10 +66,10 @@ class FormFinder(object):
log.warn(LOG_CHECK, "formless input %s" % attrs)
pass
def start_end_element(self, tag, attrs, element_text=None):
def start_end_element(self, tag, attrs, element_text, lineno, column):
"""Delegate a combined start/end element (eg. <input .../>) to
the start_element method. Ignore the end element part."""
self.start_element(tag, attrs, element_text)
self.start_element(tag, attrs, element_text, lineno, column)
def end_element(self, tag):
"""search for ending form values."""
@ -87,13 +84,8 @@ def search_form(content, cgiuser, cgipassword):
"""
handler = FormFinder()
parser = htmlsax.parser(handler)
handler.parser = parser
# parse
parser.feed(content)
parser.flush()
# break cyclic dependencies
handler.parser = None
parser.handler = None
parser.feed_soup(htmlsax.make_soup(content))
log.debug(LOG_CHECK, "Found forms %s", handler.forms)
cginames = (cgiuser.lower(), cgipassword.lower())
for form in handler.forms:

View file

@ -104,18 +104,15 @@ class TagFinder (object):
def __init__ (self):
"""Initialize local variables."""
super(TagFinder, self).__init__()
# parser object will be initialized when it is used as
# a handler object
self.parser = None
def start_element (self, tag, attrs):
def start_element (self, tag, attrs, element_text, lineno, column):
"""Does nothing, override in a subclass."""
pass
def start_end_element (self, tag, attrs, element_text=None):
def start_end_element (self, tag, attrs, element_text, lineno, column):
"""Delegate a combined start/end element (eg. <br/>) to
the start_element method. Ignore the end element part."""
self.start_element(tag, attrs, element_text)
self.start_element(tag, attrs, element_text, lineno, column)
class MetaRobotsFinder (TagFinder):
@ -127,7 +124,7 @@ class MetaRobotsFinder (TagFinder):
log.debug(LOG_CHECK, "meta robots finder")
self.follow = self.index = True
def start_element (self, tag, attrs, element_text=None):
def start_element (self, tag, attrs, element_text, lineno, column):
"""Search for meta robots.txt "nofollow" and "noindex" flags."""
if tag == 'meta' and attrs.get('name') == 'robots':
val = attrs.get('content', u'').lower().split(u',')
@ -177,10 +174,10 @@ class LinkFinder (TagFinder):
self.tags[tag].update(self.universal_attrs)
self.base_ref = u''
def start_element (self, tag, attrs, element_text=None):
def start_element (self, tag, attrs, element_text, lineno, column):
"""Search for links and store found URLs in a list."""
log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
log.debug(LOG_CHECK, "line %d col %d", self.parser.lineno(), self.parser.column())
log.debug(LOG_CHECK, "line %d col %d", lineno, column)
if tag == "base" and not self.base_ref:
self.base_ref = attrs.get("href", u'')
tagattrs = self.tags.get(tag, self.universal_attrs)
@ -205,7 +202,7 @@ class LinkFinder (TagFinder):
value = value.split(':', 1)[1]
value = 'dns:' + value.rstrip('/')
# parse tag for URLs
self.parse_tag(tag, attr, value, name, base)
self.parse_tag(tag, attr, value, name, base, lineno, column)
log.debug(LOG_CHECK, "LinkFinder finished tag %s", tag)
def get_link_name (self, tag, attrs, attr, name=None):
@ -221,7 +218,7 @@ class LinkFinder (TagFinder):
name = u""
return name
def parse_tag (self, tag, attr, value, name, base):
def parse_tag (self, tag, attr, value, name, base, lineno, column):
"""Add given url data to url list."""
assert isinstance(tag, str_text), repr(tag)
assert isinstance(attr, str_text), repr(attr)
@ -232,25 +229,24 @@ class LinkFinder (TagFinder):
if tag == u'meta' and value:
mo = refresh_re.match(value)
if mo:
self.found_url(mo.group("url"), name, base)
self.found_url(mo.group("url"), name, base, lineno, column)
elif attr != 'content':
self.found_url(value, name, base)
self.found_url(value, name, base, lineno, column)
elif attr == u'style' and value:
for mo in css_url_re.finditer(value):
url = unquote(mo.group("url"), matching=True)
self.found_url(url, name, base)
self.found_url(url, name, base, lineno, column)
elif attr == u'archive':
for url in value.split(u','):
self.found_url(url, name, base)
self.found_url(url, name, base, lineno, column)
elif attr == u'srcset':
for img_candidate in value.split(u','):
url = img_candidate.split()[0]
self.found_url(url, name, base)
self.found_url(url, name, base, lineno, column)
else:
self.found_url(value, name, base)
self.found_url(value, name, base, lineno, column)
def found_url(self, url, name, base):
def found_url(self, url, name, base, lineno, column):
"""Add newly found URL to queue."""
assert isinstance(url, str_text) or url is None, repr(url)
self.callback(url, line=self.parser.lineno(),
column=self.parser.column(), name=name, base=base)
self.callback(url, line=lineno, column=column, name=name, base=base)

View file

@ -120,18 +120,13 @@ def find_links (url_data, callback, tags):
# construct parser object
handler = linkparse.LinkFinder(callback, tags)
parser = htmlsax.parser(handler)
handler.parser = parser
# parse
try:
soup = url_data.get_soup()
parser.feed_soup(soup)
parser.flush()
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
pass
# break cyclic dependencies
handler.parser = None
parser.handler = None
def parse_firefox (url_data):

View file

@ -15,50 +15,12 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Default HTML parser handler classes.
HTML parser handler test class.
"""
import sys
class HtmlPrinter:
"""
Handles all functions by printing the function name and attributes.
"""
def __init__ (self, fd=sys.stdout):
"""
Write to given file descriptor.
@param fd: file like object (default=sys.stdout)
@type fd: file
"""
self.fd = fd
def _print (self, *attrs):
"""
Print function attributes to stored file descriptor.
@param attrs: list of values to print
@type attrs: tuple
@return: None
"""
self.fd.write(self.mem)
self.fd.write(str(attrs))
def __getattr__ (self, name):
"""
Remember the called method name in self.mem.
@param name: attribute name
@type name: string
@return: method which just prints out its arguments
@rtype: a bound function object
"""
self.mem = name
return self._print
class HtmlPrettyPrinter:
"""
Print out all parsed HTML data in encoded form.
@ -77,7 +39,7 @@ class HtmlPrettyPrinter:
self.fd = fd
self.encoding = encoding
def start_element (self, tag, attrs, element_text=None):
def start_element (self, tag, attrs, element_text, lineno, column):
"""
Print HTML start element.
@ -89,7 +51,7 @@ class HtmlPrettyPrinter:
"""
self._start_element(tag, attrs, ">", element_text)
def start_end_element (self, tag, attrs, element_text=None):
def start_end_element (self, tag, attrs, element_text, lineno, column):
"""
Print HTML start-end element.
@ -101,7 +63,7 @@ class HtmlPrettyPrinter:
"""
self._start_element(tag, attrs, "/>", element_text)
def _start_element (self, tag, attrs, end, element_text=None):
def _start_element (self, tag, attrs, end, element_text):
"""
Print HTML element with end string.

View file

@ -20,7 +20,7 @@ Test linkparser routines.
import unittest
from linkcheck.htmlutil import linkparse
import linkcheck.HtmlParser.htmlsax
from linkcheck.HtmlParser import htmlsax
class TestLinkparser (unittest.TestCase):
@ -31,15 +31,11 @@ class TestLinkparser (unittest.TestCase):
def _test_one_link (self, content, url):
self.count_url = 0
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
p = linkcheck.HtmlParser.htmlsax.parser(h)
h.parser = p
p = htmlsax.parser(h)
try:
p.feed(content)
p.flush()
p.feed_soup(htmlsax.make_soup(content))
except linkparse.StopParse:
pass
h.parser = None
p.handler = None
self.assertEqual(self.count_url, 1)
def _test_one_url (self, origurl):
@ -53,15 +49,11 @@ class TestLinkparser (unittest.TestCase):
def callback (url, line, column, name, base):
self.assertTrue(False, 'URL %r found' % url)
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
p = linkcheck.HtmlParser.htmlsax.parser(h)
h.parser = p
p = htmlsax.parser(h)
try:
p.feed(content)
p.flush()
p.feed_soup(htmlsax.make_soup(content))
except linkparse.StopParse:
pass
h.parser = None
p.handler = None
def test_href_parsing (self):
# Test <a href> parsing.

View file

@ -18,14 +18,14 @@
Test html parsing.
"""
import linkcheck.HtmlParser.htmlsax
from linkcheck.HtmlParser import htmlsax
from io import StringIO
import unittest
from parameterized import parameterized
from .htmllib import HtmlPrinter, HtmlPrettyPrinter
from .htmllib import HtmlPrettyPrinter
# list of tuples
# (<test pattern>, <expected parse output>)
@ -137,70 +137,23 @@ class TestParser (unittest.TestCase):
Test html parser.
"""
def setUp (self):
"""
Initialize two internal html parsers to be used for testing.
"""
self.htmlparser = linkcheck.HtmlParser.htmlsax.parser()
self.htmlparser2 = linkcheck.HtmlParser.htmlsax.parser()
@parameterized.expand(parsetests)
def test_parse (self, _in, _out):
# Parse all test patterns in one go.
out = StringIO()
handler = HtmlPrettyPrinter(out)
self.htmlparser.handler = handler
self.htmlparser.feed(_in)
self.check_results(self.htmlparser, _in, _out, out)
parser = htmlsax.parser(handler)
parser.feed_soup(htmlsax.make_soup(_in))
self.check_results(_in, _out, out)
def check_results (self, htmlparser, _in, _out, out):
def check_results (self, _in, _out, out):
"""
Check parse results.
"""
htmlparser.flush()
res = out.getvalue()
msg = "Test error; in: %r, out: %r, expect: %r" % \
(_in, res, _out)
self.assertEqual(res, _out, msg=msg)
htmlparser.reset()
@parameterized.expand(parsetests)
def test_feed (self, _in, _out):
# Parse all test patterns sequentially.
out = StringIO()
handler = HtmlPrettyPrinter(out)
self.htmlparser.handler = handler
for c in _in:
self.htmlparser.feed(c)
self.check_results(self.htmlparser, _in, _out, out)
@parameterized.expand(parsetests)
def test_interwoven (self, _in, _out):
# Parse all test patterns on two parsers interwoven.
out = StringIO()
out2 = StringIO()
handler = HtmlPrettyPrinter(out)
self.htmlparser.handler = handler
handler2 = HtmlPrettyPrinter(out2)
self.htmlparser2.handler = handler2
for c in _in:
self.htmlparser.feed(c)
self.htmlparser2.feed(c)
self.check_results(self.htmlparser, _in, _out, out)
self.check_results(self.htmlparser2, _in, _out, out2)
@parameterized.expand(parsetests)
def test_handler (self, _in, _out):
out = StringIO()
out2 = StringIO()
handler = HtmlPrinter(out)
self.htmlparser.handler = handler
handler2 = HtmlPrinter(out2)
self.htmlparser2.handler = handler2
for c in _in:
self.htmlparser.feed(c)
self.htmlparser2.feed(c)
self.assertEqual(out.getvalue(), out2.getvalue())
def test_encoding_detection_utf_content (self):
html = b'<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
@ -227,11 +180,9 @@ class TestParser (unittest.TestCase):
self.encoding_test(html, "ascii")
def encoding_test (self, html, expected):
parser = linkcheck.HtmlParser.htmlsax.parser()
self.assertEqual(parser.encoding, None)
out = StringIO()
handler = HtmlPrettyPrinter(out)
parser.handler = handler
parser.feed(html)
parser.flush()
self.assertEqual(parser.encoding, expected)
parser = htmlsax.parser(handler)
soup = htmlsax.make_soup(html)
parser.feed_soup(soup)
self.assertEqual(soup.original_encoding, expected)