diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py index 7975b6e7..df35d722 100644 --- a/linkcheck/HtmlParser/htmlsax.py +++ b/linkcheck/HtmlParser/htmlsax.py @@ -17,7 +17,6 @@ HTML parser implemented using Beautiful Soup and html.parser. """ -from io import BytesIO, StringIO from warnings import filterwarnings filterwarnings("ignore", @@ -27,64 +26,39 @@ filterwarnings("ignore", from bs4 import BeautifulSoup, Tag +def make_soup(markup, from_encoding=None): + return BeautifulSoup(markup, "html.parser", from_encoding=from_encoding, + multi_valued_attributes=None) + class Parser(object): handler = None - encoding = None def __init__(self, handler): self.handler = handler - self.reset() - - def feed(self, feed_text): - if not self.html_doc: - if isinstance(feed_text, bytes): - self.html_doc = BytesIO() - else: - self.html_doc = StringIO() - self.html_doc.write(feed_text) def feed_soup(self, soup): - self.soup = soup - - def reset(self): - self.soup = None - self.html_doc = None - self.tag_lineno = None - self.tag_column = None + self.parse_contents(soup.contents) def parse_contents(self, contents): for content in contents: if isinstance(content, Tag): - self.tag_lineno = content.sourceline - self.tag_column = None if content.sourcepos is None \ + tag_column = None if content.sourcepos is None \ else content.sourcepos + 1 if content.is_empty_element: self.handler.start_end_element( content.name, content.attrs, content.text.strip(), + content.sourceline, tag_column ) else: self.handler.start_element( content.name, content.attrs, content.text.strip(), + content.sourceline, tag_column ) if hasattr(content, 'contents'): # recursion self.parse_contents(content.contents) if hasattr(self.handler, 'end_element'): self.handler.end_element(content.name) - def flush(self): - if self.soup is None: - self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser', - multi_valued_attributes=None) - if hasattr(self.soup, 'contents'): - self.parse_contents(self.soup.contents) - self.encoding = self.soup.original_encoding - - def lineno(self): - return self.tag_lineno - - def column(self): - return self.tag_column - def parser(handler=None): return Parser(handler) diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index 9e6459ef..46dab657 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -17,7 +17,6 @@ """ Handle http links. """ -from bs4 import BeautifulSoup import requests # The validity of SSL certs is ignored to be able # the check the URL and recurse into it. @@ -83,17 +82,12 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): # construct parser object handler = linkparse.MetaRobotsFinder() parser = htmlsax.parser(handler) - handler.parser = parser # parse try: parser.feed_soup(self.get_soup()) - parser.flush() except linkparse.StopParse as msg: log.debug(LOG_CHECK, "Stopped parsing: %s", msg) pass - # break cyclic dependencies - handler.parser = None - parser.handler = None return handler.follow def add_size_info (self): @@ -309,9 +303,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): def get_content(self): if self.text is None: self.get_raw_content() - self.soup = BeautifulSoup(self.data, "html.parser", - multi_valued_attributes=None, - from_encoding=self.encoding) + self.soup = htmlsax.make_soup(self.data, self.encoding) self.text = self.data.decode(self.soup.original_encoding) return self.text diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index ca924ad3..bb7debef 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -41,17 +41,11 @@ import select from io import BytesIO from builtins import str as str_text from future.utils import python_2_unicode_compatible -from warnings import filterwarnings - -filterwarnings("ignore", - message="The soupsieve package is not installed. CSS selectors cannot be used.", - category=UserWarning, module="bs4") - -from bs4 import BeautifulSoup from . import absolute_url, get_url_from from .. import (log, LOG_CHECK, strformat, LinkCheckerError, url as urlutil, trace, get_link_pat) +from ..HtmlParser import htmlsax from ..network import iputil from .const import (WARN_URL_EFFECTIVE_URL, WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP, @@ -657,8 +651,7 @@ class UrlBase (object): def get_content (self): if self.text is None: self.get_raw_content() - self.soup = BeautifulSoup(self.data, "html.parser", - multi_valued_attributes=None) + self.soup = htmlsax.make_soup(self.data) self.text = self.data.decode(self.soup.original_encoding) self.encoding = self.soup.original_encoding return self.text diff --git a/linkcheck/htmlutil/formsearch.py b/linkcheck/htmlutil/formsearch.py index 9419a6c4..eca99ed6 100644 --- a/linkcheck/htmlutil/formsearch.py +++ b/linkcheck/htmlutil/formsearch.py @@ -44,13 +44,10 @@ class FormFinder(object): def __init__(self): """Initialize local variables.""" super(FormFinder, self).__init__() - # parser object will be initialized when it is used as - # a handler object - self.parser = None self.forms = [] self.form = None - def start_element(self, tag, attrs, element_text=None): + def start_element(self, tag, attrs, element_text, lineno, column): """Does nothing, override in a subclass.""" if tag == u'form': if u'action' in attrs: @@ -69,10 +66,10 @@ class FormFinder(object): log.warn(LOG_CHECK, "formless input %s" % attrs) pass - def start_end_element(self, tag, attrs, element_text=None): + def start_end_element(self, tag, attrs, element_text, lineno, column): """Delegate a combined start/end element (eg. ) to the start_element method. Ignore the end element part.""" - self.start_element(tag, attrs, element_text) + self.start_element(tag, attrs, element_text, lineno, column) def end_element(self, tag): """search for ending form values.""" @@ -87,13 +84,8 @@ def search_form(content, cgiuser, cgipassword): """ handler = FormFinder() parser = htmlsax.parser(handler) - handler.parser = parser # parse - parser.feed(content) - parser.flush() - # break cyclic dependencies - handler.parser = None - parser.handler = None + parser.feed_soup(htmlsax.make_soup(content)) log.debug(LOG_CHECK, "Found forms %s", handler.forms) cginames = (cgiuser.lower(), cgipassword.lower()) for form in handler.forms: diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py index e5295817..b2ed61e6 100644 --- a/linkcheck/htmlutil/linkparse.py +++ b/linkcheck/htmlutil/linkparse.py @@ -104,18 +104,15 @@ class TagFinder (object): def __init__ (self): """Initialize local variables.""" super(TagFinder, self).__init__() - # parser object will be initialized when it is used as - # a handler object - self.parser = None - def start_element (self, tag, attrs): + def start_element (self, tag, attrs, element_text, lineno, column): """Does nothing, override in a subclass.""" pass - def start_end_element (self, tag, attrs, element_text=None): + def start_end_element (self, tag, attrs, element_text, lineno, column): """Delegate a combined start/end element (eg.
) to the start_element method. Ignore the end element part.""" - self.start_element(tag, attrs, element_text) + self.start_element(tag, attrs, element_text, lineno, column) class MetaRobotsFinder (TagFinder): @@ -127,7 +124,7 @@ class MetaRobotsFinder (TagFinder): log.debug(LOG_CHECK, "meta robots finder") self.follow = self.index = True - def start_element (self, tag, attrs, element_text=None): + def start_element (self, tag, attrs, element_text, lineno, column): """Search for meta robots.txt "nofollow" and "noindex" flags.""" if tag == 'meta' and attrs.get('name') == 'robots': val = attrs.get('content', u'').lower().split(u',') @@ -177,10 +174,10 @@ class LinkFinder (TagFinder): self.tags[tag].update(self.universal_attrs) self.base_ref = u'' - def start_element (self, tag, attrs, element_text=None): + def start_element (self, tag, attrs, element_text, lineno, column): """Search for links and store found URLs in a list.""" log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs) - log.debug(LOG_CHECK, "line %d col %d", self.parser.lineno(), self.parser.column()) + log.debug(LOG_CHECK, "line %d col %d", lineno, column) if tag == "base" and not self.base_ref: self.base_ref = attrs.get("href", u'') tagattrs = self.tags.get(tag, self.universal_attrs) @@ -205,7 +202,7 @@ class LinkFinder (TagFinder): value = value.split(':', 1)[1] value = 'dns:' + value.rstrip('/') # parse tag for URLs - self.parse_tag(tag, attr, value, name, base) + self.parse_tag(tag, attr, value, name, base, lineno, column) log.debug(LOG_CHECK, "LinkFinder finished tag %s", tag) def get_link_name (self, tag, attrs, attr, name=None): @@ -221,7 +218,7 @@ class LinkFinder (TagFinder): name = u"" return name - def parse_tag (self, tag, attr, value, name, base): + def parse_tag (self, tag, attr, value, name, base, lineno, column): """Add given url data to url list.""" assert isinstance(tag, str_text), repr(tag) assert isinstance(attr, str_text), repr(attr) @@ -232,25 +229,24 @@ class LinkFinder (TagFinder): if tag == u'meta' and value: mo = refresh_re.match(value) if mo: - self.found_url(mo.group("url"), name, base) + self.found_url(mo.group("url"), name, base, lineno, column) elif attr != 'content': - self.found_url(value, name, base) + self.found_url(value, name, base, lineno, column) elif attr == u'style' and value: for mo in css_url_re.finditer(value): url = unquote(mo.group("url"), matching=True) - self.found_url(url, name, base) + self.found_url(url, name, base, lineno, column) elif attr == u'archive': for url in value.split(u','): - self.found_url(url, name, base) + self.found_url(url, name, base, lineno, column) elif attr == u'srcset': for img_candidate in value.split(u','): url = img_candidate.split()[0] - self.found_url(url, name, base) + self.found_url(url, name, base, lineno, column) else: - self.found_url(value, name, base) + self.found_url(value, name, base, lineno, column) - def found_url(self, url, name, base): + def found_url(self, url, name, base, lineno, column): """Add newly found URL to queue.""" assert isinstance(url, str_text) or url is None, repr(url) - self.callback(url, line=self.parser.lineno(), - column=self.parser.column(), name=name, base=base) + self.callback(url, line=lineno, column=column, name=name, base=base) diff --git a/linkcheck/parser/__init__.py b/linkcheck/parser/__init__.py index dc3494fb..b35892a8 100644 --- a/linkcheck/parser/__init__.py +++ b/linkcheck/parser/__init__.py @@ -120,18 +120,13 @@ def find_links (url_data, callback, tags): # construct parser object handler = linkparse.LinkFinder(callback, tags) parser = htmlsax.parser(handler) - handler.parser = parser # parse try: soup = url_data.get_soup() parser.feed_soup(soup) - parser.flush() except linkparse.StopParse as msg: log.debug(LOG_CHECK, "Stopped parsing: %s", msg) pass - # break cyclic dependencies - handler.parser = None - parser.handler = None def parse_firefox (url_data): diff --git a/tests/htmllib.py b/tests/htmllib.py index 6f1c5b19..ab16ac46 100644 --- a/tests/htmllib.py +++ b/tests/htmllib.py @@ -15,50 +15,12 @@ # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. """ -Default HTML parser handler classes. +HTML parser handler test class. """ import sys -class HtmlPrinter: - """ - Handles all functions by printing the function name and attributes. - """ - - def __init__ (self, fd=sys.stdout): - """ - Write to given file descriptor. - - @param fd: file like object (default=sys.stdout) - @type fd: file - """ - self.fd = fd - - def _print (self, *attrs): - """ - Print function attributes to stored file descriptor. - - @param attrs: list of values to print - @type attrs: tuple - @return: None - """ - self.fd.write(self.mem) - self.fd.write(str(attrs)) - - def __getattr__ (self, name): - """ - Remember the called method name in self.mem. - - @param name: attribute name - @type name: string - @return: method which just prints out its arguments - @rtype: a bound function object - """ - self.mem = name - return self._print - - class HtmlPrettyPrinter: """ Print out all parsed HTML data in encoded form. @@ -77,7 +39,7 @@ class HtmlPrettyPrinter: self.fd = fd self.encoding = encoding - def start_element (self, tag, attrs, element_text=None): + def start_element (self, tag, attrs, element_text, lineno, column): """ Print HTML start element. @@ -89,7 +51,7 @@ class HtmlPrettyPrinter: """ self._start_element(tag, attrs, ">", element_text) - def start_end_element (self, tag, attrs, element_text=None): + def start_end_element (self, tag, attrs, element_text, lineno, column): """ Print HTML start-end element. @@ -101,7 +63,7 @@ class HtmlPrettyPrinter: """ self._start_element(tag, attrs, "/>", element_text) - def _start_element (self, tag, attrs, end, element_text=None): + def _start_element (self, tag, attrs, end, element_text): """ Print HTML element with end string. diff --git a/tests/test_linkparser.py b/tests/test_linkparser.py index 0b965af2..e0962f7b 100644 --- a/tests/test_linkparser.py +++ b/tests/test_linkparser.py @@ -20,7 +20,7 @@ Test linkparser routines. import unittest from linkcheck.htmlutil import linkparse -import linkcheck.HtmlParser.htmlsax +from linkcheck.HtmlParser import htmlsax class TestLinkparser (unittest.TestCase): @@ -31,15 +31,11 @@ class TestLinkparser (unittest.TestCase): def _test_one_link (self, content, url): self.count_url = 0 h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags) - p = linkcheck.HtmlParser.htmlsax.parser(h) - h.parser = p + p = htmlsax.parser(h) try: - p.feed(content) - p.flush() + p.feed_soup(htmlsax.make_soup(content)) except linkparse.StopParse: pass - h.parser = None - p.handler = None self.assertEqual(self.count_url, 1) def _test_one_url (self, origurl): @@ -53,15 +49,11 @@ class TestLinkparser (unittest.TestCase): def callback (url, line, column, name, base): self.assertTrue(False, 'URL %r found' % url) h = linkparse.LinkFinder(callback, linkparse.LinkTags) - p = linkcheck.HtmlParser.htmlsax.parser(h) - h.parser = p + p = htmlsax.parser(h) try: - p.feed(content) - p.flush() + p.feed_soup(htmlsax.make_soup(content)) except linkparse.StopParse: pass - h.parser = None - p.handler = None def test_href_parsing (self): # Test parsing. diff --git a/tests/test_parser.py b/tests/test_parser.py index fc831361..7e087082 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -18,14 +18,14 @@ Test html parsing. """ -import linkcheck.HtmlParser.htmlsax +from linkcheck.HtmlParser import htmlsax from io import StringIO import unittest from parameterized import parameterized -from .htmllib import HtmlPrinter, HtmlPrettyPrinter +from .htmllib import HtmlPrettyPrinter # list of tuples # (, ) @@ -137,70 +137,23 @@ class TestParser (unittest.TestCase): Test html parser. """ - def setUp (self): - """ - Initialize two internal html parsers to be used for testing. - """ - self.htmlparser = linkcheck.HtmlParser.htmlsax.parser() - self.htmlparser2 = linkcheck.HtmlParser.htmlsax.parser() - @parameterized.expand(parsetests) def test_parse (self, _in, _out): # Parse all test patterns in one go. out = StringIO() handler = HtmlPrettyPrinter(out) - self.htmlparser.handler = handler - self.htmlparser.feed(_in) - self.check_results(self.htmlparser, _in, _out, out) + parser = htmlsax.parser(handler) + parser.feed_soup(htmlsax.make_soup(_in)) + self.check_results(_in, _out, out) - def check_results (self, htmlparser, _in, _out, out): + def check_results (self, _in, _out, out): """ Check parse results. """ - htmlparser.flush() res = out.getvalue() msg = "Test error; in: %r, out: %r, expect: %r" % \ (_in, res, _out) self.assertEqual(res, _out, msg=msg) - htmlparser.reset() - - @parameterized.expand(parsetests) - def test_feed (self, _in, _out): - # Parse all test patterns sequentially. - out = StringIO() - handler = HtmlPrettyPrinter(out) - self.htmlparser.handler = handler - for c in _in: - self.htmlparser.feed(c) - self.check_results(self.htmlparser, _in, _out, out) - - @parameterized.expand(parsetests) - def test_interwoven (self, _in, _out): - # Parse all test patterns on two parsers interwoven. - out = StringIO() - out2 = StringIO() - handler = HtmlPrettyPrinter(out) - self.htmlparser.handler = handler - handler2 = HtmlPrettyPrinter(out2) - self.htmlparser2.handler = handler2 - for c in _in: - self.htmlparser.feed(c) - self.htmlparser2.feed(c) - self.check_results(self.htmlparser, _in, _out, out) - self.check_results(self.htmlparser2, _in, _out, out2) - - @parameterized.expand(parsetests) - def test_handler (self, _in, _out): - out = StringIO() - out2 = StringIO() - handler = HtmlPrinter(out) - self.htmlparser.handler = handler - handler2 = HtmlPrinter(out2) - self.htmlparser2.handler = handler2 - for c in _in: - self.htmlparser.feed(c) - self.htmlparser2.feed(c) - self.assertEqual(out.getvalue(), out2.getvalue()) def test_encoding_detection_utf_content (self): html = b'' @@ -227,11 +180,9 @@ class TestParser (unittest.TestCase): self.encoding_test(html, "ascii") def encoding_test (self, html, expected): - parser = linkcheck.HtmlParser.htmlsax.parser() - self.assertEqual(parser.encoding, None) out = StringIO() handler = HtmlPrettyPrinter(out) - parser.handler = handler - parser.feed(html) - parser.flush() - self.assertEqual(parser.encoding, expected) + parser = htmlsax.parser(handler) + soup = htmlsax.make_soup(html) + parser.feed_soup(soup) + self.assertEqual(soup.original_encoding, expected)