From 9d8d251d06f6251016fc743e8a9a960416c52d76 Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Wed, 8 Apr 2020 20:03:35 +0100 Subject: [PATCH 1/5] Replace Parser lineno() and column() methods Stop storing this data in Parser object state. --- linkcheck/HtmlParser/htmlsax.py | 13 +++--------- linkcheck/checker/httpurl.py | 4 ---- linkcheck/htmlutil/formsearch.py | 13 +++--------- linkcheck/htmlutil/linkparse.py | 36 ++++++++++++++------------------ linkcheck/parser/__init__.py | 4 ---- tests/htmllib.py | 6 +++--- tests/test_linkparser.py | 6 ------ 7 files changed, 25 insertions(+), 57 deletions(-) diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py index 7975b6e7..5a9f7282 100644 --- a/linkcheck/HtmlParser/htmlsax.py +++ b/linkcheck/HtmlParser/htmlsax.py @@ -49,22 +49,21 @@ class Parser(object): def reset(self): self.soup = None self.html_doc = None - self.tag_lineno = None - self.tag_column = None def parse_contents(self, contents): for content in contents: if isinstance(content, Tag): - self.tag_lineno = content.sourceline - self.tag_column = None if content.sourcepos is None \ + tag_column = None if content.sourcepos is None \ else content.sourcepos + 1 if content.is_empty_element: self.handler.start_end_element( content.name, content.attrs, content.text.strip(), + content.sourceline, tag_column ) else: self.handler.start_element( content.name, content.attrs, content.text.strip(), + content.sourceline, tag_column ) if hasattr(content, 'contents'): # recursion self.parse_contents(content.contents) @@ -79,12 +78,6 @@ class Parser(object): self.parse_contents(self.soup.contents) self.encoding = self.soup.original_encoding - def lineno(self): - return self.tag_lineno - - def column(self): - return self.tag_column - def parser(handler=None): return Parser(handler) diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index 9e6459ef..d46e814b 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -83,7 +83,6 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): # construct parser object handler = linkparse.MetaRobotsFinder() parser = htmlsax.parser(handler) - handler.parser = parser # parse try: parser.feed_soup(self.get_soup()) @@ -91,9 +90,6 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): except linkparse.StopParse as msg: log.debug(LOG_CHECK, "Stopped parsing: %s", msg) pass - # break cyclic dependencies - handler.parser = None - parser.handler = None return handler.follow def add_size_info (self): diff --git a/linkcheck/htmlutil/formsearch.py b/linkcheck/htmlutil/formsearch.py index 9419a6c4..e530791c 100644 --- a/linkcheck/htmlutil/formsearch.py +++ b/linkcheck/htmlutil/formsearch.py @@ -44,13 +44,10 @@ class FormFinder(object): def __init__(self): """Initialize local variables.""" super(FormFinder, self).__init__() - # parser object will be initialized when it is used as - # a handler object - self.parser = None self.forms = [] self.form = None - def start_element(self, tag, attrs, element_text=None): + def start_element(self, tag, attrs, element_text, lineno, column): """Does nothing, override in a subclass.""" if tag == u'form': if u'action' in attrs: @@ -69,10 +66,10 @@ class FormFinder(object): log.warn(LOG_CHECK, "formless input %s" % attrs) pass - def start_end_element(self, tag, attrs, element_text=None): + def start_end_element(self, tag, attrs, element_text, lineno, column): """Delegate a combined start/end element (eg. ) to the start_element method. Ignore the end element part.""" - self.start_element(tag, attrs, element_text) + self.start_element(tag, attrs, element_text, lineno, column) def end_element(self, tag): """search for ending form values.""" @@ -87,13 +84,9 @@ def search_form(content, cgiuser, cgipassword): """ handler = FormFinder() parser = htmlsax.parser(handler) - handler.parser = parser # parse parser.feed(content) parser.flush() - # break cyclic dependencies - handler.parser = None - parser.handler = None log.debug(LOG_CHECK, "Found forms %s", handler.forms) cginames = (cgiuser.lower(), cgipassword.lower()) for form in handler.forms: diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py index e5295817..b2ed61e6 100644 --- a/linkcheck/htmlutil/linkparse.py +++ b/linkcheck/htmlutil/linkparse.py @@ -104,18 +104,15 @@ class TagFinder (object): def __init__ (self): """Initialize local variables.""" super(TagFinder, self).__init__() - # parser object will be initialized when it is used as - # a handler object - self.parser = None - def start_element (self, tag, attrs): + def start_element (self, tag, attrs, element_text, lineno, column): """Does nothing, override in a subclass.""" pass - def start_end_element (self, tag, attrs, element_text=None): + def start_end_element (self, tag, attrs, element_text, lineno, column): """Delegate a combined start/end element (eg.
) to the start_element method. Ignore the end element part.""" - self.start_element(tag, attrs, element_text) + self.start_element(tag, attrs, element_text, lineno, column) class MetaRobotsFinder (TagFinder): @@ -127,7 +124,7 @@ class MetaRobotsFinder (TagFinder): log.debug(LOG_CHECK, "meta robots finder") self.follow = self.index = True - def start_element (self, tag, attrs, element_text=None): + def start_element (self, tag, attrs, element_text, lineno, column): """Search for meta robots.txt "nofollow" and "noindex" flags.""" if tag == 'meta' and attrs.get('name') == 'robots': val = attrs.get('content', u'').lower().split(u',') @@ -177,10 +174,10 @@ class LinkFinder (TagFinder): self.tags[tag].update(self.universal_attrs) self.base_ref = u'' - def start_element (self, tag, attrs, element_text=None): + def start_element (self, tag, attrs, element_text, lineno, column): """Search for links and store found URLs in a list.""" log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs) - log.debug(LOG_CHECK, "line %d col %d", self.parser.lineno(), self.parser.column()) + log.debug(LOG_CHECK, "line %d col %d", lineno, column) if tag == "base" and not self.base_ref: self.base_ref = attrs.get("href", u'') tagattrs = self.tags.get(tag, self.universal_attrs) @@ -205,7 +202,7 @@ class LinkFinder (TagFinder): value = value.split(':', 1)[1] value = 'dns:' + value.rstrip('/') # parse tag for URLs - self.parse_tag(tag, attr, value, name, base) + self.parse_tag(tag, attr, value, name, base, lineno, column) log.debug(LOG_CHECK, "LinkFinder finished tag %s", tag) def get_link_name (self, tag, attrs, attr, name=None): @@ -221,7 +218,7 @@ class LinkFinder (TagFinder): name = u"" return name - def parse_tag (self, tag, attr, value, name, base): + def parse_tag (self, tag, attr, value, name, base, lineno, column): """Add given url data to url list.""" assert isinstance(tag, str_text), repr(tag) assert isinstance(attr, str_text), repr(attr) @@ -232,25 +229,24 @@ class LinkFinder (TagFinder): if tag == u'meta' and value: mo = refresh_re.match(value) if mo: - self.found_url(mo.group("url"), name, base) + self.found_url(mo.group("url"), name, base, lineno, column) elif attr != 'content': - self.found_url(value, name, base) + self.found_url(value, name, base, lineno, column) elif attr == u'style' and value: for mo in css_url_re.finditer(value): url = unquote(mo.group("url"), matching=True) - self.found_url(url, name, base) + self.found_url(url, name, base, lineno, column) elif attr == u'archive': for url in value.split(u','): - self.found_url(url, name, base) + self.found_url(url, name, base, lineno, column) elif attr == u'srcset': for img_candidate in value.split(u','): url = img_candidate.split()[0] - self.found_url(url, name, base) + self.found_url(url, name, base, lineno, column) else: - self.found_url(value, name, base) + self.found_url(value, name, base, lineno, column) - def found_url(self, url, name, base): + def found_url(self, url, name, base, lineno, column): """Add newly found URL to queue.""" assert isinstance(url, str_text) or url is None, repr(url) - self.callback(url, line=self.parser.lineno(), - column=self.parser.column(), name=name, base=base) + self.callback(url, line=lineno, column=column, name=name, base=base) diff --git a/linkcheck/parser/__init__.py b/linkcheck/parser/__init__.py index dc3494fb..c9bf471f 100644 --- a/linkcheck/parser/__init__.py +++ b/linkcheck/parser/__init__.py @@ -120,7 +120,6 @@ def find_links (url_data, callback, tags): # construct parser object handler = linkparse.LinkFinder(callback, tags) parser = htmlsax.parser(handler) - handler.parser = parser # parse try: soup = url_data.get_soup() @@ -129,9 +128,6 @@ def find_links (url_data, callback, tags): except linkparse.StopParse as msg: log.debug(LOG_CHECK, "Stopped parsing: %s", msg) pass - # break cyclic dependencies - handler.parser = None - parser.handler = None def parse_firefox (url_data): diff --git a/tests/htmllib.py b/tests/htmllib.py index 6f1c5b19..a42c4e6e 100644 --- a/tests/htmllib.py +++ b/tests/htmllib.py @@ -77,7 +77,7 @@ class HtmlPrettyPrinter: self.fd = fd self.encoding = encoding - def start_element (self, tag, attrs, element_text=None): + def start_element (self, tag, attrs, element_text, lineno, column): """ Print HTML start element. @@ -89,7 +89,7 @@ class HtmlPrettyPrinter: """ self._start_element(tag, attrs, ">", element_text) - def start_end_element (self, tag, attrs, element_text=None): + def start_end_element (self, tag, attrs, element_text, lineno, column): """ Print HTML start-end element. @@ -101,7 +101,7 @@ class HtmlPrettyPrinter: """ self._start_element(tag, attrs, "/>", element_text) - def _start_element (self, tag, attrs, end, element_text=None): + def _start_element (self, tag, attrs, end, element_text): """ Print HTML element with end string. diff --git a/tests/test_linkparser.py b/tests/test_linkparser.py index 0b965af2..a6d1f9b4 100644 --- a/tests/test_linkparser.py +++ b/tests/test_linkparser.py @@ -32,14 +32,11 @@ class TestLinkparser (unittest.TestCase): self.count_url = 0 h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags) p = linkcheck.HtmlParser.htmlsax.parser(h) - h.parser = p try: p.feed(content) p.flush() except linkparse.StopParse: pass - h.parser = None - p.handler = None self.assertEqual(self.count_url, 1) def _test_one_url (self, origurl): @@ -54,14 +51,11 @@ class TestLinkparser (unittest.TestCase): self.assertTrue(False, 'URL %r found' % url) h = linkparse.LinkFinder(callback, linkparse.LinkTags) p = linkcheck.HtmlParser.htmlsax.parser(h) - h.parser = p try: p.feed(content) p.flush() except linkparse.StopParse: pass - h.parser = None - p.handler = None def test_href_parsing (self): # Test parsing. From 40f43ae41cfd03bbb8df566e4c275bee16b209b7 Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Wed, 8 Apr 2020 20:03:35 +0100 Subject: [PATCH 2/5] Create one function to make soup objects --- linkcheck/HtmlParser/htmlsax.py | 4 ++++ linkcheck/checker/httpurl.py | 5 +---- linkcheck/checker/urlbase.py | 11 ++--------- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py index 5a9f7282..dac2e19d 100644 --- a/linkcheck/HtmlParser/htmlsax.py +++ b/linkcheck/HtmlParser/htmlsax.py @@ -27,6 +27,10 @@ filterwarnings("ignore", from bs4 import BeautifulSoup, Tag +def make_soup(markup, from_encoding=None): + return BeautifulSoup(markup, "html.parser", from_encoding=from_encoding, + multi_valued_attributes=None) + class Parser(object): handler = None encoding = None diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index d46e814b..53cce694 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -17,7 +17,6 @@ """ Handle http links. """ -from bs4 import BeautifulSoup import requests # The validity of SSL certs is ignored to be able # the check the URL and recurse into it. @@ -305,9 +304,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): def get_content(self): if self.text is None: self.get_raw_content() - self.soup = BeautifulSoup(self.data, "html.parser", - multi_valued_attributes=None, - from_encoding=self.encoding) + self.soup = htmlsax.make_soup(self.data, self.encoding) self.text = self.data.decode(self.soup.original_encoding) return self.text diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index ca924ad3..bb7debef 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -41,17 +41,11 @@ import select from io import BytesIO from builtins import str as str_text from future.utils import python_2_unicode_compatible -from warnings import filterwarnings - -filterwarnings("ignore", - message="The soupsieve package is not installed. CSS selectors cannot be used.", - category=UserWarning, module="bs4") - -from bs4 import BeautifulSoup from . import absolute_url, get_url_from from .. import (log, LOG_CHECK, strformat, LinkCheckerError, url as urlutil, trace, get_link_pat) +from ..HtmlParser import htmlsax from ..network import iputil from .const import (WARN_URL_EFFECTIVE_URL, WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP, @@ -657,8 +651,7 @@ class UrlBase (object): def get_content (self): if self.text is None: self.get_raw_content() - self.soup = BeautifulSoup(self.data, "html.parser", - multi_valued_attributes=None) + self.soup = htmlsax.make_soup(self.data) self.text = self.data.decode(self.soup.original_encoding) self.encoding = self.soup.original_encoding return self.text From 3771dd913671b2fceb5b36b19342dacaa4bbf1ea Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Wed, 8 Apr 2020 20:03:35 +0100 Subject: [PATCH 3/5] Use parser.feed_soup() instead of parser.feed() Markup is not being passed in pieces to the parser, so simplify the interface and reduce the state further. --- linkcheck/HtmlParser/htmlsax.py | 13 ------- linkcheck/htmlutil/formsearch.py | 2 +- tests/htmllib.py | 40 +-------------------- tests/test_linkparser.py | 10 +++--- tests/test_parser.py | 61 +++++--------------------------- 5 files changed, 15 insertions(+), 111 deletions(-) diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py index dac2e19d..768dc047 100644 --- a/linkcheck/HtmlParser/htmlsax.py +++ b/linkcheck/HtmlParser/htmlsax.py @@ -17,7 +17,6 @@ HTML parser implemented using Beautiful Soup and html.parser. """ -from io import BytesIO, StringIO from warnings import filterwarnings filterwarnings("ignore", @@ -39,20 +38,11 @@ class Parser(object): self.handler = handler self.reset() - def feed(self, feed_text): - if not self.html_doc: - if isinstance(feed_text, bytes): - self.html_doc = BytesIO() - else: - self.html_doc = StringIO() - self.html_doc.write(feed_text) - def feed_soup(self, soup): self.soup = soup def reset(self): self.soup = None - self.html_doc = None def parse_contents(self, contents): for content in contents: @@ -75,9 +65,6 @@ class Parser(object): self.handler.end_element(content.name) def flush(self): - if self.soup is None: - self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser', - multi_valued_attributes=None) if hasattr(self.soup, 'contents'): self.parse_contents(self.soup.contents) self.encoding = self.soup.original_encoding diff --git a/linkcheck/htmlutil/formsearch.py b/linkcheck/htmlutil/formsearch.py index e530791c..d54313f2 100644 --- a/linkcheck/htmlutil/formsearch.py +++ b/linkcheck/htmlutil/formsearch.py @@ -85,7 +85,7 @@ def search_form(content, cgiuser, cgipassword): handler = FormFinder() parser = htmlsax.parser(handler) # parse - parser.feed(content) + parser.feed_soup(htmlsax.make_soup(content)) parser.flush() log.debug(LOG_CHECK, "Found forms %s", handler.forms) cginames = (cgiuser.lower(), cgipassword.lower()) diff --git a/tests/htmllib.py b/tests/htmllib.py index a42c4e6e..ab16ac46 100644 --- a/tests/htmllib.py +++ b/tests/htmllib.py @@ -15,50 +15,12 @@ # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. """ -Default HTML parser handler classes. +HTML parser handler test class. """ import sys -class HtmlPrinter: - """ - Handles all functions by printing the function name and attributes. - """ - - def __init__ (self, fd=sys.stdout): - """ - Write to given file descriptor. - - @param fd: file like object (default=sys.stdout) - @type fd: file - """ - self.fd = fd - - def _print (self, *attrs): - """ - Print function attributes to stored file descriptor. - - @param attrs: list of values to print - @type attrs: tuple - @return: None - """ - self.fd.write(self.mem) - self.fd.write(str(attrs)) - - def __getattr__ (self, name): - """ - Remember the called method name in self.mem. - - @param name: attribute name - @type name: string - @return: method which just prints out its arguments - @rtype: a bound function object - """ - self.mem = name - return self._print - - class HtmlPrettyPrinter: """ Print out all parsed HTML data in encoded form. diff --git a/tests/test_linkparser.py b/tests/test_linkparser.py index a6d1f9b4..9dcbb0c2 100644 --- a/tests/test_linkparser.py +++ b/tests/test_linkparser.py @@ -20,7 +20,7 @@ Test linkparser routines. import unittest from linkcheck.htmlutil import linkparse -import linkcheck.HtmlParser.htmlsax +from linkcheck.HtmlParser import htmlsax class TestLinkparser (unittest.TestCase): @@ -31,9 +31,9 @@ class TestLinkparser (unittest.TestCase): def _test_one_link (self, content, url): self.count_url = 0 h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags) - p = linkcheck.HtmlParser.htmlsax.parser(h) + p = htmlsax.parser(h) try: - p.feed(content) + p.feed_soup(htmlsax.make_soup(content)) p.flush() except linkparse.StopParse: pass @@ -50,9 +50,9 @@ class TestLinkparser (unittest.TestCase): def callback (url, line, column, name, base): self.assertTrue(False, 'URL %r found' % url) h = linkparse.LinkFinder(callback, linkparse.LinkTags) - p = linkcheck.HtmlParser.htmlsax.parser(h) + p = htmlsax.parser(h) try: - p.feed(content) + p.feed_soup(htmlsax.make_soup(content)) p.flush() except linkparse.StopParse: pass diff --git a/tests/test_parser.py b/tests/test_parser.py index fc831361..109f21fa 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -18,14 +18,14 @@ Test html parsing. """ -import linkcheck.HtmlParser.htmlsax +from linkcheck.HtmlParser import htmlsax from io import StringIO import unittest from parameterized import parameterized -from .htmllib import HtmlPrinter, HtmlPrettyPrinter +from .htmllib import HtmlPrettyPrinter # list of tuples # (, ) @@ -137,21 +137,14 @@ class TestParser (unittest.TestCase): Test html parser. """ - def setUp (self): - """ - Initialize two internal html parsers to be used for testing. - """ - self.htmlparser = linkcheck.HtmlParser.htmlsax.parser() - self.htmlparser2 = linkcheck.HtmlParser.htmlsax.parser() - @parameterized.expand(parsetests) def test_parse (self, _in, _out): # Parse all test patterns in one go. out = StringIO() handler = HtmlPrettyPrinter(out) - self.htmlparser.handler = handler - self.htmlparser.feed(_in) - self.check_results(self.htmlparser, _in, _out, out) + parser = htmlsax.parser(handler) + parser.feed_soup(htmlsax.make_soup(_in)) + self.check_results(_in, _out, out) def check_results (self, htmlparser, _in, _out, out): """ @@ -164,44 +157,6 @@ class TestParser (unittest.TestCase): self.assertEqual(res, _out, msg=msg) htmlparser.reset() - @parameterized.expand(parsetests) - def test_feed (self, _in, _out): - # Parse all test patterns sequentially. - out = StringIO() - handler = HtmlPrettyPrinter(out) - self.htmlparser.handler = handler - for c in _in: - self.htmlparser.feed(c) - self.check_results(self.htmlparser, _in, _out, out) - - @parameterized.expand(parsetests) - def test_interwoven (self, _in, _out): - # Parse all test patterns on two parsers interwoven. - out = StringIO() - out2 = StringIO() - handler = HtmlPrettyPrinter(out) - self.htmlparser.handler = handler - handler2 = HtmlPrettyPrinter(out2) - self.htmlparser2.handler = handler2 - for c in _in: - self.htmlparser.feed(c) - self.htmlparser2.feed(c) - self.check_results(self.htmlparser, _in, _out, out) - self.check_results(self.htmlparser2, _in, _out, out2) - - @parameterized.expand(parsetests) - def test_handler (self, _in, _out): - out = StringIO() - out2 = StringIO() - handler = HtmlPrinter(out) - self.htmlparser.handler = handler - handler2 = HtmlPrinter(out2) - self.htmlparser2.handler = handler2 - for c in _in: - self.htmlparser.feed(c) - self.htmlparser2.feed(c) - self.assertEqual(out.getvalue(), out2.getvalue()) - def test_encoding_detection_utf_content (self): html = b'' self.encoding_test(html, "utf-8") @@ -227,11 +182,11 @@ class TestParser (unittest.TestCase): self.encoding_test(html, "ascii") def encoding_test (self, html, expected): - parser = linkcheck.HtmlParser.htmlsax.parser() + parser = htmlsax.parser() self.assertEqual(parser.encoding, None) out = StringIO() handler = HtmlPrettyPrinter(out) - parser.handler = handler - parser.feed(html) + parser = htmlsax.parser(handler) + parser.feed_soup(htmlsax.make_soup(html)) parser.flush() self.assertEqual(parser.encoding, expected) From 02e1c389b2c10ad3124fb2e2a8b021d41267772e Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Wed, 8 Apr 2020 20:03:35 +0100 Subject: [PATCH 4/5] Remove parser flush() and reset() Remnants of the feed() interface. --- linkcheck/HtmlParser/htmlsax.py | 12 ++---------- linkcheck/checker/httpurl.py | 1 - linkcheck/htmlutil/formsearch.py | 1 - linkcheck/parser/__init__.py | 1 - tests/test_linkparser.py | 2 -- tests/test_parser.py | 5 +---- 6 files changed, 3 insertions(+), 19 deletions(-) diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py index 768dc047..b4c6f460 100644 --- a/linkcheck/HtmlParser/htmlsax.py +++ b/linkcheck/HtmlParser/htmlsax.py @@ -36,13 +36,10 @@ class Parser(object): def __init__(self, handler): self.handler = handler - self.reset() def feed_soup(self, soup): - self.soup = soup - - def reset(self): - self.soup = None + self.parse_contents(soup.contents) + self.encoding = soup.original_encoding def parse_contents(self, contents): for content in contents: @@ -64,11 +61,6 @@ class Parser(object): if hasattr(self.handler, 'end_element'): self.handler.end_element(content.name) - def flush(self): - if hasattr(self.soup, 'contents'): - self.parse_contents(self.soup.contents) - self.encoding = self.soup.original_encoding - def parser(handler=None): return Parser(handler) diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index 53cce694..46dab657 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -85,7 +85,6 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): # parse try: parser.feed_soup(self.get_soup()) - parser.flush() except linkparse.StopParse as msg: log.debug(LOG_CHECK, "Stopped parsing: %s", msg) pass diff --git a/linkcheck/htmlutil/formsearch.py b/linkcheck/htmlutil/formsearch.py index d54313f2..eca99ed6 100644 --- a/linkcheck/htmlutil/formsearch.py +++ b/linkcheck/htmlutil/formsearch.py @@ -86,7 +86,6 @@ def search_form(content, cgiuser, cgipassword): parser = htmlsax.parser(handler) # parse parser.feed_soup(htmlsax.make_soup(content)) - parser.flush() log.debug(LOG_CHECK, "Found forms %s", handler.forms) cginames = (cgiuser.lower(), cgipassword.lower()) for form in handler.forms: diff --git a/linkcheck/parser/__init__.py b/linkcheck/parser/__init__.py index c9bf471f..b35892a8 100644 --- a/linkcheck/parser/__init__.py +++ b/linkcheck/parser/__init__.py @@ -124,7 +124,6 @@ def find_links (url_data, callback, tags): try: soup = url_data.get_soup() parser.feed_soup(soup) - parser.flush() except linkparse.StopParse as msg: log.debug(LOG_CHECK, "Stopped parsing: %s", msg) pass diff --git a/tests/test_linkparser.py b/tests/test_linkparser.py index 9dcbb0c2..e0962f7b 100644 --- a/tests/test_linkparser.py +++ b/tests/test_linkparser.py @@ -34,7 +34,6 @@ class TestLinkparser (unittest.TestCase): p = htmlsax.parser(h) try: p.feed_soup(htmlsax.make_soup(content)) - p.flush() except linkparse.StopParse: pass self.assertEqual(self.count_url, 1) @@ -53,7 +52,6 @@ class TestLinkparser (unittest.TestCase): p = htmlsax.parser(h) try: p.feed_soup(htmlsax.make_soup(content)) - p.flush() except linkparse.StopParse: pass diff --git a/tests/test_parser.py b/tests/test_parser.py index 109f21fa..42c048e1 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -146,16 +146,14 @@ class TestParser (unittest.TestCase): parser.feed_soup(htmlsax.make_soup(_in)) self.check_results(_in, _out, out) - def check_results (self, htmlparser, _in, _out, out): + def check_results (self, _in, _out, out): """ Check parse results. """ - htmlparser.flush() res = out.getvalue() msg = "Test error; in: %r, out: %r, expect: %r" % \ (_in, res, _out) self.assertEqual(res, _out, msg=msg) - htmlparser.reset() def test_encoding_detection_utf_content (self): html = b'' @@ -188,5 +186,4 @@ class TestParser (unittest.TestCase): handler = HtmlPrettyPrinter(out) parser = htmlsax.parser(handler) parser.feed_soup(htmlsax.make_soup(html)) - parser.flush() self.assertEqual(parser.encoding, expected) From 974915cc4f733fb314a8b8fd001e8abcf56ac9ef Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Wed, 8 Apr 2020 20:03:35 +0100 Subject: [PATCH 5/5] Remove encoding from Parser Only used by the test and an attribute of the soup object. --- linkcheck/HtmlParser/htmlsax.py | 2 -- tests/test_parser.py | 7 +++---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py index b4c6f460..df35d722 100644 --- a/linkcheck/HtmlParser/htmlsax.py +++ b/linkcheck/HtmlParser/htmlsax.py @@ -32,14 +32,12 @@ def make_soup(markup, from_encoding=None): class Parser(object): handler = None - encoding = None def __init__(self, handler): self.handler = handler def feed_soup(self, soup): self.parse_contents(soup.contents) - self.encoding = soup.original_encoding def parse_contents(self, contents): for content in contents: diff --git a/tests/test_parser.py b/tests/test_parser.py index 42c048e1..7e087082 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -180,10 +180,9 @@ class TestParser (unittest.TestCase): self.encoding_test(html, "ascii") def encoding_test (self, html, expected): - parser = htmlsax.parser() - self.assertEqual(parser.encoding, None) out = StringIO() handler = HtmlPrettyPrinter(out) parser = htmlsax.parser(handler) - parser.feed_soup(htmlsax.make_soup(html)) - self.assertEqual(parser.encoding, expected) + soup = htmlsax.make_soup(html) + parser.feed_soup(soup) + self.assertEqual(soup.original_encoding, expected)