Reuse soup object used for detecting encoding in the HTML parser

This commit is contained in:
Chris Mayo 2019-10-05 19:38:57 +01:00
parent 978042a54e
commit 153e53ba03
4 changed files with 22 additions and 10 deletions

View file

@@ -46,7 +46,11 @@ class Parser(object):
self.html_doc = StringIO()
self.html_doc.write(feed_text)
def feed_soup(self, soup):
    """Record an externally built BeautifulSoup object so flush() can reuse it."""
    self.soup = soup
def reset(self):
    """Clear all per-document parser state back to its pristine form."""
    for attr in ("soup", "html_doc", "tag_lineno", "tag_column"):
        setattr(self, attr, None)
@@ -104,10 +108,11 @@ class Parser(object):
self.handler.characters(content)
def flush(self):
# Finish parsing: build/refresh the soup, walk its element tree via
# parse_contents(), and record the detected document encoding.
# NOTE(review): this span is a rendered diff with no +/- markers — the
# next four lines are the pre-change body (throwaway local soup) and the
# five after them are the post-change body that reuses a soup handed in
# via feed_soup(); only one variant exists in the real file.
# pre-change variant (removed by this commit):
soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser')
if hasattr(soup, 'contents'):
self.parse_contents(soup.contents)
self.encoding = soup.original_encoding
# post-change variant (added by this commit): parse only when no soup
# was supplied up front, so encoding detection work is not repeated.
if self.soup is None:
self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser')
if hasattr(self.soup, 'contents'):
self.parse_contents(self.soup.contents)
self.encoding = self.soup.original_encoding
def debug(self, text):
    """Abstract debug hook; concrete parser subclasses must override this."""
    raise NotImplementedError("debug is not implemented")

View file

@@ -86,7 +86,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
handler.parser = parser
# parse
try:
parser.feed(self.get_raw_content())
parser.feed_soup(self.get_soup())
parser.flush()
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)

View file

@@ -221,6 +221,8 @@ class UrlBase (object):
self.data = None
# url content as a Unicode string
self.text = None
# url content as a Beautiful Soup object
self.soup = None
# cache url is set by build_url() calling set_cache_url()
self.cache_url = None
# extern flags (is_extern, is_strict)
@@ -643,6 +645,11 @@ class UrlBase (object):
self.aggregate.add_downloaded_bytes(self.size)
return content
def get_soup(self):
    """Return the cached soup object, building it first via get_content()
    when it has not been created yet."""
    if self.soup is not None:
        return self.soup
    self.get_content()
    return self.soup
def get_raw_content(self):
if self.data is None:
self.data = self.download_content()
@@ -651,9 +658,9 @@
def get_content (self):
# Lazily decode the downloaded bytes (self.data) into text, letting
# BeautifulSoup sniff the encoding; the decoded text is cached on
# self.text and the detected encoding on self.encoding.
# NOTE(review): rendered diff without +/- markers — the next three
# statements are the pre-change version (local soup, discarded) and the
# following three are the post-change version that also caches the soup
# on self for reuse by get_soup(); only one variant exists on disk.
if self.text is None:
self.get_raw_content()
# pre-change variant (removed by this commit):
soup = BeautifulSoup(self.data, "html.parser")
self.text = self.data.decode(soup.original_encoding)
self.encoding = soup.original_encoding
# post-change variant (added by this commit):
self.soup = BeautifulSoup(self.data, "html.parser")
self.text = self.data.decode(self.soup.original_encoding)
self.encoding = self.soup.original_encoding
return self.text
def read_content(self):

View file

@@ -128,9 +128,9 @@ def find_links (url_data, callback, tags):
handler.parser = parser
# parse
try:
content = url_data.get_raw_content()
soup = url_data.get_soup()
with parse_mutex:
parser.feed(content)
parser.feed_soup(soup)
parser.flush()
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)