Fix error caused by an empty HTML file accessed over HTTP

Use the already-fixed [1] UrlBase.get_content() in HttpUrl.

[1] 5bd1fb4 ("Fix internal error on empty HTML files", 2020-05-21)
2026-05-08 22:54:51 +00:00 · 2020-05-23 20:01:24 +01:00 · 2020-05-23 20:01:24 +01:00 · f7337f55e8
commit f7337f55e8
parent d611564cb0
2 changed files with 3 additions and 7 deletions
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -299,11 +299,7 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
                self.set_result(_("OK"))

    def get_content(self):
-        if self.text is None:
-            self.get_raw_content()
-            self.soup = htmlsoup.make_soup(self.data, self.encoding)
-            self.text = self.data.decode(self.soup.original_encoding)
-        return self.text
+        return super().get_content(self.encoding)

    def read_content(self):
        """Return data and data size for this URL.
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@@ -628,10 +628,10 @@ class UrlBase:
            self.data = self.download_content()
        return self.data

-    def get_content(self):
+    def get_content(self, encoding=None):
        if self.text is None:
            self.get_raw_content()
-            self.soup = htmlsoup.make_soup(self.data)
+            self.soup = htmlsoup.make_soup(self.data, encoding)
            # Sometimes soup.original_encoding is None!  Better mangled text
            # than an internal crash, eh?  ISO-8859-1 is a safe fallback in the
            # sense that any binary blob can be decoded, it'll never cause a