From f7337f55e8112f3e8c80658a9efd99c90b7557e6 Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Sat, 23 May 2020 20:01:24 +0100 Subject: [PATCH] Fix error due to an empty html file accessed over http Use the already fixed [1] UrlBase.get_content() in HttpUrl. [1] 5bd1fb4 ("Fix internal error on empty HTML files", 2020-05-21) --- linkcheck/checker/httpurl.py | 6 +----- linkcheck/checker/urlbase.py | 4 ++-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index e0c79254..a0ed556f 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -299,11 +299,7 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): self.set_result(_("OK")) def get_content(self): - if self.text is None: - self.get_raw_content() - self.soup = htmlsoup.make_soup(self.data, self.encoding) - self.text = self.data.decode(self.soup.original_encoding) - return self.text + return super().get_content(self.encoding) def read_content(self): """Return data and data size for this URL. diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index 1b288ba5..cccb4f38 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -628,10 +628,10 @@ class UrlBase: self.data = self.download_content() return self.data - def get_content(self): + def get_content(self, encoding=None): if self.text is None: self.get_raw_content() - self.soup = htmlsoup.make_soup(self.data) + self.soup = htmlsoup.make_soup(self.data, encoding) # Sometimes soup.original_encoding is None! Better mangled text # than an internal crash, eh? ISO-8859-1 is a safe fallback in the # sense that any binary blob can be decoded, it'll never cause a