From 5bd1fb4e36f19455c7c291eb0616236043b5a7d3 Mon Sep 17 00:00:00 2001
From: Marius Gedminas
Date: Thu, 21 May 2020 19:01:33 +0300
Subject: [PATCH] Fix internal error on empty HTML files

When BeautifulSoup finds an empty file on disk, it sets
original_encoding to None.

It doesn't matter what encoding we pick for empty files, so let's just
pick one.

I don't know if there are any circumstances where BeautifulSoup might
set the encoding to None for a non-empty file.

Closes #392.
---
 linkcheck/checker/urlbase.py         | 6 ++++--
 tests/checker/data/empty.html        | 0
 tests/checker/data/empty.html.result | 5 +++++
 tests/checker/test_file.py           | 3 +++
 4 files changed, 12 insertions(+), 2 deletions(-)
 create mode 100644 tests/checker/data/empty.html
 create mode 100644 tests/checker/data/empty.html.result

diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py
index 57f0d28a..f4081301 100644
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@@ -632,8 +632,10 @@ class UrlBase:
         if self.text is None:
             self.get_raw_content()
             self.soup = htmlsoup.make_soup(self.data)
-            self.text = self.data.decode(self.soup.original_encoding)
-            self.encoding = self.soup.original_encoding
+            # Sometimes soup.original_encoding is None! Better mangled text
+            # than an internal crash, eh?
+            self.encoding = self.soup.original_encoding or 'ISO-8859-1'
+            self.text = self.data.decode(self.encoding)
         return self.text
 
     def read_content(self):
diff --git a/tests/checker/data/empty.html b/tests/checker/data/empty.html
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/checker/data/empty.html.result b/tests/checker/data/empty.html.result
new file mode 100644
index 00000000..7df3f536
--- /dev/null
+++ b/tests/checker/data/empty.html.result
@@ -0,0 +1,5 @@
+url file://%(curdir)s/%(datadir)s/empty.html
+cache key file://%(curdir)s/%(datadir)s/empty.html
+real url file://%(curdir)s/%(datadir)s/empty.html
+name %(datadir)s/empty.html
+valid
diff --git a/tests/checker/test_file.py b/tests/checker/test_file.py
index ba9ebc15..e91c144d 100644
--- a/tests/checker/test_file.py
+++ b/tests/checker/test_file.py
@@ -68,6 +68,9 @@ class TestFile(LinkCheckTest):
     def test_php(self):
         self.file_test("file.php")
 
+    def test_empty(self):
+        self.file_test("empty.html")
+
     @need_word
     def test_word(self):
         confargs = dict(enabledplugins=["WordParser"])