Fix internal error on empty HTML files

When BeautifulSoup is given the contents of an empty file, it sets
original_encoding to None, and decoding the data with that None encoding
is what triggered the internal error.  It doesn't matter what encoding we
pick for empty files, so let's just pick one.

I don't know if there are any circumstances where BeautifulSoup might
set the encoding to None for a non-empty file.

Closes #392.
This commit is contained in:
Marius Gedminas 2020-05-21 19:01:33 +03:00
parent a226b4e406
commit 5bd1fb4e36
4 changed files with 12 additions and 2 deletions

View file

@ -632,8 +632,10 @@ class UrlBase:
if self.text is None:
self.get_raw_content()
self.soup = htmlsoup.make_soup(self.data)
self.text = self.data.decode(self.soup.original_encoding)
self.encoding = self.soup.original_encoding
# Sometimes soup.original_encoding is None! Better mangled text
# than an internal crash, eh?
self.encoding = self.soup.original_encoding or 'ISO-8859-1'
self.text = self.data.decode(self.encoding)
return self.text
def read_content(self):

View file

View file

@ -0,0 +1,5 @@
url file://%(curdir)s/%(datadir)s/empty.html
cache key file://%(curdir)s/%(datadir)s/empty.html
real url file://%(curdir)s/%(datadir)s/empty.html
name %(datadir)s/empty.html
valid

View file

@ -68,6 +68,9 @@ class TestFile(LinkCheckTest):
def test_php(self):
self.file_test("file.php")
def test_empty(self):
self.file_test("empty.html")
@need_word
def test_word(self):
confargs = dict(enabledplugins=["WordParser"])