mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-16 22:10:26 +00:00
Fix internal error on empty HTML files
When BeautifulSoup finds an empty file on disk, it sets original_encoding to None. It doesn't matter what encoding we pick for empty files, so let's just pick one. I don't know if there are any circumstances where BeautifulSoup might set the encoding to None for a non-empty file. Closes #392.
This commit is contained in:
parent
a226b4e406
commit
5bd1fb4e36
4 changed files with 12 additions and 2 deletions
|
|
@@ -632,8 +632,10 @@ class UrlBase:
|
|||
if self.text is None:
|
||||
self.get_raw_content()
|
||||
self.soup = htmlsoup.make_soup(self.data)
|
||||
self.text = self.data.decode(self.soup.original_encoding)
|
||||
self.encoding = self.soup.original_encoding
|
||||
# Sometimes soup.original_encoding is None! Better mangled text
|
||||
# than an internal crash, eh?
|
||||
self.encoding = self.soup.original_encoding or 'ISO-8859-1'
|
||||
self.text = self.data.decode(self.encoding)
|
||||
return self.text
|
||||
|
||||
def read_content(self):
|
||||
|
|
|
|||
0
tests/checker/data/empty.html
Normal file
0
tests/checker/data/empty.html
Normal file
5
tests/checker/data/empty.html.result
Normal file
5
tests/checker/data/empty.html.result
Normal file
|
|
@@ -0,0 +1,5 @@
|
|||
url file://%(curdir)s/%(datadir)s/empty.html
|
||||
cache key file://%(curdir)s/%(datadir)s/empty.html
|
||||
real url file://%(curdir)s/%(datadir)s/empty.html
|
||||
name %(datadir)s/empty.html
|
||||
valid
|
||||
|
|
@@ -68,6 +68,9 @@ class TestFile(LinkCheckTest):
|
|||
def test_php(self):
|
||||
self.file_test("file.php")
|
||||
|
||||
def test_empty(self):
|
||||
self.file_test("empty.html")
|
||||
|
||||
@need_word
|
||||
def test_word(self):
|
||||
confargs = dict(enabledplugins=["WordParser"])
|
||||
|
|
|
|||
Loading…
Reference in a new issue