From d2ae6bf71c18fd9ff92cd54d6c8088666efafd54 Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Sun, 14 Aug 2011 12:49:31 +0200 Subject: [PATCH] Properly detect HTML character encoding. --- doc/changelog.txt | 8 ++++++++ linkcheck/HtmlParser/__init__.py | 15 +++++++++++---- tests/test_parser.py | 16 ++++++++++++++++ 3 files changed, 35 insertions(+), 4 deletions(-) diff --git a/doc/changelog.txt b/doc/changelog.txt index 60829fce..1c8b1033 100644 --- a/doc/changelog.txt +++ b/doc/changelog.txt @@ -1,3 +1,11 @@ +7.2 "" (released xx.xx.2011) + +Fixes: +- checking: HTML parser now correctly detects character encoding for + some sites. + Closes: SF bug #3388291 + + 7.1 "A fish called Wanda" (released 6.8.2011) Fixes: diff --git a/linkcheck/HtmlParser/__init__.py b/linkcheck/HtmlParser/__init__.py index f00965cb..429ec8d5 100644 --- a/linkcheck/HtmlParser/__init__.py +++ b/linkcheck/HtmlParser/__init__.py @@ -221,11 +221,18 @@ def set_encoding (parsobj, attrs): @type attrs: dict @return: None """ - if attrs.get_true('http-equiv', u'').lower() == u"content-type": + charset = attrs.get_true('charset', u'') + if charset: + # <meta charset="utf-8"> + # eg. 
in http://cn.dolphin-browser.com/activity/Dolphinjump + charset = charset.encode('ascii', 'ignore').lower() + elif attrs.get_true('http-equiv', u'').lower() == u"content-type": + # <meta http-equiv="content-type" content="text/html; charset=utf-8"> charset = attrs.get_true('content', u'') - charset = get_ctype_charset(charset.encode('ascii', 'ignore')) - if charset and charset.lower() in SUPPORTED_CHARSETS: - parsobj.encoding = charset + charset = charset.encode('ascii', 'ignore').lower() + charset = get_ctype_charset(charset) + if charset and charset in SUPPORTED_CHARSETS: + parsobj.encoding = charset def get_ctype_charset (text): diff --git a/tests/test_parser.py b/tests/test_parser.py index 1b710b0d..d53722c4 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -282,3 +282,19 @@ class TestParser (unittest.TestCase): self.htmlparser.handler = NamePeeker() self.htmlparser.feed(data) + + def test_encoding_detection (self): + html = '<meta http-equiv="content-type" content="text/html; charset=UTF-8">' + self.encoding_test(html, "utf-8") + html = '<meta charset="UTF-8">' + self.encoding_test(html, "utf-8") + html = '<meta charset="hulla">' + self.encoding_test(html, "iso8859-1") + html = '<meta http-equiv="content-type" content="text/html; charset=hulla">' + self.encoding_test(html, "iso8859-1") + + def encoding_test (self, html, expected): + parser = linkcheck.HtmlParser.htmlsax.parser() + self.assertEqual(parser.encoding, "iso8859-1") + parser.feed(html) + self.assertEqual(parser.encoding, expected)