Properly detect HTML character encoding.

This commit is contained in:
Bastian Kleineidam 2011-08-14 12:49:31 +02:00
parent 689ab9f073
commit d2ae6bf71c
3 changed files with 35 additions and 4 deletions

View file

@ -1,3 +1,11 @@
7.2 "" (released xx.xx.2011)
Fixes:
- checking: HTML parser now correctly detects character encoding for
some sites.
Closes: SF bug #3388291
7.1 "A fish called Wanda" (released 6.8.2011)
Fixes:

View file

@ -221,11 +221,18 @@ def set_encoding (parsobj, attrs):
@type attrs: dict
@return: None
"""
if attrs.get_true('http-equiv', u'').lower() == u"content-type":
charset = attrs.get_true('charset', u'')
if charset:
# <meta charset="utf-8">
# eg. in http://cn.dolphin-browser.com/activity/Dolphinjump
charset = charset.encode('ascii', 'ignore').lower()
elif attrs.get_true('http-equiv', u'').lower() == u"content-type":
# <meta http-equiv="content-type" content="text/html;charset=utf-8">
charset = attrs.get_true('content', u'')
charset = get_ctype_charset(charset.encode('ascii', 'ignore'))
if charset and charset.lower() in SUPPORTED_CHARSETS:
parsobj.encoding = charset
charset = charset.encode('ascii', 'ignore').lower()
charset = get_ctype_charset(charset)
if charset and charset in SUPPORTED_CHARSETS:
parsobj.encoding = charset
def get_ctype_charset (text):

View file

@ -282,3 +282,19 @@ class TestParser (unittest.TestCase):
self.htmlparser.handler = NamePeeker()
self.htmlparser.feed(data)
def test_encoding_detection (self):
    """
    Check the encoding the parser reports for several <meta> tags.

    Each case pairs a markup snippet with the encoding expected after
    the snippet has been fed to a parser; unknown charset names are
    expected to yield iso8859-1.
    """
    cases = (
        # http-equiv declaration naming a known charset
        ('<meta http-equiv="content-type" content="text/html; charset=UTF-8">',
         "utf-8"),
        # short <meta charset=...> form naming a known charset
        ('<meta charset="UTF-8">',
         "utf-8"),
        # short form with an unknown charset name
        ('<meta charset="hulla">',
         "iso8859-1"),
        # http-equiv form with an unknown charset name
        ('<meta http-equiv="content-type" content="text/html; charset=blabla">',
         "iso8859-1"),
    )
    for markup, wanted in cases:
        self.encoding_test(markup, wanted)
def encoding_test (self, html, expected):
    """
    Feed markup to a newly created parser and verify its encoding.

    @param html: markup snippet passed to the parser's feed() method
    @param expected: encoding the parser must report after feeding
    @return: None
    """
    sax_parser = linkcheck.HtmlParser.htmlsax.parser()
    # A parser that has not seen any input yet must report iso8859-1.
    self.assertEqual(sax_parser.encoding, "iso8859-1")
    sax_parser.feed(html)
    # After feeding, the encoding must match what the caller expects.
    self.assertEqual(sax_parser.encoding, expected)