Properly detect HTML character encoding.

2026-05-27 15:24:00 +00:00 · 2011-08-14 12:49:31 +02:00 · 2011-08-14 12:49:31 +02:00 · d2ae6bf71c
commit d2ae6bf71c
parent 689ab9f073
3 changed files with 35 additions and 4 deletions
--- a/doc/changelog.txt
+++ b/doc/changelog.txt
@@ -1,3 +1,11 @@
+7.2 "" (released xx.xx.2011)
+
+Fixes:
+- checking: HTML parser now correctly detects character encoding for
+  some sites.
+  Closes: SF bug #3388291
+
+
 7.1 "A fish called Wanda" (released 6.8.2011)

 Fixes:
--- a/linkcheck/HtmlParser/__init__.py
+++ b/linkcheck/HtmlParser/__init__.py
@@ -221,11 +221,18 @@ def set_encoding (parsobj, attrs):
    @type attrs: dict
    @return: None
    """
-    if attrs.get_true('http-equiv', u'').lower() == u"content-type":
+    charset = attrs.get_true('charset', u'')
+    if charset:
+        # <meta charset="utf-8">
+        # eg. in http://cn.dolphin-browser.com/activity/Dolphinjump
+        charset = charset.encode('ascii', 'ignore').lower()
+    elif attrs.get_true('http-equiv', u'').lower() == u"content-type":
+        # <meta http-equiv="content-type" content="text/html;charset=utf-8">
        charset = attrs.get_true('content', u'')
-        charset = get_ctype_charset(charset.encode('ascii', 'ignore'))
-        if charset and charset.lower() in SUPPORTED_CHARSETS:
-            parsobj.encoding = charset
+        charset = charset.encode('ascii', 'ignore').lower()
+        charset = get_ctype_charset(charset)
+    if charset and charset in SUPPORTED_CHARSETS:
+        parsobj.encoding = charset


 def get_ctype_charset (text):
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -282,3 +282,19 @@ class TestParser (unittest.TestCase):

        self.htmlparser.handler = NamePeeker()
        self.htmlparser.feed(data)
+
+    def test_encoding_detection (self):
+        html = '<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
+        self.encoding_test(html, "utf-8")
+        html = '<meta charset="UTF-8">'
+        self.encoding_test(html, "utf-8")
+        html = '<meta charset="hulla">'
+        self.encoding_test(html, "iso8859-1")
+        html = '<meta http-equiv="content-type" content="text/html; charset=blabla">'
+        self.encoding_test(html, "iso8859-1")
+
+    def encoding_test (self, html, expected):
+        parser = linkcheck.HtmlParser.htmlsax.parser()
+        self.assertEqual(parser.encoding, "iso8859-1")
+        parser.feed(html)
+        self.assertEqual(parser.encoding, expected)