From d2ae6bf71c18fd9ff92cd54d6c8088666efafd54 Mon Sep 17 00:00:00 2001
From: Bastian Kleineidam <calvin@debian.org>
Date: Sun, 14 Aug 2011 12:49:31 +0200
Subject: [PATCH] Properly detect HTML character encoding.

---
 doc/changelog.txt                |  8 ++++++++
 linkcheck/HtmlParser/__init__.py | 15 +++++++++++----
 tests/test_parser.py             | 16 ++++++++++++++++
 3 files changed, 35 insertions(+), 4 deletions(-)
diff --git a/doc/changelog.txt b/doc/changelog.txt
index 60829fce..1c8b1033 100644
--- a/doc/changelog.txt
+++ b/doc/changelog.txt
@@ -1,3 +1,11 @@
+7.2 "" (released xx.xx.2011)
+
+Fixes:
+- checking: HTML parser now correctly detects character encoding for
+  some sites.
+  Closes: SF bug #3388291
+
+
 7.1 "A fish called Wanda" (released 6.8.2011)
 
 Fixes:
diff --git a/linkcheck/HtmlParser/__init__.py b/linkcheck/HtmlParser/__init__.py
index f00965cb..429ec8d5 100644
--- a/linkcheck/HtmlParser/__init__.py
+++ b/linkcheck/HtmlParser/__init__.py
@@ -221,11 +221,18 @@ def set_encoding (parsobj, attrs):
     @type attrs: dict
     @return: None
     """
-    if attrs.get_true('http-equiv', u'').lower() == u"content-type":
+    charset = attrs.get_true('charset', u'')
+    if charset:
+        # <meta charset="utf-8">
+        # e.g. in http://cn.dolphin-browser.com/activity/Dolphinjump
+        charset = charset.encode('ascii', 'ignore').lower()
+    elif attrs.get_true('http-equiv', u'').lower() == u"content-type":
+        # <meta http-equiv="content-type" content="text/html; charset=utf-8">
         charset = attrs.get_true('content', u'')
-        charset = get_ctype_charset(charset.encode('ascii', 'ignore'))
-        if charset and charset.lower() in SUPPORTED_CHARSETS:
-            parsobj.encoding = charset
+        charset = charset.encode('ascii', 'ignore').lower()
+        charset = get_ctype_charset(charset)
+    if charset and charset in SUPPORTED_CHARSETS:
+        parsobj.encoding = charset
 
 
 def get_ctype_charset (text):
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 1b710b0d..d53722c4 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -282,3 +282,19 @@ class TestParser (unittest.TestCase):
 
         self.htmlparser.handler = NamePeeker()
         self.htmlparser.feed(data)
+
+    def test_encoding_detection (self):
+        html = '<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
+        self.encoding_test(html, "utf-8")
+        html = '<meta charset="UTF-8">'
+        self.encoding_test(html, "utf-8")
+        html = '<meta charset="hulla">'
+        self.encoding_test(html, "iso8859-1")
+        html = '<meta http-equiv="content-type" content="text/html; charset=blabla">'
+        self.encoding_test(html, "iso8859-1")
+
+    def encoding_test (self, html, expected):
+        parser = linkcheck.HtmlParser.htmlsax.parser()
+        self.assertEqual(parser.encoding, "iso8859-1")
+        parser.feed(html)
+        self.assertEqual(parser.encoding, expected)