diff --git a/doc/changelog.txt b/doc/changelog.txt
index 60829fce..1c8b1033 100644
--- a/doc/changelog.txt
+++ b/doc/changelog.txt
@@ -1,3 +1,11 @@
+7.2 "" (released xx.xx.2011)
+
+Fixes:
+- checking: HTML parser now correctly detects character encoding for
+ some sites.
+ Closes: SF bug #3388291
+
+
7.1 "A fish called Wanda" (released 6.8.2011)
Fixes:
diff --git a/linkcheck/HtmlParser/__init__.py b/linkcheck/HtmlParser/__init__.py
index f00965cb..429ec8d5 100644
--- a/linkcheck/HtmlParser/__init__.py
+++ b/linkcheck/HtmlParser/__init__.py
@@ -221,11 +221,18 @@ def set_encoding (parsobj, attrs):
@type attrs: dict
@return: None
"""
- if attrs.get_true('http-equiv', u'').lower() == u"content-type":
+ charset = attrs.get_true('charset', u'')
+ if charset:
+ # <meta charset="utf-8">
+ # eg. in http://cn.dolphin-browser.com/activity/Dolphinjump
+ charset = charset.encode('ascii', 'ignore').lower()
+ elif attrs.get_true('http-equiv', u'').lower() == u"content-type":
+ # <meta http-equiv="content-type" content="text/html; charset=utf-8">
charset = attrs.get_true('content', u'')
- charset = get_ctype_charset(charset.encode('ascii', 'ignore'))
- if charset and charset.lower() in SUPPORTED_CHARSETS:
- parsobj.encoding = charset
+ charset = charset.encode('ascii', 'ignore').lower()
+ charset = get_ctype_charset(charset)
+ if charset and charset in SUPPORTED_CHARSETS:
+ parsobj.encoding = charset
def get_ctype_charset (text):
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 1b710b0d..d53722c4 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -282,3 +282,19 @@ class TestParser (unittest.TestCase):
self.htmlparser.handler = NamePeeker()
self.htmlparser.feed(data)
+
+    def test_encoding_detection (self):
+        """Check that <meta> charset declarations select the parser encoding."""
+        ctype = '<meta http-equiv="content-type" content="text/html; charset=%s">'
+        # Declared charsets are lower-cased before being adopted.
+        self.encoding_test(ctype % "UTF-8", "utf-8")
+        self.encoding_test('<meta charset="UTF-8">', "utf-8")
+        # An unsupported charset must leave the iso8859-1 default untouched.
+        self.encoding_test('<meta charset="hulla">', "iso8859-1")
+        self.encoding_test(ctype % "blabla", "iso8859-1")
+
+    def encoding_test (self, html, expected):  # helper shared by the checks above
+        sax = linkcheck.HtmlParser.htmlsax.parser()
+        self.assertEqual(sax.encoding, "iso8859-1")  # default before any input
+        sax.feed(html)
+        self.assertEqual(sax.encoding, expected)  # encoding once html was fed