mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-12 10:30:58 +00:00
Properly detect HTML character encoding.
This commit is contained in:
parent
689ab9f073
commit
d2ae6bf71c
3 changed files with 35 additions and 4 deletions
|
|
@ -1,3 +1,11 @@
|
|||
7.2 "" (released xx.xx.2011)
|
||||
|
||||
Fixes:
|
||||
- checking: HTML parser now correctly detects character encoding for
|
||||
some sites.
|
||||
Closes: SF bug #3388291
|
||||
|
||||
|
||||
7.1 "A fish called Wanda" (released 6.8.2011)
|
||||
|
||||
Fixes:
|
||||
|
|
|
|||
|
|
@ -221,11 +221,18 @@ def set_encoding (parsobj, attrs):
|
|||
@type attrs: dict
|
||||
@return: None
|
||||
"""
|
||||
if attrs.get_true('http-equiv', u'').lower() == u"content-type":
|
||||
charset = attrs.get_true('charset', u'')
|
||||
if charset:
|
||||
# <meta charset="utf-8">
|
||||
# eg. in http://cn.dolphin-browser.com/activity/Dolphinjump
|
||||
charset = charset.encode('ascii', 'ignore').lower()
|
||||
elif attrs.get_true('http-equiv', u'').lower() == u"content-type":
|
||||
# <meta http-equiv="content-type" content="text/html;charset=utf-8">
|
||||
charset = attrs.get_true('content', u'')
|
||||
charset = get_ctype_charset(charset.encode('ascii', 'ignore'))
|
||||
if charset and charset.lower() in SUPPORTED_CHARSETS:
|
||||
parsobj.encoding = charset
|
||||
charset = charset.encode('ascii', 'ignore').lower()
|
||||
charset = get_ctype_charset(charset)
|
||||
if charset and charset in SUPPORTED_CHARSETS:
|
||||
parsobj.encoding = charset
|
||||
|
||||
|
||||
def get_ctype_charset (text):
|
||||
|
|
|
|||
|
|
@ -282,3 +282,19 @@ class TestParser (unittest.TestCase):
|
|||
|
||||
self.htmlparser.handler = NamePeeker()
|
||||
self.htmlparser.feed(data)
|
||||
|
||||
def test_encoding_detection (self):
    """
    Check which encoding the parser reports for several <meta> tags.

    Every case pairs an HTML snippet with the encoding name expected
    after the snippet has been handed to encoding_test(). The cases
    with the charset names "hulla" and "blabla" expect "iso8859-1".
    """
    cases = (
        # charset given inside the content attribute of an http-equiv tag
        ('<meta http-equiv="content-type" content="text/html; charset=UTF-8">',
         "utf-8"),
        # charset given directly as an attribute
        ('<meta charset="UTF-8">', "utf-8"),
        # same two tag forms, with charset names expecting "iso8859-1"
        ('<meta charset="hulla">', "iso8859-1"),
        ('<meta http-equiv="content-type" content="text/html; charset=blabla">',
         "iso8859-1"),
    )
    for snippet, wanted in cases:
        self.encoding_test(snippet, wanted)
|
||||
|
||||
def encoding_test (self, html, expected):
    """
    Feed an HTML snippet to a new parser and verify its encoding.

    @param html: HTML data passed to the parser's feed() method
    @param expected: encoding name the parser must report after feeding
    @return: None
    """
    sax_parser = linkcheck.HtmlParser.htmlsax.parser()
    # Before any data is fed the parser must report iso8859-1.
    initial_encoding = sax_parser.encoding
    self.assertEqual(initial_encoding, "iso8859-1")
    sax_parser.feed(html)
    detected_encoding = sax_parser.encoding
    self.assertEqual(detected_encoding, expected)
|
||||
|
|
|
|||
Loading…
Reference in a new issue