diff --git a/linkcheck/HtmlParser/htmllib.py b/linkcheck/HtmlParser/htmllib.py index 0a8980e3..2a1dfc2b 100644 --- a/linkcheck/HtmlParser/htmllib.py +++ b/linkcheck/HtmlParser/htmllib.py @@ -99,7 +99,7 @@ class HtmlPrettyPrinter (object): @type attrs: dict @return: None """ - self._start_element(tag, attrs, ">") + self._start_element(tag, attrs, u">") def start_end_element (self, tag, attrs, element_text=None): """ @@ -111,7 +111,7 @@ class HtmlPrettyPrinter (object): @type attrs: dict @return: None """ - self._start_element(tag, attrs, "/>") + self._start_element(tag, attrs, u"/>") def _start_element (self, tag, attrs, end): """ @@ -125,12 +125,12 @@ class HtmlPrettyPrinter (object): @type end: string @return: None """ - self.fd.write("<%s" % tag.replace("/", "")) + self.fd.write(u"<%s" % tag.replace("/", "")) for key, val in attrs.items(): if val is None: - self.fd.write(" %s" % key) + self.fd.write(u" %s" % key) else: - self.fd.write(' %s="%s"' % (key, quote_attrval(val))) + self.fd.write(u' %s="%s"' % (key, quote_attrval(val))) self.fd.write(end) def end_element (self, tag): diff --git a/tests/test_parser.py b/tests/test_parser.py index 4465c19b..054d73e5 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1,4 +1,4 @@ -# -*- coding: iso-8859-1 -*- +# -*- coding: utf8 -*- # Copyright (C) 2004-2012 Bastian Kleineidam # # This program is free software; you can redistribute it and/or modify @@ -54,9 +54,9 @@ parsetests = [ ("""< a >""", """< a >"""), ("""<>""", """<>"""), ("""< >""", """< >"""), - ("""""", u""""""), - ("""""", u""""""), - ("""""", u""""""), + ("""""", u""""""), + ("""""", u""""""), + ("""""", u""""""), # multiple attribute names should be ignored... ("""""", """"""), # ... but which one wins - in our implementation the last one @@ -97,7 +97,7 @@ parsetests = [ ("""""", """"""), ("""< / a>""", """< / a>"""), ("""< /a>""", """< /a>"""), - ("""""", """"""), + ("""""", """"""), # start and end tag (HTML doctype assumed) ("""""", """"""), ("""""", """"""), @@ -144,8 +144,8 @@ parsetests = [ # note that \u8156 is not valid encoding and therefore gets removed ("""""", """"""), # non-ascii characters - ("""<Üzgür> fahr ¿¿¿¿¿¿{""", - u"""<Üzgür> fahr ¿¿¿¿¿¿{"""), + ("""<Üzgür> fahr żżżżżż{""", + u"""<Üzgür> fahr żżżżżż{"""), # mailto link ("""1""", """1"""),