From f0b911b608e5a06b172407253414e80dcbf1c0b7 Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Sun, 21 Nov 2010 20:19:27 +0100 Subject: [PATCH] Use codecs module for proper output encoding. --- doc/changelog.txt | 2 ++ linkcheck/logger/__init__.py | 53 +++++++++++++---------------------- linkcheck/logger/blacklist.py | 8 +++--- linkcheck/logger/dot.py | 2 +- linkcheck/logger/html.py | 2 +- 5 files changed, 28 insertions(+), 39 deletions(-) diff --git a/doc/changelog.txt b/doc/changelog.txt index 96488a88..24ae43a2 100644 --- a/doc/changelog.txt +++ b/doc/changelog.txt @@ -11,6 +11,8 @@ Changes: --no-proxy-for. - config: Remove backwards compatilibity parsing and require the new multiline configuration syntax. +- logging: Use codecs module for proper output encoding. + Closes: SF bug #3114624 Features: - gui: Store column widths in registry settings. diff --git a/linkcheck/logger/__init__.py b/linkcheck/logger/__init__.py index a9a3bb61..3117a16e 100644 --- a/linkcheck/logger/__init__.py +++ b/linkcheck/logger/__init__.py @@ -22,8 +22,9 @@ import sys import os import datetime import time +import codecs from ..decorators import notimplemented -from .. import log, LOG_CHECK, strformat, dummy, configuration +from .. import log, LOG_CHECK, strformat, dummy, configuration, i18n _ = lambda x: x Fields = dict( @@ -88,8 +89,21 @@ class Logger (object): self.warnings = 0 # number of warnings that were printed self.warnings_printed = 0 - # encoding of output (default is utf-8) - self.output_encoding = args.get("encoding", "utf-8") + # encoding of output + encoding = args.get("encoding", i18n.default_encoding) + try: + encoding = codecs.lookup(encoding).name + except LookupError: + encoding = i18n.default_encoding + self.output_encoding = encoding + # how to handle codec errors + self.codec_errors = "replace" + + def get_charset_encoding (self): + """Translate the output encoding to a charset encoding name.""" + if self.output_encoding == "utf-8-sig": + return "utf-8" + return self.output_encoding def init_fileoutput (self, args): """ @@ -113,7 +127,8 @@ class Logger (object): try: if path and not os.path.isdir(path): os.makedirs(path) - self.fd = file(self.filename, "wb") + self.fd = codecs.open(self.filename, "wb", self.output_encoding, + self.codec_errors) self.close_fd = True except IOError: msg = sys.exc_info()[1] @@ -133,34 +148,6 @@ class Logger (object): self.fd.close() self.fd = None - def encode (self, s): - """ - Encode string with configured output encoding. Wrong encoded - characters are replaced. - - @param s: string to encode - @type s: unicode - @return: encoded string - @rtype: string - """ - if not isinstance(s, unicode): - raise ValueError("tried to encode non-unicode string %r" % s) - return s.encode(self.output_encoding, "replace") - - def decode (self, s): - """ - Decode string with configured output encoding. Wrong decoded - characters are replaced. - - @param s: string to decode - @type s: string - @return: encoded string - @rtype: unicode - """ - if isinstance(s, unicode): - return s - return s.decode(self.output_encoding, "replace") - def check_date (self): """ Check for special dates. @@ -200,7 +187,7 @@ class Logger (object): log.warn(LOG_CHECK, "writing to unitialized or closed file") else: - self.fd.write(self.encode(s), **args) + self.fd.write(s, **args) def writeln (self, s=u"", **args): """ diff --git a/linkcheck/logger/blacklist.py b/linkcheck/logger/blacklist.py index 555e471c..4d1523ac 100644 --- a/linkcheck/logger/blacklist.py +++ b/linkcheck/logger/blacklist.py @@ -19,8 +19,8 @@ A blacklist logger. """ import os +import codecs from . import Logger -from .. import i18n class BlacklistLogger (Logger): @@ -35,7 +35,6 @@ class BlacklistLogger (Logger): Intialize with old blacklist data (if found, else not). """ super(BlacklistLogger, self).__init__(**args) - self.output_encoding = args.get("encoding", i18n.default_encoding) self.init_fileoutput(args) self.blacklist = {} if self.filename is not None and os.path.exists(self.filename): @@ -72,9 +71,10 @@ class BlacklistLogger (Logger): """ Read a previously stored blacklist from file fd. """ - with open(self.filename) as fd: + with codecs.open(self.filename, 'r', self.output_encoding, + self.codec_errors) as fd: for line in fd: - line = self.decode(line.rstrip()) + line = line.rstrip() if line.startswith('#') or not line: continue value, key = line.split(None, 1) diff --git a/linkcheck/logger/dot.py b/linkcheck/logger/dot.py index 02da2bdb..9d24e6d3 100644 --- a/linkcheck/logger/dot.py +++ b/linkcheck/logger/dot.py @@ -34,7 +34,7 @@ class DOTLogger (GraphLogger): self.writeln() self.writeln(u"digraph G {") self.writeln(u" graph [") - self.writeln(u" charset=\"%s\"," % self.output_encoding) + self.writeln(u" charset=\"%s\"," % self.get_charset_encoding()) self.writeln(u" ];") self.flush() diff --git a/linkcheck/logger/html.py b/linkcheck/logger/html.py index 539389f5..3b8eaaab 100644 --- a/linkcheck/logger/html.py +++ b/linkcheck/logger/html.py @@ -87,7 +87,7 @@ class HtmlLogger (Logger): """Write start of checking info.""" super(HtmlLogger, self).start_output() header = { - "encoding": self.output_encoding, + "encoding": self.get_charset_encoding(), "title": configuration.App, "body": self.colorbackground, "link": self.colorlink,