safe decoding of non-unicode strings

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2322 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2005-02-18 11:22:52 +00:00
parent 8442a98508
commit 703da9eb65
6 changed files with 45 additions and 27 deletions

View file

@@ -31,6 +31,7 @@ import nntplib
import ftplib
import linkcheck.httplib2
import linkcheck.strformat
import linkcheck.dns.exception
@@ -177,7 +178,7 @@ def set_intern_url (url, klass, config):
elif klass in [linkcheck.checker.httpurl.HttpUrl,
linkcheck.checker.httpsurl.HttpsUrl,
linkcheck.checker.ftpurl.FtpUrl]:
domain = linkcheck.url.url_unicode_split(url)[1]
domain = linkcheck.strformat.url_unicode_split(url)[1]
domain, is_idn = linkcheck.url.idna_encode(domain)
if domain:
domain = "://%s" % re.escape(domain)
@@ -234,15 +235,12 @@ def get_url_from (base_url, recursion_level, consumer,
@param cmdline: flag if url was given on command line
@type cmdline: bool
"""
default_encoding = "iso8859-15"
if not isinstance(base_url, unicode):
base_url = unicode(base_url, default_encoding, "ignore")
if parent_url is not None and not isinstance(parent_url, unicode):
parent_url = unicode(parent_url, default_encoding, "ignore")
if base_ref is not None and not isinstance(base_ref, unicode):
base_ref = unicode(base_ref, default_encoding, "ignore")
if not isinstance(name, unicode):
name = unicode(name, default_encoding, "ignore")
base_url = linkcheck.strformat.unicode_safe(base_url)
if parent_url is not None:
parent_url = linkcheck.strformat.unicode_safe(parent_url)
if base_ref is not None:
base_ref = linkcheck.strformat.unicode_safe(base_ref)
name = linkcheck.strformat.unicode_safe(name)
#if cmdline and linkcheck.url.url_needs_quoting(base_url):
# base_url = linkcheck.url.url_quote(base_url)
url = absolute_url(base_url, base_ref, parent_url)

View file

@@ -113,7 +113,7 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
raise linkcheck.LinkCheckerError, \
_("Got no answer from FTP server")
# don't set info anymore, this may change every time we log in
#self.add_info(unicode(info))
#self.add_info(linkcheck.strformat.unicode_safe(info))
def cwd (self):
"""

View file

@@ -29,6 +29,7 @@ import Cookie
import linkcheck
import linkcheck.url
import linkcheck.strformat
import linkcheck.robotparser2
import linkcheck.httplib2
import urlbase
@@ -221,6 +222,9 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
continue
raise
self.headers = response.msg
if response.reason:
response.reason = \
linkcheck.strformat.unicode_safe(response.reason)
linkcheck.log.debug(linkcheck.LOG_CHECK, "Response: %s %s",
response.status, response.reason)
linkcheck.log.debug(linkcheck.LOG_CHECK, "Headers: %s",
@@ -313,8 +317,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
self.headers.getheader("Uri", ""))
# make new url absolute and unicode
newurl = urlparse.urljoin(redirected, newurl)
if not isinstance(newurl, unicode):
newurl = unicode(newurl, "iso8859-1", "ignore")
newurl = linkcheck.strformat.unicode_safe(newurl)
linkcheck.log.debug(linkcheck.LOG_CHECK, "Redirected to %r",
newurl)
self.add_info(_("Redirected to %(url)s.") % {'url': newurl})
@@ -346,7 +349,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
# remember redireced url as alias
self.aliases.append(redirected)
# note: urlparts has to be a list
self.urlparts = linkcheck.url.url_unicode_split(redirected)
self.urlparts = linkcheck.strformat.url_unicode_split(redirected)
if response.status == 301:
if not self.has301status:
self.add_warning(
@@ -405,7 +408,8 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
" anchor from request.") % server)
if response.status == 204:
# no content
self.add_warning(unicode(response.reason))
self.add_warning(
linkcheck.strformat.unicode_safe(response.reason))
# store cookies for valid links
if self.consumer.config['cookies']:
for c in self.cookies:
@@ -414,7 +418,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
out = self.consumer.cache.store_cookies(self.headers,
self.urlparts[1])
for h in out:
self.add_info(unicode(h))
self.add_info(linkcheck.strformat.unicode_safe(h))
except Cookie.CookieError, msg:
self.add_warning(_("Could not store cookies: %(msg)s.") %
{'msg': str(msg)})

View file

@@ -271,7 +271,8 @@ class UrlBase (object):
try:
self.build_url()
except linkcheck.LinkCheckerError, msg:
self.set_result(unicode(str(msg)), valid=False)
self.set_result(linkcheck.strformat.unicode_safe(msg),
valid=False)
return False
self.set_cache_keys()
self.extern = self._get_extern(self.url)
@@ -305,7 +306,7 @@ class UrlBase (object):
else:
self.url = base_url
# split into (modifiable) list
self.urlparts = linkcheck.url.url_unicode_split(self.url)
self.urlparts = linkcheck.strformat.url_unicode_split(self.url)
# and unsplit again
self.url = urlparse.urlunsplit(self.urlparts)
# check userinfo@host:port syntax
@@ -375,7 +376,8 @@ class UrlBase (object):
# make nicer error msg for bad status line
if isinstance(evalue, linkcheck.httplib2.BadStatusLine):
evalue = _('Bad HTTP response %r') % str(evalue)
self.set_result(unicode(str(evalue)), valid=False)
self.set_result(linkcheck.strformat.unicode_safe(evalue),
valid=False)
# check content
warningregex = self.consumer.config["warningregex"]
@@ -387,7 +389,8 @@ class UrlBase (object):
value, tb = sys.exc_info()[1:]
linkcheck.log.debug(linkcheck.LOG_CHECK, "exception %s",
traceback.format_tb(tb))
self.set_result(unicode(str(value)), valid=False)
self.set_result(linkcheck.strformat.unicode_safe(value),
valid=False)
self.checktime = time.time() - t
# check recursion
@@ -686,7 +689,7 @@ class UrlBase (object):
"""
Return serialized url check data as unicode string.
"""
sep = unicode(os.linesep)
sep = linkcheck.strformat.unicode_safe(os.linesep)
assert isinstance(self.base_url, unicode), self
if self.parent_url is not None:
assert isinstance(self.parent_url, unicode), self

View file

@@ -24,6 +24,25 @@ import textwrap
import sys
import os
import time
import urlparse
def unicode_safe (s, encoding="iso-8859-1"):
    """
    Decode s into a unicode object without ever raising a
    UnicodeError; characters that are invalid in the given encoding
    are silently dropped.

    @param s: the string to decode (may already be unicode)
    @param encoding: codec used to decode plain (byte) strings
    @return: unicode version of s
    """
    assert s is not None, "argument to unicode_safe was None"
    if not isinstance(s, unicode):
        s = unicode(s, encoding, "ignore")
    return s
def url_unicode_split (url):
    """
    Split url into its parts as urlparse.urlsplit() does, but with
    every part converted to unicode via unicode_safe().

    @param url: the URL to split
    @return: list of unicode URL parts
    """
    return list(map(unicode_safe, urlparse.urlsplit(url)))
def unquote (s):

View file

@@ -444,9 +444,3 @@ def url_split (url):
host, port = urllib.splitnport(host, port)
return scheme, host, port, document
def url_unicode_split (url):
"""
Like urlparse.urlsplit(), but always returning unicode parts.
"""
return [unicode(s) for s in urlparse.urlsplit(url)]