From 703da9eb65414ca17966b244d40d9295317049a0 Mon Sep 17 00:00:00 2001
From: calvin <calvin@e7d03fd6-7b0d-0410-9947-9c21f3af8025>
Date: Fri, 18 Feb 2005 11:22:52 +0000
Subject: [PATCH] safe decoding of non-unicode strings

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2322 e7d03fd6-7b0d-0410-9947-9c21f3af8025
---
 linkcheck/checker/__init__.py | 18 ++++++++----------
 linkcheck/checker/ftpurl.py   |  2 +-
 linkcheck/checker/httpurl.py  | 14 +++++++++-----
 linkcheck/checker/urlbase.py  | 13 ++++++++-----
 linkcheck/strformat.py        | 19 +++++++++++++++++++
 linkcheck/url.py              |  6 ------
 6 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/linkcheck/checker/__init__.py b/linkcheck/checker/__init__.py
index 4dbcad63..800dc55a 100644
--- a/linkcheck/checker/__init__.py
+++ b/linkcheck/checker/__init__.py
@@ -31,6 +31,7 @@ import nntplib
 import ftplib
 
 import linkcheck.httplib2
+import linkcheck.strformat
 import linkcheck.dns.exception
 
 
@@ -177,7 +178,7 @@ def set_intern_url (url, klass, config):
     elif klass in [linkcheck.checker.httpurl.HttpUrl,
                    linkcheck.checker.httpsurl.HttpsUrl,
                    linkcheck.checker.ftpurl.FtpUrl]:
-        domain = linkcheck.url.url_unicode_split(url)[1]
+        domain = linkcheck.strformat.url_unicode_split(url)[1]
         domain, is_idn = linkcheck.url.idna_encode(domain)
         if domain:
             domain = "://%s" % re.escape(domain)
@@ -234,15 +235,12 @@ def get_url_from (base_url, recursion_level, consumer,
     @param cmdline: flag if url was given on command line
     @type cmdline: bool
     """
-    default_encoding = "iso8859-15"
-    if not isinstance(base_url, unicode):
-        base_url = unicode(base_url, default_encoding, "ignore")
-    if parent_url is not None and not isinstance(parent_url, unicode):
-        parent_url = unicode(parent_url, default_encoding, "ignore")
-    if base_ref is not None and not isinstance(base_ref, unicode):
-        base_ref = unicode(base_ref, default_encoding, "ignore")
-    if not isinstance(name, unicode):
-        name = unicode(name, default_encoding, "ignore")
+    base_url = linkcheck.strformat.unicode_safe(base_url)
+    if parent_url is not None:
+        parent_url = linkcheck.strformat.unicode_safe(parent_url)
+    if base_ref is not None:
+        base_ref = linkcheck.strformat.unicode_safe(base_ref)
+    name = linkcheck.strformat.unicode_safe(name)
     #if cmdline and linkcheck.url.url_needs_quoting(base_url):
     #    base_url = linkcheck.url.url_quote(base_url)
     url = absolute_url(base_url, base_ref, parent_url)
diff --git a/linkcheck/checker/ftpurl.py b/linkcheck/checker/ftpurl.py
index 8ff813f1..ee52b070 100644
--- a/linkcheck/checker/ftpurl.py
+++ b/linkcheck/checker/ftpurl.py
@@ -113,7 +113,7 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
             raise linkcheck.LinkCheckerError, \
                    _("Got no answer from FTP server")
         # don't set info anymore, this may change every time we log in
-        #self.add_info(unicode(info))
+        #self.add_info(linkcheck.strformat.unicode_safe(info))
 
     def cwd (self):
         """
diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py
index 2b3cf5c7..529d91a6 100644
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -29,6 +29,7 @@ import Cookie
 
 import linkcheck
 import linkcheck.url
+import linkcheck.strformat
 import linkcheck.robotparser2
 import linkcheck.httplib2
 import urlbase
@@ -221,6 +222,9 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
                     continue
                 raise
             self.headers = response.msg
+            if response.reason:
+                response.reason = \
+                        linkcheck.strformat.unicode_safe(response.reason)
             linkcheck.log.debug(linkcheck.LOG_CHECK, "Response: %s %s",
                                 response.status, response.reason)
             linkcheck.log.debug(linkcheck.LOG_CHECK, "Headers: %s",
@@ -313,8 +317,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
                          self.headers.getheader("Uri", ""))
             # make new url absolute and unicode
             newurl = urlparse.urljoin(redirected, newurl)
-            if not isinstance(newurl, unicode):
-                newurl = unicode(newurl, "iso8859-1", "ignore")
+            newurl = linkcheck.strformat.unicode_safe(newurl)
             linkcheck.log.debug(linkcheck.LOG_CHECK, "Redirected to %r",
                                 newurl)
             self.add_info(_("Redirected to %(url)s.") % {'url': newurl})
@@ -346,7 +349,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
             # remember redireced url as alias
             self.aliases.append(redirected)
             # note: urlparts has to be a list
-            self.urlparts = linkcheck.url.url_unicode_split(redirected)
+            self.urlparts = linkcheck.strformat.url_unicode_split(redirected)
             if response.status == 301:
                 if not self.has301status:
                     self.add_warning(
@@ -405,7 +408,8 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
                                    " anchor from request.") % server)
             if response.status == 204:
                 # no content
-                self.add_warning(unicode(response.reason))
+                self.add_warning(
+                            linkcheck.strformat.unicode_safe(response.reason))
             # store cookies for valid links
             if self.consumer.config['cookies']:
                 for c in self.cookies:
@@ -414,7 +418,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
                     out = self.consumer.cache.store_cookies(self.headers,
                                                             self.urlparts[1])
                     for h in out:
-                        self.add_info(unicode(h))
+                        self.add_info(linkcheck.strformat.unicode_safe(h))
                 except Cookie.CookieError, msg:
                     self.add_warning(_("Could not store cookies: %(msg)s.") %
                                      {'msg': str(msg)})
diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py
index 3b8979e9..f8dd6c73 100644
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@@ -271,7 +271,8 @@ class UrlBase (object):
         try:
             self.build_url()
         except linkcheck.LinkCheckerError, msg:
-            self.set_result(unicode(str(msg)), valid=False)
+            self.set_result(linkcheck.strformat.unicode_safe(msg),
+                            valid=False)
             return False
         self.set_cache_keys()
         self.extern = self._get_extern(self.url)
@@ -305,7 +306,7 @@ class UrlBase (object):
         else:
             self.url = base_url
         # split into (modifiable) list
-        self.urlparts = linkcheck.url.url_unicode_split(self.url)
+        self.urlparts = linkcheck.strformat.url_unicode_split(self.url)
         # and unsplit again
         self.url = urlparse.urlunsplit(self.urlparts)
         # check userinfo@host:port syntax
@@ -375,7 +376,8 @@ class UrlBase (object):
             # make nicer error msg for bad status line
             if isinstance(evalue, linkcheck.httplib2.BadStatusLine):
                 evalue = _('Bad HTTP response %r') % str(evalue)
-            self.set_result(unicode(str(evalue)), valid=False)
+            self.set_result(linkcheck.strformat.unicode_safe(evalue),
+                            valid=False)
 
         # check content
         warningregex = self.consumer.config["warningregex"]
@@ -387,7 +389,8 @@ class UrlBase (object):
                 value, tb = sys.exc_info()[1:]
                 linkcheck.log.debug(linkcheck.LOG_CHECK, "exception %s",
                                     traceback.format_tb(tb))
-                self.set_result(unicode(str(value)), valid=False)
+                self.set_result(linkcheck.strformat.unicode_safe(value),
+                                valid=False)
 
         self.checktime = time.time() - t
         # check recursion
@@ -686,7 +689,7 @@ class UrlBase (object):
         """
         Return serialized url check data as unicode string.
         """
-        sep = unicode(os.linesep)
+        sep = linkcheck.strformat.unicode_safe(os.linesep)
         assert isinstance(self.base_url, unicode), self
         if self.parent_url is not None:
             assert isinstance(self.parent_url, unicode), self
diff --git a/linkcheck/strformat.py b/linkcheck/strformat.py
index 770d8da6..fecc84f9 100644
--- a/linkcheck/strformat.py
+++ b/linkcheck/strformat.py
@@ -24,6 +24,25 @@ import textwrap
 import sys
 import os
 import time
+import urlparse
+
+
+def unicode_safe (s, encoding="iso-8859-1"):
+    """
+    Return s as unicode without raising decoding errors; undecodable
+    bytes are ignored. Non-strings (e.g. exceptions) go through str().
+    """
+    assert s is not None, "argument to unicode_safe was None"
+    if isinstance(s, unicode):
+        return s
+    return unicode(str(s), encoding, "ignore")
+
+
+def url_unicode_split (url):
+    """
+    Split url via urlparse.urlsplit(); return a list of unicode parts.
+    """
+    return map(unicode_safe, urlparse.urlsplit(url))
 
 
 def unquote (s):
diff --git a/linkcheck/url.py b/linkcheck/url.py
index 702381a9..ae4e3bf0 100644
--- a/linkcheck/url.py
+++ b/linkcheck/url.py
@@ -444,9 +444,3 @@ def url_split (url):
         host, port = urllib.splitnport(host, port)
     return scheme, host, port, document
 
-
-def url_unicode_split (url):
-    """
-    Like urlparse.urlsplit(), but always returning unicode parts.
-    """
-    return [unicode(s) for s in urlparse.urlsplit(url)]