From 703da9eb65414ca17966b244d40d9295317049a0 Mon Sep 17 00:00:00 2001 From: calvin Date: Fri, 18 Feb 2005 11:22:52 +0000 Subject: [PATCH] safe decoding of non-unicode strings git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2322 e7d03fd6-7b0d-0410-9947-9c21f3af8025 --- linkcheck/checker/__init__.py | 18 ++++++++---------- linkcheck/checker/ftpurl.py | 2 +- linkcheck/checker/httpurl.py | 14 +++++++++----- linkcheck/checker/urlbase.py | 13 ++++++++----- linkcheck/strformat.py | 19 +++++++++++++++++++ linkcheck/url.py | 6 ------ 6 files changed, 45 insertions(+), 27 deletions(-) diff --git a/linkcheck/checker/__init__.py b/linkcheck/checker/__init__.py index 4dbcad63..800dc55a 100644 --- a/linkcheck/checker/__init__.py +++ b/linkcheck/checker/__init__.py @@ -31,6 +31,7 @@ import nntplib import ftplib import linkcheck.httplib2 +import linkcheck.strformat import linkcheck.dns.exception @@ -177,7 +178,7 @@ def set_intern_url (url, klass, config): elif klass in [linkcheck.checker.httpurl.HttpUrl, linkcheck.checker.httpsurl.HttpsUrl, linkcheck.checker.ftpurl.FtpUrl]: - domain = linkcheck.url.url_unicode_split(url)[1] + domain = linkcheck.strformat.url_unicode_split(url)[1] domain, is_idn = linkcheck.url.idna_encode(domain) if domain: domain = "://%s" % re.escape(domain) @@ -234,15 +235,12 @@ def get_url_from (base_url, recursion_level, consumer, @param cmdline: flag if url was given on command line @type cmdline: bool """ - default_encoding = "iso8859-15" - if not isinstance(base_url, unicode): - base_url = unicode(base_url, default_encoding, "ignore") - if parent_url is not None and not isinstance(parent_url, unicode): - parent_url = unicode(parent_url, default_encoding, "ignore") - if base_ref is not None and not isinstance(base_ref, unicode): - base_ref = unicode(base_ref, default_encoding, "ignore") - if not isinstance(name, unicode): - name = unicode(name, default_encoding, "ignore") + base_url = linkcheck.strformat.unicode_safe(base_url) + if parent_url is not None: + parent_url = linkcheck.strformat.unicode_safe(parent_url) + if base_ref is not None: + base_ref = linkcheck.strformat.unicode_safe(base_ref) + name = linkcheck.strformat.unicode_safe(name) #if cmdline and linkcheck.url.url_needs_quoting(base_url): # base_url = linkcheck.url.url_quote(base_url) url = absolute_url(base_url, base_ref, parent_url) diff --git a/linkcheck/checker/ftpurl.py b/linkcheck/checker/ftpurl.py index 8ff813f1..ee52b070 100644 --- a/linkcheck/checker/ftpurl.py +++ b/linkcheck/checker/ftpurl.py @@ -113,7 +113,7 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport): raise linkcheck.LinkCheckerError, \ _("Got no answer from FTP server") # don't set info anymore, this may change every time we log in - #self.add_info(unicode(info)) + #self.add_info(linkcheck.strformat.unicode_safe(info)) def cwd (self): """ diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index 2b3cf5c7..529d91a6 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -29,6 +29,7 @@ import Cookie import linkcheck import linkcheck.url +import linkcheck.strformat import linkcheck.robotparser2 import linkcheck.httplib2 import urlbase @@ -221,6 +222,9 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport): continue raise self.headers = response.msg + if response.reason: + response.reason = \ + linkcheck.strformat.unicode_safe(response.reason) linkcheck.log.debug(linkcheck.LOG_CHECK, "Response: %s %s", response.status, response.reason) linkcheck.log.debug(linkcheck.LOG_CHECK, "Headers: %s", @@ -313,8 +317,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport): self.headers.getheader("Uri", "")) # make new url absolute and unicode newurl = urlparse.urljoin(redirected, newurl) - if not isinstance(newurl, unicode): - newurl = unicode(newurl, "iso8859-1", "ignore") + newurl = linkcheck.strformat.unicode_safe(newurl) linkcheck.log.debug(linkcheck.LOG_CHECK, "Redirected to %r", newurl) self.add_info(_("Redirected to %(url)s.") % {'url': newurl}) @@ -346,7 +349,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport): # remember redireced url as alias self.aliases.append(redirected) # note: urlparts has to be a list - self.urlparts = linkcheck.url.url_unicode_split(redirected) + self.urlparts = linkcheck.strformat.url_unicode_split(redirected) if response.status == 301: if not self.has301status: self.add_warning( @@ -405,7 +408,8 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport): " anchor from request.") % server) if response.status == 204: # no content - self.add_warning(unicode(response.reason)) + self.add_warning( + linkcheck.strformat.unicode_safe(response.reason)) # store cookies for valid links if self.consumer.config['cookies']: for c in self.cookies: @@ -414,7 +418,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport): out = self.consumer.cache.store_cookies(self.headers, self.urlparts[1]) for h in out: - self.add_info(unicode(h)) + self.add_info(linkcheck.strformat.unicode_safe(h)) except Cookie.CookieError, msg: self.add_warning(_("Could not store cookies: %(msg)s.") % {'msg': str(msg)}) diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index 3b8979e9..f8dd6c73 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -271,7 +271,8 @@ class UrlBase (object): try: self.build_url() except linkcheck.LinkCheckerError, msg: - self.set_result(unicode(str(msg)), valid=False) + self.set_result(linkcheck.strformat.unicode_safe(msg), + valid=False) return False self.set_cache_keys() self.extern = self._get_extern(self.url) @@ -305,7 +306,7 @@ class UrlBase (object): else: self.url = base_url # split into (modifiable) list - self.urlparts = linkcheck.url.url_unicode_split(self.url) + self.urlparts = linkcheck.strformat.url_unicode_split(self.url) # and unsplit again self.url = urlparse.urlunsplit(self.urlparts) # check userinfo@host:port syntax @@ -375,7 +376,8 @@ class UrlBase (object): # make nicer error msg for bad status line if isinstance(evalue, linkcheck.httplib2.BadStatusLine): evalue = _('Bad HTTP response %r') % str(evalue) - self.set_result(unicode(str(evalue)), valid=False) + self.set_result(linkcheck.strformat.unicode_safe(evalue), + valid=False) # check content warningregex = self.consumer.config["warningregex"] @@ -387,7 +389,8 @@ class UrlBase (object): value, tb = sys.exc_info()[1:] linkcheck.log.debug(linkcheck.LOG_CHECK, "exception %s", traceback.format_tb(tb)) - self.set_result(unicode(str(value)), valid=False) + self.set_result(linkcheck.strformat.unicode_safe(value), + valid=False) self.checktime = time.time() - t # check recursion @@ -686,7 +689,7 @@ class UrlBase (object): """ Return serialized url check data as unicode string. """ - sep = unicode(os.linesep) + sep = linkcheck.strformat.unicode_safe(os.linesep) assert isinstance(self.base_url, unicode), self if self.parent_url is not None: assert isinstance(self.parent_url, unicode), self diff --git a/linkcheck/strformat.py b/linkcheck/strformat.py index 770d8da6..fecc84f9 100644 --- a/linkcheck/strformat.py +++ b/linkcheck/strformat.py @@ -24,6 +24,25 @@ import textwrap import sys import os import time +import urlparse + + +def unicode_safe (s, encoding="iso-8859-1"): + """ + Return unicode string without raising encoding errors. Unknown + characters of the given encoding will be ignored. + """ + assert s is not None, "argument to unicode_safe was None" + if isinstance(s, unicode): + return s + return unicode(s, encoding, "ignore") + + +def url_unicode_split (url): + """ + Like urlparse.urlsplit(), but always returning unicode parts. + """ + return [unicode_safe(s) for s in urlparse.urlsplit(url)] def unquote (s): diff --git a/linkcheck/url.py b/linkcheck/url.py index 702381a9..ae4e3bf0 100644 --- a/linkcheck/url.py +++ b/linkcheck/url.py @@ -444,9 +444,3 @@ def url_split (url): host, port = urllib.splitnport(host, port) return scheme, host, port, document - -def url_unicode_split (url): - """ - Like urlparse.urlsplit(), but always returning unicode parts. - """ - return [unicode(s) for s in urlparse.urlsplit(url)]