safe decoding of non-unicode strings

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2322 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2005-02-18 11:22:52 +00:00
parent 8442a98508
commit 703da9eb65
6 changed files with 45 additions and 27 deletions

View file

@@ -31,6 +31,7 @@ import nntplib
import ftplib
import linkcheck.httplib2
import linkcheck.strformat
import linkcheck.dns.exception
@@ -177,7 +178,7 @@ def set_intern_url (url, klass, config):
elif klass in [linkcheck.checker.httpurl.HttpUrl,
linkcheck.checker.httpsurl.HttpsUrl,
linkcheck.checker.ftpurl.FtpUrl]:
domain = linkcheck.url.url_unicode_split(url)[1]
domain = linkcheck.strformat.url_unicode_split(url)[1]
domain, is_idn = linkcheck.url.idna_encode(domain)
if domain:
domain = "://%s" % re.escape(domain)
@@ -234,15 +235,12 @@ def get_url_from (base_url, recursion_level, consumer,
@param cmdline: flag if url was given on command line
@type cmdline: bool
"""
default_encoding = "iso8859-15"
if not isinstance(base_url, unicode):
base_url = unicode(base_url, default_encoding, "ignore")
if parent_url is not None and not isinstance(parent_url, unicode):
parent_url = unicode(parent_url, default_encoding, "ignore")
if base_ref is not None and not isinstance(base_ref, unicode):
base_ref = unicode(base_ref, default_encoding, "ignore")
if not isinstance(name, unicode):
name = unicode(name, default_encoding, "ignore")
base_url = linkcheck.strformat.unicode_safe(base_url)
if parent_url is not None:
parent_url = linkcheck.strformat.unicode_safe(parent_url)
if base_ref is not None:
base_ref = linkcheck.strformat.unicode_safe(base_ref)
name = linkcheck.strformat.unicode_safe(name)
#if cmdline and linkcheck.url.url_needs_quoting(base_url):
# base_url = linkcheck.url.url_quote(base_url)
url = absolute_url(base_url, base_ref, parent_url)

View file

@@ -113,7 +113,7 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
raise linkcheck.LinkCheckerError, \
_("Got no answer from FTP server")
# don't set info anymore, this may change every time we log in
#self.add_info(unicode(info))
#self.add_info(linkcheck.strformat.unicode_safe(info))
def cwd (self):
"""

View file

@@ -29,6 +29,7 @@ import Cookie
import linkcheck
import linkcheck.url
import linkcheck.strformat
import linkcheck.robotparser2
import linkcheck.httplib2
import urlbase
@@ -221,6 +222,9 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
continue
raise
self.headers = response.msg
if response.reason:
response.reason = \
linkcheck.strformat.unicode_safe(response.reason)
linkcheck.log.debug(linkcheck.LOG_CHECK, "Response: %s %s",
response.status, response.reason)
linkcheck.log.debug(linkcheck.LOG_CHECK, "Headers: %s",
@@ -313,8 +317,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
self.headers.getheader("Uri", ""))
# make new url absolute and unicode
newurl = urlparse.urljoin(redirected, newurl)
if not isinstance(newurl, unicode):
newurl = unicode(newurl, "iso8859-1", "ignore")
newurl = linkcheck.strformat.unicode_safe(newurl)
linkcheck.log.debug(linkcheck.LOG_CHECK, "Redirected to %r",
newurl)
self.add_info(_("Redirected to %(url)s.") % {'url': newurl})
@@ -346,7 +349,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
# remember redireced url as alias
self.aliases.append(redirected)
# note: urlparts has to be a list
self.urlparts = linkcheck.url.url_unicode_split(redirected)
self.urlparts = linkcheck.strformat.url_unicode_split(redirected)
if response.status == 301:
if not self.has301status:
self.add_warning(
@@ -405,7 +408,8 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
" anchor from request.") % server)
if response.status == 204:
# no content
self.add_warning(unicode(response.reason))
self.add_warning(
linkcheck.strformat.unicode_safe(response.reason))
# store cookies for valid links
if self.consumer.config['cookies']:
for c in self.cookies:
@@ -414,7 +418,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
out = self.consumer.cache.store_cookies(self.headers,
self.urlparts[1])
for h in out:
self.add_info(unicode(h))
self.add_info(linkcheck.strformat.unicode_safe(h))
except Cookie.CookieError, msg:
self.add_warning(_("Could not store cookies: %(msg)s.") %
{'msg': str(msg)})

View file

@@ -271,7 +271,8 @@ class UrlBase (object):
try:
self.build_url()
except linkcheck.LinkCheckerError, msg:
self.set_result(unicode(str(msg)), valid=False)
self.set_result(linkcheck.strformat.unicode_safe(msg),
valid=False)
return False
self.set_cache_keys()
self.extern = self._get_extern(self.url)
@@ -305,7 +306,7 @@ class UrlBase (object):
else:
self.url = base_url
# split into (modifiable) list
self.urlparts = linkcheck.url.url_unicode_split(self.url)
self.urlparts = linkcheck.strformat.url_unicode_split(self.url)
# and unsplit again
self.url = urlparse.urlunsplit(self.urlparts)
# check userinfo@host:port syntax
@@ -375,7 +376,8 @@ class UrlBase (object):
# make nicer error msg for bad status line
if isinstance(evalue, linkcheck.httplib2.BadStatusLine):
evalue = _('Bad HTTP response %r') % str(evalue)
self.set_result(unicode(str(evalue)), valid=False)
self.set_result(linkcheck.strformat.unicode_safe(evalue),
valid=False)
# check content
warningregex = self.consumer.config["warningregex"]
@@ -387,7 +389,8 @@ class UrlBase (object):
value, tb = sys.exc_info()[1:]
linkcheck.log.debug(linkcheck.LOG_CHECK, "exception %s",
traceback.format_tb(tb))
self.set_result(unicode(str(value)), valid=False)
self.set_result(linkcheck.strformat.unicode_safe(value),
valid=False)
self.checktime = time.time() - t
# check recursion
@@ -686,7 +689,7 @@ class UrlBase (object):
"""
Return serialized url check data as unicode string.
"""
sep = unicode(os.linesep)
sep = linkcheck.strformat.unicode_safe(os.linesep)
assert isinstance(self.base_url, unicode), self
if self.parent_url is not None:
assert isinstance(self.parent_url, unicode), self

View file

@@ -24,6 +24,25 @@ import textwrap
import sys
import os
import time
import urlparse
def unicode_safe (s, encoding="iso-8859-1"):
    """
    Decode s into a unicode object without ever raising a
    UnicodeError; characters that are invalid in the given encoding
    are silently dropped.

    @param s: the string to decode (may already be unicode)
    @param encoding: codec used to decode plain (byte) strings
    @return: unicode version of s
    """
    assert s is not None, "argument to unicode_safe was None"
    if not isinstance(s, unicode):
        s = unicode(s, encoding, "ignore")
    return s
def url_unicode_split (url):
    """
    Split url into its parts as urlparse.urlsplit() does, but with
    every part converted to unicode via unicode_safe().

    @param url: the URL to split
    @return: list of unicode URL parts
    """
    return list(map(unicode_safe, urlparse.urlsplit(url)))
def unquote (s):

View file

@@ -444,9 +444,3 @@ def url_split (url):
host, port = urllib.splitnport(host, port)
return scheme, host, port, document
def url_unicode_split (url):
"""
Like urlparse.urlsplit(), but always returning unicode parts.
"""
return [unicode(s) for s in urlparse.urlsplit(url)]