mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-08 14:44:46 +00:00
safe decoding of non-unicode strings
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2322 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
8442a98508
commit
703da9eb65
6 changed files with 45 additions and 27 deletions
|
|
@ -31,6 +31,7 @@ import nntplib
|
|||
import ftplib
|
||||
|
||||
import linkcheck.httplib2
|
||||
import linkcheck.strformat
|
||||
import linkcheck.dns.exception
|
||||
|
||||
|
||||
|
|
@ -177,7 +178,7 @@ def set_intern_url (url, klass, config):
|
|||
elif klass in [linkcheck.checker.httpurl.HttpUrl,
|
||||
linkcheck.checker.httpsurl.HttpsUrl,
|
||||
linkcheck.checker.ftpurl.FtpUrl]:
|
||||
domain = linkcheck.url.url_unicode_split(url)[1]
|
||||
domain = linkcheck.strformat.url_unicode_split(url)[1]
|
||||
domain, is_idn = linkcheck.url.idna_encode(domain)
|
||||
if domain:
|
||||
domain = "://%s" % re.escape(domain)
|
||||
|
|
@ -234,15 +235,12 @@ def get_url_from (base_url, recursion_level, consumer,
|
|||
@param cmdline: flag if url was given on command line
|
||||
@type cmdline: bool
|
||||
"""
|
||||
default_encoding = "iso8859-15"
|
||||
if not isinstance(base_url, unicode):
|
||||
base_url = unicode(base_url, default_encoding, "ignore")
|
||||
if parent_url is not None and not isinstance(parent_url, unicode):
|
||||
parent_url = unicode(parent_url, default_encoding, "ignore")
|
||||
if base_ref is not None and not isinstance(base_ref, unicode):
|
||||
base_ref = unicode(base_ref, default_encoding, "ignore")
|
||||
if not isinstance(name, unicode):
|
||||
name = unicode(name, default_encoding, "ignore")
|
||||
base_url = linkcheck.strformat.unicode_safe(base_url)
|
||||
if parent_url is not None:
|
||||
parent_url = linkcheck.strformat.unicode_safe(parent_url)
|
||||
if base_ref is not None:
|
||||
base_ref = linkcheck.strformat.unicode_safe(base_ref)
|
||||
name = linkcheck.strformat.unicode_safe(name)
|
||||
#if cmdline and linkcheck.url.url_needs_quoting(base_url):
|
||||
# base_url = linkcheck.url.url_quote(base_url)
|
||||
url = absolute_url(base_url, base_ref, parent_url)
|
||||
|
|
|
|||
|
|
@ -113,7 +113,7 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
raise linkcheck.LinkCheckerError, \
|
||||
_("Got no answer from FTP server")
|
||||
# don't set info anymore, this may change every time we log in
|
||||
#self.add_info(unicode(info))
|
||||
#self.add_info(linkcheck.strformat.unicode_safe(info))
|
||||
|
||||
def cwd (self):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@ import Cookie
|
|||
|
||||
import linkcheck
|
||||
import linkcheck.url
|
||||
import linkcheck.strformat
|
||||
import linkcheck.robotparser2
|
||||
import linkcheck.httplib2
|
||||
import urlbase
|
||||
|
|
@ -221,6 +222,9 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
continue
|
||||
raise
|
||||
self.headers = response.msg
|
||||
if response.reason:
|
||||
response.reason = \
|
||||
linkcheck.strformat.unicode_safe(response.reason)
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "Response: %s %s",
|
||||
response.status, response.reason)
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "Headers: %s",
|
||||
|
|
@ -313,8 +317,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
self.headers.getheader("Uri", ""))
|
||||
# make new url absolute and unicode
|
||||
newurl = urlparse.urljoin(redirected, newurl)
|
||||
if not isinstance(newurl, unicode):
|
||||
newurl = unicode(newurl, "iso8859-1", "ignore")
|
||||
newurl = linkcheck.strformat.unicode_safe(newurl)
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "Redirected to %r",
|
||||
newurl)
|
||||
self.add_info(_("Redirected to %(url)s.") % {'url': newurl})
|
||||
|
|
@ -346,7 +349,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
# remember redirected url as alias
|
||||
self.aliases.append(redirected)
|
||||
# note: urlparts has to be a list
|
||||
self.urlparts = linkcheck.url.url_unicode_split(redirected)
|
||||
self.urlparts = linkcheck.strformat.url_unicode_split(redirected)
|
||||
if response.status == 301:
|
||||
if not self.has301status:
|
||||
self.add_warning(
|
||||
|
|
@ -405,7 +408,8 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
" anchor from request.") % server)
|
||||
if response.status == 204:
|
||||
# no content
|
||||
self.add_warning(unicode(response.reason))
|
||||
self.add_warning(
|
||||
linkcheck.strformat.unicode_safe(response.reason))
|
||||
# store cookies for valid links
|
||||
if self.consumer.config['cookies']:
|
||||
for c in self.cookies:
|
||||
|
|
@ -414,7 +418,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
out = self.consumer.cache.store_cookies(self.headers,
|
||||
self.urlparts[1])
|
||||
for h in out:
|
||||
self.add_info(unicode(h))
|
||||
self.add_info(linkcheck.strformat.unicode_safe(h))
|
||||
except Cookie.CookieError, msg:
|
||||
self.add_warning(_("Could not store cookies: %(msg)s.") %
|
||||
{'msg': str(msg)})
|
||||
|
|
|
|||
|
|
@ -271,7 +271,8 @@ class UrlBase (object):
|
|||
try:
|
||||
self.build_url()
|
||||
except linkcheck.LinkCheckerError, msg:
|
||||
self.set_result(unicode(str(msg)), valid=False)
|
||||
self.set_result(linkcheck.strformat.unicode_safe(msg),
|
||||
valid=False)
|
||||
return False
|
||||
self.set_cache_keys()
|
||||
self.extern = self._get_extern(self.url)
|
||||
|
|
@ -305,7 +306,7 @@ class UrlBase (object):
|
|||
else:
|
||||
self.url = base_url
|
||||
# split into (modifiable) list
|
||||
self.urlparts = linkcheck.url.url_unicode_split(self.url)
|
||||
self.urlparts = linkcheck.strformat.url_unicode_split(self.url)
|
||||
# and unsplit again
|
||||
self.url = urlparse.urlunsplit(self.urlparts)
|
||||
# check userinfo@host:port syntax
|
||||
|
|
@ -375,7 +376,8 @@ class UrlBase (object):
|
|||
# make nicer error msg for bad status line
|
||||
if isinstance(evalue, linkcheck.httplib2.BadStatusLine):
|
||||
evalue = _('Bad HTTP response %r') % str(evalue)
|
||||
self.set_result(unicode(str(evalue)), valid=False)
|
||||
self.set_result(linkcheck.strformat.unicode_safe(evalue),
|
||||
valid=False)
|
||||
|
||||
# check content
|
||||
warningregex = self.consumer.config["warningregex"]
|
||||
|
|
@ -387,7 +389,8 @@ class UrlBase (object):
|
|||
value, tb = sys.exc_info()[1:]
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "exception %s",
|
||||
traceback.format_tb(tb))
|
||||
self.set_result(unicode(str(value)), valid=False)
|
||||
self.set_result(linkcheck.strformat.unicode_safe(value),
|
||||
valid=False)
|
||||
|
||||
self.checktime = time.time() - t
|
||||
# check recursion
|
||||
|
|
@ -686,7 +689,7 @@ class UrlBase (object):
|
|||
"""
|
||||
Return serialized url check data as unicode string.
|
||||
"""
|
||||
sep = unicode(os.linesep)
|
||||
sep = linkcheck.strformat.unicode_safe(os.linesep)
|
||||
assert isinstance(self.base_url, unicode), self
|
||||
if self.parent_url is not None:
|
||||
assert isinstance(self.parent_url, unicode), self
|
||||
|
|
|
|||
|
|
@ -24,6 +24,25 @@ import textwrap
|
|||
import sys
|
||||
import os
|
||||
import time
|
||||
import urlparse
|
||||
|
||||
|
||||
def unicode_safe (s, encoding="iso-8859-1"):
    """
    Convert s to a unicode object without ever raising a decode error;
    characters that are invalid in the given encoding are dropped.

    @param s: the string to convert (must not be None)
    @param encoding: codec used when s is a byte string
    @return: s unchanged if already unicode, else the decoded result
    """
    assert s is not None, "argument to unicode_safe was None"
    # Already-decoded input is passed through untouched.
    if not isinstance(s, unicode):
        s = unicode(s, encoding, "ignore")
    return s
|
||||
|
||||
|
||||
def url_unicode_split (url):
    """
    Split url with urlparse.urlsplit(), converting every resulting part
    to unicode via unicode_safe() so callers never see raw byte strings.

    @param url: the URL to split
    @return: list of unicode url parts
    """
    parts = urlparse.urlsplit(url)
    return [unicode_safe(part) for part in parts]
|
||||
|
||||
|
||||
def unquote (s):
|
||||
|
|
|
|||
|
|
@ -444,9 +444,3 @@ def url_split (url):
|
|||
host, port = urllib.splitnport(host, port)
|
||||
return scheme, host, port, document
|
||||
|
||||
|
||||
def url_unicode_split (url):
    """
    Like urlparse.urlsplit(), but always returning unicode parts.

    Byte-string parts are decoded as ISO-8859-1 with undecodable
    characters ignored, so this never raises UnicodeDecodeError.
    (Plain unicode(s) would use the default ASCII codec and crash on
    any non-ASCII byte.)

    @param url: the URL to split
    @return: list of unicode url parts
    """
    return [s if isinstance(s, unicode)
            else unicode(s, "iso-8859-1", "ignore")
            for s in urlparse.urlsplit(url)]
|
||||
|
|
|
|||
Loading…
Reference in a new issue