Add url encoding parameter

This commit is contained in:
Bastian Kleineidam 2009-11-28 11:56:35 +01:00
parent 84bcb84878
commit 77daf80e82
4 changed files with 68 additions and 51 deletions

View file

@ -89,7 +89,7 @@ class FileUrl (urlbase.UrlBase):
"""
def init (self, base_ref, base_url, parent_url, recursion_level,
aggregate, line, column, name):
aggregate, line, column, name, url_encoding):
"""
Besides the usual initialization the URL is normed according
to the platform:
@ -97,7 +97,7 @@ class FileUrl (urlbase.UrlBase):
- under Windows platform the drive specifier is normed
"""
super(FileUrl, self).init(base_ref, base_url, parent_url,
recursion_level, aggregate, line, column, name)
recursion_level, aggregate, line, column, name, url_encoding)
if self.base_url is None:
return
base_url = self.base_url
@ -112,12 +112,7 @@ class FileUrl (urlbase.UrlBase):
base_url = base_url.replace("\\", "/")
# transform c:/windows into /c|/windows
base_url = re.sub("^file://(/?)([a-zA-Z]):", r"file:///\2|", base_url)
# norm base url again after changing
if self.base_url != base_url:
base_url, is_idn = urlbase.url_norm(base_url)
if is_idn:
pass # XXX warn about idn use
self.base_url = unicode(base_url)
self.base_url = unicode(base_url)
def build_url (self):
"""

View file

@ -53,12 +53,11 @@ def urljoin (parent, url, scheme):
return urlparse.urljoin(parent, url)
def url_norm (url):
"""
Wrapper for url.url_norm() to convert UnicodeError in LinkCheckerError.
"""
def url_norm (url, encoding=None):
"""Wrapper for url.url_norm() to convert UnicodeError in
LinkCheckerError."""
try:
return urlutil.url_norm(url)
return urlutil.url_norm(url, encoding=encoding)
except UnicodeError:
msg = _("URL has unparsable domain name: %(name)s") % \
{"name": sys.exc_info()[1]}
@ -69,8 +68,8 @@ class UrlBase (object):
"""An URL with additional information like validity etc."""
def __init__ (self, base_url, recursion_level, aggregate,
parent_url = None, base_ref = None,
line = -1, column = -1, name = u""):
parent_url=None, base_ref=None, line=-1, column=-1,
name=u"", url_encoding=None):
"""
Initialize check data, and store given variables.
@ -82,15 +81,16 @@ class UrlBase (object):
@param line: line number of url in parent content
@param column: column number of url in parent content
@param name: name of url or empty
@param url_encoding: encoding of URL or None
"""
self.init(base_ref, base_url, parent_url, recursion_level,
aggregate, line, column, name)
aggregate, line, column, name, url_encoding)
self.reset()
self.check_syntax()
def init (self, base_ref, base_url, parent_url, recursion_level,
aggregate, line, column, name):
aggregate, line, column, name, url_encoding):
"""
Initialize internal data.
"""
@ -103,6 +103,7 @@ class UrlBase (object):
self.line = line
self.column = column
self.name = name
self.encoding = url_encoding
if self.base_ref:
assert not urlutil.url_needs_quoting(self.base_ref), \
"unquoted base reference URL %r" % self.base_ref
@ -338,7 +339,7 @@ class UrlBase (object):
url information self.base_url, self.parent_url and self.base_ref.
"""
# norm base url - can raise UnicodeError from url.idna_encode()
base_url, is_idn = url_norm(self.base_url)
base_url, is_idn = url_norm(self.base_url, self.encoding)
if is_idn:
self.add_warning(_("""URL %(url)r has a unicode domain name which
is not yet widely supported. You should use

View file

@ -25,6 +25,15 @@ import urllib
urlparse.uses_netloc.extend(('ldap', 'irc'))
# The character set to encode non-ASCII characters in a URL. See also
# http://tools.ietf.org/html/rfc2396#section-2.1
# Note that the encoding is not really specified, but most browsers
# encode in UTF-8 when no encoding is specified by the HTTP headers,
# else they use the page encoding for followed links. See also
# http://code.google.com/p/browsersec/wiki/Part1#Unicode_in_URLs
url_encoding = "utf-8"
# constants defining url part indexes
SCHEME = 0
HOSTNAME = DOMAIN = 1
@ -230,10 +239,12 @@ def url_fix_mailto_urlsplit (urlparts):
urlparts[2], urlparts[3] = urlparts[2].split('?', 1)
def url_parse_query (query):
def url_parse_query (query, encoding=None):
"""Parse and re-join the given CGI query."""
if isinstance(query, unicode):
query = query.encode('iso8859-1', 'ignore')
if encoding is None:
encoding = url_encoding
query = query.encode(encoding, 'ignore')
# if ? is in the query, split it off, seen at msdn.microsoft.com
if '?' in query:
query, append = query.split('?', 1)
@ -254,13 +265,20 @@ def url_parse_query (query):
return ''.join(l) + append
def url_norm (url):
def url_norm (url, encoding=None):
"""Normalize the given URL which must be quoted. Supports unicode
hostnames (IDNA encoding) according to RFC 3490.
@return: (normed url, idna flag)
@rtype: tuple of length two
"""
if isinstance(url, unicode):
# try to decode the URL to ascii since urllib.unquote()
# handles non-unicode strings differently
try:
url = url.encode('ascii')
except UnicodeEncodeError:
pass
urlparts = list(urlparse.urlsplit(url))
# scheme
urlparts[0] = urllib.unquote(urlparts[0]).lower()
@ -270,7 +288,7 @@ def url_norm (url):
# host (with path or query side effects)
is_idn = url_fix_host(urlparts)
# query
urlparts[3] = url_parse_query(urlparts[3])
urlparts[3] = url_parse_query(urlparts[3], encoding=encoding)
is_hierarchical = urlparts[0] not in urlparse.non_hierarchical
if is_hierarchical:
# URL has a hierarchical path we should norm
@ -286,10 +304,10 @@ def url_norm (url):
# anchor
urlparts[4] = urllib.unquote(urlparts[4])
# quote parts again
urlparts[0] = url_quote_part(urlparts[0]) # scheme
urlparts[1] = url_quote_part(urlparts[1], '@:') # host
urlparts[2] = url_quote_part(urlparts[2], _nopathquote_chars) # path
urlparts[4] = url_quote_part(urlparts[4]) # anchor
urlparts[0] = url_quote_part(urlparts[0], encoding=encoding) # scheme
urlparts[1] = url_quote_part(urlparts[1], safechars='@:', encoding=encoding) # host
urlparts[2] = url_quote_part(urlparts[2], safechars=_nopathquote_chars, encoding=encoding) # path
urlparts[4] = url_quote_part(urlparts[4], encoding=encoding) # anchor
res = urlparse.urlunsplit(urlparts)
if url.endswith('#') and not urlparts[4]:
# re-append trailing empty fragment
@ -362,12 +380,13 @@ def url_quote (url):
return urlparse.urlunsplit(urlparts)
def url_quote_part (s, safechars='/'):
def url_quote_part (s, safechars='/', encoding=None):
"""Wrap urllib.quote() to support unicode strings. A unicode string
is first converted to ISO-8859-1, invalid characters are ignored.
After that urllib.quote() is called."""
is first converted to UTF-8. After that urllib.quote() is called."""
if isinstance(s, unicode):
s = s.encode("iso-8859-1", "ignore")
if encoding is None:
encoding = url_encoding
s = s.encode(encoding, 'ignore')
return urllib.quote(s, safechars)
def document_quote (document):

View file

@ -37,29 +37,20 @@ import linkcheck.url
# (Latin capital letter C + Combining cedilla U+0327)
def url_norm (url):
return linkcheck.url.url_norm(url)[0]
def url_norm (url, encoding=None):
return linkcheck.url.url_norm(url, encoding=encoding)[0]
class TestUrl (unittest.TestCase):
"""Test url norming and quoting."""
def urlnormtest (self, url, nurl):
self.assertFalse(linkcheck.url.url_needs_quoting(nurl))
nurl1 = url_norm(url)
self.assertFalse(linkcheck.url.url_needs_quoting(nurl1))
def urlnormtest (self, url, nurl, encoding=None):
self.assertFalse(linkcheck.url.url_needs_quoting(nurl),
"Result URL %r must not need quoting" % nurl)
nurl1 = url_norm(url, encoding=encoding)
self.assertFalse(linkcheck.url.url_needs_quoting(nurl1),
"Normed URL %r needs quoting" % nurl)
self.assertEquals(nurl1, nurl)
# Test with non-Unicode URLs
try:
cs = "iso8859-1"
url = url.decode(cs)
nurl = nurl.decode(cs)
nurl1 = url_norm(url)
self.assertFalse(linkcheck.url.url_needs_quoting(nurl1))
self.assertEquals(nurl1, nurl)
except UnicodeEncodeError:
# Ignore non-Latin1 URLs
pass
def test_pathattack (self):
# Windows winamp path attack prevention.
@ -147,7 +138,7 @@ class TestUrl (unittest.TestCase):
self.urlnormtest(url, nurl)
url = "http://localhost:8001/?quoted=ü"
nurl = "http://localhost:8001/?quoted=%FC"
self.urlnormtest(url, nurl)
self.urlnormtest(url, nurl, encoding="iso-8859-1")
url = "http://host/?a=b/c+d="
nurl = "http://host/?a=b%2Fc%20d%3D"
self.urlnormtest(url, nurl)
@ -367,8 +358,8 @@ class TestUrl (unittest.TestCase):
url = 'nntp:'
nurl = 'nntp://'
self.urlnormtest(url, nurl)
url = "news:§$%&/´%"
nurl = 'news:%A7%24%25%26/%B4%60%A7%25'
url = "news:!$%&/()="
nurl = 'news:!%24%25%26/()='
self.urlnormtest(url, nurl)
url = "news:comp.infosystems.www.servers.unix"
nurl = url
@ -410,10 +401,21 @@ class TestUrl (unittest.TestCase):
nurl = "file://c%7C/a/b.txt"
self.urlnormtest(url, nurl)
def test_norm_file_unicode (self):
url = u"file:///a/b.txt"
nurl = url
self.urlnormtest(url, nurl)
url = u"file:///a/ä.txt"
nurl = u"file:///a/%E4.txt"
self.urlnormtest(url, nurl, encoding="iso-8859-1")
#url = u"file:///\u041c\u043e\u0448\u043a\u043e\u0432\u0430.bin"
#nurl = u"file:///a.bin" # XXX
#self.urlnormtest(url, nurl)
def test_norm_invalid (self):
url = u"äöü?:"
nurl = u"%E4%F6%FC?:"
self.urlnormtest(url, nurl)
self.urlnormtest(url, nurl, encoding="iso-8859-1")
def test_fixing (self):
# Test url fix method.