mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-13 00:53:11 +00:00
Add url encoding parameter
This commit is contained in:
parent
84bcb84878
commit
77daf80e82
4 changed files with 68 additions and 51 deletions
|
|
@ -89,7 +89,7 @@ class FileUrl (urlbase.UrlBase):
|
|||
"""
|
||||
|
||||
def init (self, base_ref, base_url, parent_url, recursion_level,
|
||||
aggregate, line, column, name):
|
||||
aggregate, line, column, name, url_encoding):
|
||||
"""
|
||||
Besides the usual initialization the URL is normed according
|
||||
to the platform:
|
||||
|
|
@ -97,7 +97,7 @@ class FileUrl (urlbase.UrlBase):
|
|||
- under Windows platform the drive specifier is normed
|
||||
"""
|
||||
super(FileUrl, self).init(base_ref, base_url, parent_url,
|
||||
recursion_level, aggregate, line, column, name)
|
||||
recursion_level, aggregate, line, column, name, url_encoding)
|
||||
if self.base_url is None:
|
||||
return
|
||||
base_url = self.base_url
|
||||
|
|
@ -112,12 +112,7 @@ class FileUrl (urlbase.UrlBase):
|
|||
base_url = base_url.replace("\\", "/")
|
||||
# transform c:/windows into /c|/windows
|
||||
base_url = re.sub("^file://(/?)([a-zA-Z]):", r"file:///\2|", base_url)
|
||||
# norm base url again after changing
|
||||
if self.base_url != base_url:
|
||||
base_url, is_idn = urlbase.url_norm(base_url)
|
||||
if is_idn:
|
||||
pass # XXX warn about idn use
|
||||
self.base_url = unicode(base_url)
|
||||
self.base_url = unicode(base_url)
|
||||
|
||||
def build_url (self):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -53,12 +53,11 @@ def urljoin (parent, url, scheme):
|
|||
return urlparse.urljoin(parent, url)
|
||||
|
||||
|
||||
def url_norm (url):
|
||||
"""
|
||||
Wrapper for url.url_norm() to convert UnicodeError in LinkCheckerError.
|
||||
"""
|
||||
def url_norm (url, encoding=None):
|
||||
"""Wrapper for url.url_norm() to convert UnicodeError in
|
||||
LinkCheckerError."""
|
||||
try:
|
||||
return urlutil.url_norm(url)
|
||||
return urlutil.url_norm(url, encoding=encoding)
|
||||
except UnicodeError:
|
||||
msg = _("URL has unparsable domain name: %(name)s") % \
|
||||
{"name": sys.exc_info()[1]}
|
||||
|
|
@ -69,8 +68,8 @@ class UrlBase (object):
|
|||
"""An URL with additional information like validity etc."""
|
||||
|
||||
def __init__ (self, base_url, recursion_level, aggregate,
|
||||
parent_url = None, base_ref = None,
|
||||
line = -1, column = -1, name = u""):
|
||||
parent_url=None, base_ref=None, line=-1, column=-1,
|
||||
name=u"", url_encoding=None):
|
||||
"""
|
||||
Initialize check data, and store given variables.
|
||||
|
||||
|
|
@ -82,15 +81,16 @@ class UrlBase (object):
|
|||
@param line: line number of url in parent content
|
||||
@param column: column number of url in parent content
|
||||
@param name: name of url or empty
|
||||
@param url_encoding: encoding of URL or None
|
||||
"""
|
||||
self.init(base_ref, base_url, parent_url, recursion_level,
|
||||
aggregate, line, column, name)
|
||||
aggregate, line, column, name, url_encoding)
|
||||
self.reset()
|
||||
self.check_syntax()
|
||||
|
||||
|
||||
def init (self, base_ref, base_url, parent_url, recursion_level,
|
||||
aggregate, line, column, name):
|
||||
aggregate, line, column, name, url_encoding):
|
||||
"""
|
||||
Initialize internal data.
|
||||
"""
|
||||
|
|
@ -103,6 +103,7 @@ class UrlBase (object):
|
|||
self.line = line
|
||||
self.column = column
|
||||
self.name = name
|
||||
self.encoding = url_encoding
|
||||
if self.base_ref:
|
||||
assert not urlutil.url_needs_quoting(self.base_ref), \
|
||||
"unquoted base reference URL %r" % self.base_ref
|
||||
|
|
@ -338,7 +339,7 @@ class UrlBase (object):
|
|||
url information self.base_url, self.parent_url and self.base_ref.
|
||||
"""
|
||||
# norm base url - can raise UnicodeError from url.idna_encode()
|
||||
base_url, is_idn = url_norm(self.base_url)
|
||||
base_url, is_idn = url_norm(self.base_url, self.encoding)
|
||||
if is_idn:
|
||||
self.add_warning(_("""URL %(url)r has a unicode domain name which
|
||||
is not yet widely supported. You should use
|
||||
|
|
|
|||
|
|
@ -25,6 +25,15 @@ import urllib
|
|||
|
||||
urlparse.uses_netloc.extend(('ldap', 'irc'))
|
||||
|
||||
# The character set to encode non-ASCII characters in a URL. See also
|
||||
# http://tools.ietf.org/html/rfc2396#section-2.1
|
||||
# Note that the encoding is not really specified, but most browsers
|
||||
# encode in UTF-8 when no encoding is specified by the HTTP headers,
|
||||
# else they use the page encoding for followed links. See also
|
||||
# http://code.google.com/p/browsersec/wiki/Part1#Unicode_in_URLs
|
||||
url_encoding = "utf-8"
|
||||
|
||||
|
||||
# constants defining url part indexes
|
||||
SCHEME = 0
|
||||
HOSTNAME = DOMAIN = 1
|
||||
|
|
@ -230,10 +239,12 @@ def url_fix_mailto_urlsplit (urlparts):
|
|||
urlparts[2], urlparts[3] = urlparts[2].split('?', 1)
|
||||
|
||||
|
||||
def url_parse_query (query):
|
||||
def url_parse_query (query, encoding=None):
|
||||
"""Parse and re-join the given CGI query."""
|
||||
if isinstance(query, unicode):
|
||||
query = query.encode('iso8859-1', 'ignore')
|
||||
if encoding is None:
|
||||
encoding = url_encoding
|
||||
query = query.encode(encoding, 'ignore')
|
||||
# if ? is in the query, split it off, seen at msdn.microsoft.com
|
||||
if '?' in query:
|
||||
query, append = query.split('?', 1)
|
||||
|
|
@ -254,13 +265,20 @@ def url_parse_query (query):
|
|||
return ''.join(l) + append
|
||||
|
||||
|
||||
def url_norm (url):
|
||||
def url_norm (url, encoding=None):
|
||||
"""Normalize the given URL which must be quoted. Supports unicode
|
||||
hostnames (IDNA encoding) according to RFC 3490.
|
||||
|
||||
@return: (normed url, idna flag)
|
||||
@rtype: tuple of length two
|
||||
"""
|
||||
if isinstance(url, unicode):
|
||||
# try to decode the URL to ascii since urllib.unquote()
|
||||
# handles non-unicode strings differently
|
||||
try:
|
||||
url = url.encode('ascii')
|
||||
except UnicodeEncodeError:
|
||||
pass
|
||||
urlparts = list(urlparse.urlsplit(url))
|
||||
# scheme
|
||||
urlparts[0] = urllib.unquote(urlparts[0]).lower()
|
||||
|
|
@ -270,7 +288,7 @@ def url_norm (url):
|
|||
# host (with path or query side effects)
|
||||
is_idn = url_fix_host(urlparts)
|
||||
# query
|
||||
urlparts[3] = url_parse_query(urlparts[3])
|
||||
urlparts[3] = url_parse_query(urlparts[3], encoding=encoding)
|
||||
is_hierarchical = urlparts[0] not in urlparse.non_hierarchical
|
||||
if is_hierarchical:
|
||||
# URL has a hierarchical path we should norm
|
||||
|
|
@ -286,10 +304,10 @@ def url_norm (url):
|
|||
# anchor
|
||||
urlparts[4] = urllib.unquote(urlparts[4])
|
||||
# quote parts again
|
||||
urlparts[0] = url_quote_part(urlparts[0]) # scheme
|
||||
urlparts[1] = url_quote_part(urlparts[1], '@:') # host
|
||||
urlparts[2] = url_quote_part(urlparts[2], _nopathquote_chars) # path
|
||||
urlparts[4] = url_quote_part(urlparts[4]) # anchor
|
||||
urlparts[0] = url_quote_part(urlparts[0], encoding=encoding) # scheme
|
||||
urlparts[1] = url_quote_part(urlparts[1], safechars='@:', encoding=encoding) # host
|
||||
urlparts[2] = url_quote_part(urlparts[2], safechars=_nopathquote_chars, encoding=encoding) # path
|
||||
urlparts[4] = url_quote_part(urlparts[4], encoding=encoding) # anchor
|
||||
res = urlparse.urlunsplit(urlparts)
|
||||
if url.endswith('#') and not urlparts[4]:
|
||||
# re-append trailing empty fragment
|
||||
|
|
@ -362,12 +380,13 @@ def url_quote (url):
|
|||
return urlparse.urlunsplit(urlparts)
|
||||
|
||||
|
||||
def url_quote_part (s, safechars='/'):
|
||||
def url_quote_part (s, safechars='/', encoding=None):
|
||||
"""Wrap urllib.quote() to support unicode strings. A unicode string
|
||||
is first converted to ISO-8859-1, invalid characters are ignored.
|
||||
After that urllib.quote() is called."""
|
||||
is first converted to the given encoding (UTF-8 by default); characters
that cannot be encoded are ignored. After that urllib.quote() is called."""
|
||||
if isinstance(s, unicode):
|
||||
s = s.encode("iso-8859-1", "ignore")
|
||||
if encoding is None:
|
||||
encoding = url_encoding
|
||||
s = s.encode(encoding, 'ignore')
|
||||
return urllib.quote(s, safechars)
|
||||
|
||||
def document_quote (document):
|
||||
|
|
|
|||
|
|
@ -37,29 +37,20 @@ import linkcheck.url
|
|||
# (Latin capital letter C + Combining cedilla U+0327)
|
||||
|
||||
|
||||
def url_norm (url):
|
||||
return linkcheck.url.url_norm(url)[0]
|
||||
def url_norm (url, encoding=None):
|
||||
return linkcheck.url.url_norm(url, encoding=encoding)[0]
|
||||
|
||||
|
||||
class TestUrl (unittest.TestCase):
|
||||
"""Test url norming and quoting."""
|
||||
|
||||
def urlnormtest (self, url, nurl):
|
||||
self.assertFalse(linkcheck.url.url_needs_quoting(nurl))
|
||||
nurl1 = url_norm(url)
|
||||
self.assertFalse(linkcheck.url.url_needs_quoting(nurl1))
|
||||
def urlnormtest (self, url, nurl, encoding=None):
|
||||
self.assertFalse(linkcheck.url.url_needs_quoting(nurl),
|
||||
"Result URL %r must not need quoting" % nurl)
|
||||
nurl1 = url_norm(url, encoding=encoding)
|
||||
self.assertFalse(linkcheck.url.url_needs_quoting(nurl1),
|
||||
"Normed URL %r needs quoting" % nurl)
|
||||
self.assertEquals(nurl1, nurl)
|
||||
# Test with non-Unicode URLs
|
||||
try:
|
||||
cs = "iso8859-1"
|
||||
url = url.decode(cs)
|
||||
nurl = nurl.decode(cs)
|
||||
nurl1 = url_norm(url)
|
||||
self.assertFalse(linkcheck.url.url_needs_quoting(nurl1))
|
||||
self.assertEquals(nurl1, nurl)
|
||||
except UnicodeEncodeError:
|
||||
# Ignore non-Latin1 URLs
|
||||
pass
|
||||
|
||||
def test_pathattack (self):
|
||||
# Windows winamp path attack prevention.
|
||||
|
|
@ -147,7 +138,7 @@ class TestUrl (unittest.TestCase):
|
|||
self.urlnormtest(url, nurl)
|
||||
url = "http://localhost:8001/?quoted=ü"
|
||||
nurl = "http://localhost:8001/?quoted=%FC"
|
||||
self.urlnormtest(url, nurl)
|
||||
self.urlnormtest(url, nurl, encoding="iso-8859-1")
|
||||
url = "http://host/?a=b/c+d="
|
||||
nurl = "http://host/?a=b%2Fc%20d%3D"
|
||||
self.urlnormtest(url, nurl)
|
||||
|
|
@ -367,8 +358,8 @@ class TestUrl (unittest.TestCase):
|
|||
url = 'nntp:'
|
||||
nurl = 'nntp://'
|
||||
self.urlnormtest(url, nurl)
|
||||
url = "news:§$%&/´`§%"
|
||||
nurl = 'news:%A7%24%25%26/%B4%60%A7%25'
|
||||
url = "news:!$%&/()="
|
||||
nurl = 'news:!%24%25%26/()='
|
||||
self.urlnormtest(url, nurl)
|
||||
url = "news:comp.infosystems.www.servers.unix"
|
||||
nurl = url
|
||||
|
|
@ -410,10 +401,21 @@ class TestUrl (unittest.TestCase):
|
|||
nurl = "file://c%7C/a/b.txt"
|
||||
self.urlnormtest(url, nurl)
|
||||
|
||||
def test_norm_file_unicode (self):
|
||||
url = u"file:///a/b.txt"
|
||||
nurl = url
|
||||
self.urlnormtest(url, nurl)
|
||||
url = u"file:///a/ä.txt"
|
||||
nurl = u"file:///a/%E4.txt"
|
||||
self.urlnormtest(url, nurl, encoding="iso-8859-1")
|
||||
#url = u"file:///\u041c\u043e\u0448\u043a\u043e\u0432\u0430.bin"
|
||||
#nurl = u"file:///a.bin" # XXX
|
||||
#self.urlnormtest(url, nurl)
|
||||
|
||||
def test_norm_invalid (self):
|
||||
url = u"äöü?:"
|
||||
nurl = u"%E4%F6%FC?:"
|
||||
self.urlnormtest(url, nurl)
|
||||
self.urlnormtest(url, nurl, encoding="iso-8859-1")
|
||||
|
||||
def test_fixing (self):
|
||||
# Test url fix method.
|
||||
|
|
|
|||
Loading…
Reference in a new issue