diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py index 2cfd5cd1..3dec8c43 100644 --- a/linkcheck/checker/fileurl.py +++ b/linkcheck/checker/fileurl.py @@ -89,7 +89,7 @@ class FileUrl (urlbase.UrlBase): """ def init (self, base_ref, base_url, parent_url, recursion_level, - aggregate, line, column, name): + aggregate, line, column, name, url_encoding): """ Besides the usual initialization the URL is normed according to the platform: @@ -97,7 +97,7 @@ class FileUrl (urlbase.UrlBase): - under Windows platform the drive specifier is normed """ super(FileUrl, self).init(base_ref, base_url, parent_url, - recursion_level, aggregate, line, column, name) + recursion_level, aggregate, line, column, name, url_encoding) if self.base_url is None: return base_url = self.base_url @@ -112,12 +112,7 @@ class FileUrl (urlbase.UrlBase): base_url = base_url.replace("\\", "/") # transform c:/windows into /c|/windows base_url = re.sub("^file://(/?)([a-zA-Z]):", r"file:///\2|", base_url) - # norm base url again after changing - if self.base_url != base_url: - base_url, is_idn = urlbase.url_norm(base_url) - if is_idn: - pass # XXX warn about idn use - self.base_url = unicode(base_url) + self.base_url = unicode(base_url) def build_url (self): """ diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index f99e3dff..5e04dccd 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -53,12 +53,11 @@ def urljoin (parent, url, scheme): return urlparse.urljoin(parent, url) -def url_norm (url): - """ - Wrapper for url.url_norm() to convert UnicodeError in LinkCheckerError. 
- """ +def url_norm (url, encoding=None): + """Wrapper for url.url_norm() to convert UnicodeError in + LinkCheckerError.""" try: - return urlutil.url_norm(url) + return urlutil.url_norm(url, encoding=encoding) except UnicodeError: msg = _("URL has unparsable domain name: %(name)s") % \ {"name": sys.exc_info()[1]} @@ -69,8 +68,8 @@ class UrlBase (object): """An URL with additional information like validity etc.""" def __init__ (self, base_url, recursion_level, aggregate, - parent_url = None, base_ref = None, - line = -1, column = -1, name = u""): + parent_url=None, base_ref=None, line=-1, column=-1, + name=u"", url_encoding=None): """ Initialize check data, and store given variables. @@ -82,15 +81,16 @@ class UrlBase (object): @param line: line number of url in parent content @param column: column number of url in parent content @param name: name of url or empty + @param url_encoding: encoding of URL or None """ self.init(base_ref, base_url, parent_url, recursion_level, - aggregate, line, column, name) + aggregate, line, column, name, url_encoding) self.reset() self.check_syntax() def init (self, base_ref, base_url, parent_url, recursion_level, - aggregate, line, column, name): + aggregate, line, column, name, url_encoding): """ Initialize internal data. """ @@ -103,6 +103,7 @@ class UrlBase (object): self.line = line self.column = column self.name = name + self.encoding = url_encoding if self.base_ref: assert not urlutil.url_needs_quoting(self.base_ref), \ "unquoted base reference URL %r" % self.base_ref @@ -338,7 +339,7 @@ class UrlBase (object): url information self.base_url, self.parent_url and self.base_ref. """ # norm base url - can raise UnicodeError from url.idna_encode() - base_url, is_idn = url_norm(self.base_url) + base_url, is_idn = url_norm(self.base_url, self.encoding) if is_idn: self.add_warning(_("""URL %(url)r has a unicode domain name which is not yet widely supported. 
You should use diff --git a/linkcheck/url.py b/linkcheck/url.py index 1f6be415..ee7463ba 100644 --- a/linkcheck/url.py +++ b/linkcheck/url.py @@ -25,6 +25,15 @@ import urllib urlparse.uses_netloc.extend(('ldap', 'irc')) +# The character set to encode non-ASCII characters in a URL. See also +# http://tools.ietf.org/html/rfc2396#section-2.1 +# Note that the encoding is not really specified, but most browsers +# encode in UTF-8 when no encoding is specified by the HTTP headers, +# else they use the page encoding for followed links. See also +# http://code.google.com/p/browsersec/wiki/Part1#Unicode_in_URLs +url_encoding = "utf-8" + + # constants defining url part indexes SCHEME = 0 HOSTNAME = DOMAIN = 1 @@ -230,10 +239,12 @@ def url_fix_mailto_urlsplit (urlparts): urlparts[2], urlparts[3] = urlparts[2].split('?', 1) -def url_parse_query (query): +def url_parse_query (query, encoding=None): """Parse and re-join the given CGI query.""" if isinstance(query, unicode): - query = query.encode('iso8859-1', 'ignore') + if encoding is None: + encoding = url_encoding + query = query.encode(encoding, 'ignore') # if ? is in the query, split it off, seen at msdn.microsoft.com if '?' in query: query, append = query.split('?', 1) @@ -254,13 +265,20 @@ def url_parse_query (query): return ''.join(l) + append -def url_norm (url): +def url_norm (url, encoding=None): """Normalize the given URL which must be quoted. Supports unicode hostnames (IDNA encoding) according to RFC 3490. 
@return: (normed url, idna flag) @rtype: tuple of length two """ + if isinstance(url, unicode): + # try to decode the URL to ascii since urllib.unquote() + # handles non-unicode strings differently + try: + url = url.encode('ascii') + except UnicodeEncodeError: + pass urlparts = list(urlparse.urlsplit(url)) # scheme urlparts[0] = urllib.unquote(urlparts[0]).lower() @@ -270,7 +288,7 @@ def url_norm (url): # host (with path or query side effects) is_idn = url_fix_host(urlparts) # query - urlparts[3] = url_parse_query(urlparts[3]) + urlparts[3] = url_parse_query(urlparts[3], encoding=encoding) is_hierarchical = urlparts[0] not in urlparse.non_hierarchical if is_hierarchical: # URL has a hierarchical path we should norm @@ -286,10 +304,10 @@ def url_norm (url): # anchor urlparts[4] = urllib.unquote(urlparts[4]) # quote parts again - urlparts[0] = url_quote_part(urlparts[0]) # scheme - urlparts[1] = url_quote_part(urlparts[1], '@:') # host - urlparts[2] = url_quote_part(urlparts[2], _nopathquote_chars) # path - urlparts[4] = url_quote_part(urlparts[4]) # anchor + urlparts[0] = url_quote_part(urlparts[0], encoding=encoding) # scheme + urlparts[1] = url_quote_part(urlparts[1], safechars='@:', encoding=encoding) # host + urlparts[2] = url_quote_part(urlparts[2], safechars=_nopathquote_chars, encoding=encoding) # path + urlparts[4] = url_quote_part(urlparts[4], encoding=encoding) # anchor res = urlparse.urlunsplit(urlparts) if url.endswith('#') and not urlparts[4]: # re-append trailing empty fragment @@ -362,12 +380,13 @@ def url_quote (url): return urlparse.urlunsplit(urlparts) -def url_quote_part (s, safechars='/'): +def url_quote_part (s, safechars='/', encoding=None): """Wrap urllib.quote() to support unicode strings. A unicode string - is first converted to ISO-8859-1, invalid characters are ignored. - After that urllib.quote() is called.""" + is first converted to UTF-8. 
After that urllib.quote() is called.""" if isinstance(s, unicode): - s = s.encode("iso-8859-1", "ignore") + if encoding is None: + encoding = url_encoding + s = s.encode(encoding, 'ignore') return urllib.quote(s, safechars) def document_quote (document): diff --git a/tests/test_url.py b/tests/test_url.py index 0e4c0b3a..8dba8367 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -37,29 +37,20 @@ import linkcheck.url # (Latin capital letter C + Combining cedilla U+0327) -def url_norm (url): - return linkcheck.url.url_norm(url)[0] +def url_norm (url, encoding=None): + return linkcheck.url.url_norm(url, encoding=encoding)[0] class TestUrl (unittest.TestCase): """Test url norming and quoting.""" - def urlnormtest (self, url, nurl): - self.assertFalse(linkcheck.url.url_needs_quoting(nurl)) - nurl1 = url_norm(url) - self.assertFalse(linkcheck.url.url_needs_quoting(nurl1)) + def urlnormtest (self, url, nurl, encoding=None): + self.assertFalse(linkcheck.url.url_needs_quoting(nurl), + "Result URL %r must not need quoting" % nurl) + nurl1 = url_norm(url, encoding=encoding) + self.assertFalse(linkcheck.url.url_needs_quoting(nurl1), + "Normed URL %r needs quoting" % nurl) self.assertEquals(nurl1, nurl) - # Test with non-Unicode URLs - try: - cs = "iso8859-1" - url = url.decode(cs) - nurl = nurl.decode(cs) - nurl1 = url_norm(url) - self.assertFalse(linkcheck.url.url_needs_quoting(nurl1)) - self.assertEquals(nurl1, nurl) - except UnicodeEncodeError: - # Ignore non-Latin1 URLs - pass def test_pathattack (self): # Windows winamp path attack prevention. 
@@ -147,7 +138,7 @@ class TestUrl (unittest.TestCase): self.urlnormtest(url, nurl) url = "http://localhost:8001/?quoted=ü" nurl = "http://localhost:8001/?quoted=%FC" - self.urlnormtest(url, nurl) + self.urlnormtest(url, nurl, encoding="iso-8859-1") url = "http://host/?a=b/c+d=" nurl = "http://host/?a=b%2Fc%20d%3D" self.urlnormtest(url, nurl) @@ -367,8 +358,8 @@ class TestUrl (unittest.TestCase): url = 'nntp:' nurl = 'nntp://' self.urlnormtest(url, nurl) - url = "news:§$%&/´`§%" - nurl = 'news:%A7%24%25%26/%B4%60%A7%25' + url = "news:!$%&/()=" + nurl = 'news:!%24%25%26/()=' self.urlnormtest(url, nurl) url = "news:comp.infosystems.www.servers.unix" nurl = url @@ -410,10 +401,21 @@ class TestUrl (unittest.TestCase): nurl = "file://c%7C/a/b.txt" self.urlnormtest(url, nurl) + def test_norm_file_unicode (self): + url = u"file:///a/b.txt" + nurl = url + self.urlnormtest(url, nurl) + url = u"file:///a/ä.txt" + nurl = u"file:///a/%E4.txt" + self.urlnormtest(url, nurl, encoding="iso-8859-1") + #url = u"file:///\u041c\u043e\u0448\u043a\u043e\u0432\u0430.bin" + #nurl = u"file:///a.bin" # XXX + #self.urlnormtest(url, nurl) + def test_norm_invalid (self): url = u"äöü?:" nurl = u"%E4%F6%FC?:" - self.urlnormtest(url, nurl) + self.urlnormtest(url, nurl, encoding="iso-8859-1") def test_fixing (self): # Test url fix method.