Add url encoding parameter

This commit is contained in:
Bastian Kleineidam 2009-11-28 11:56:35 +01:00
parent 84bcb84878
commit 77daf80e82
4 changed files with 68 additions and 51 deletions

View file

@ -89,7 +89,7 @@ class FileUrl (urlbase.UrlBase):
"""
def init (self, base_ref, base_url, parent_url, recursion_level,
aggregate, line, column, name):
aggregate, line, column, name, url_encoding):
"""
Besides the usual initialization the URL is normed according
to the platform:
@ -97,7 +97,7 @@ class FileUrl (urlbase.UrlBase):
- under Windows platform the drive specifier is normed
"""
super(FileUrl, self).init(base_ref, base_url, parent_url,
recursion_level, aggregate, line, column, name)
recursion_level, aggregate, line, column, name, url_encoding)
if self.base_url is None:
return
base_url = self.base_url
@ -112,12 +112,7 @@ class FileUrl (urlbase.UrlBase):
base_url = base_url.replace("\\", "/")
# transform c:/windows into /c|/windows
base_url = re.sub("^file://(/?)([a-zA-Z]):", r"file:///\2|", base_url)
# norm base url again after changing
if self.base_url != base_url:
base_url, is_idn = urlbase.url_norm(base_url)
if is_idn:
pass # XXX warn about idn use
self.base_url = unicode(base_url)
self.base_url = unicode(base_url)
def build_url (self):
"""

View file

@ -53,12 +53,11 @@ def urljoin (parent, url, scheme):
return urlparse.urljoin(parent, url)
def url_norm (url):
"""
Wrapper for url.url_norm() to convert UnicodeError in LinkCheckerError.
"""
def url_norm (url, encoding=None):
"""Wrapper for url.url_norm() to convert UnicodeError in
LinkCheckerError."""
try:
return urlutil.url_norm(url)
return urlutil.url_norm(url, encoding=encoding)
except UnicodeError:
msg = _("URL has unparsable domain name: %(name)s") % \
{"name": sys.exc_info()[1]}
@ -69,8 +68,8 @@ class UrlBase (object):
"""An URL with additional information like validity etc."""
def __init__ (self, base_url, recursion_level, aggregate,
parent_url = None, base_ref = None,
line = -1, column = -1, name = u""):
parent_url=None, base_ref=None, line=-1, column=-1,
name=u"", url_encoding=None):
"""
Initialize check data, and store given variables.
@ -82,15 +81,16 @@ class UrlBase (object):
@param line: line number of url in parent content
@param column: column number of url in parent content
@param name: name of url or empty
@param url_encoding: encoding of URL or None
"""
self.init(base_ref, base_url, parent_url, recursion_level,
aggregate, line, column, name)
aggregate, line, column, name, url_encoding)
self.reset()
self.check_syntax()
def init (self, base_ref, base_url, parent_url, recursion_level,
aggregate, line, column, name):
aggregate, line, column, name, url_encoding):
"""
Initialize internal data.
"""
@ -103,6 +103,7 @@ class UrlBase (object):
self.line = line
self.column = column
self.name = name
self.encoding = url_encoding
if self.base_ref:
assert not urlutil.url_needs_quoting(self.base_ref), \
"unquoted base reference URL %r" % self.base_ref
@ -338,7 +339,7 @@ class UrlBase (object):
url information self.base_url, self.parent_url and self.base_ref.
"""
# norm base url - can raise UnicodeError from url.idna_encode()
base_url, is_idn = url_norm(self.base_url)
base_url, is_idn = url_norm(self.base_url, self.encoding)
if is_idn:
self.add_warning(_("""URL %(url)r has a unicode domain name which
is not yet widely supported. You should use

View file

@ -25,6 +25,15 @@ import urllib
urlparse.uses_netloc.extend(('ldap', 'irc'))
# The character set to encode non-ASCII characters in a URL. See also
# http://tools.ietf.org/html/rfc2396#section-2.1
# Note that the encoding is not really specified, but most browsers
# encode in UTF-8 when no encoding is specified by the HTTP headers,
# else they use the page encoding for followed links. See also
# http://code.google.com/p/browsersec/wiki/Part1#Unicode_in_URLs
url_encoding = "utf-8"
# constants defining url part indexes
SCHEME = 0
HOSTNAME = DOMAIN = 1
@ -230,10 +239,12 @@ def url_fix_mailto_urlsplit (urlparts):
urlparts[2], urlparts[3] = urlparts[2].split('?', 1)
def url_parse_query (query):
def url_parse_query (query, encoding=None):
"""Parse and re-join the given CGI query."""
if isinstance(query, unicode):
query = query.encode('iso8859-1', 'ignore')
if encoding is None:
encoding = url_encoding
query = query.encode(encoding, 'ignore')
# if ? is in the query, split it off, seen at msdn.microsoft.com
if '?' in query:
query, append = query.split('?', 1)
@ -254,13 +265,20 @@ def url_parse_query (query):
return ''.join(l) + append
def url_norm (url):
def url_norm (url, encoding=None):
"""Normalize the given URL which must be quoted. Supports unicode
hostnames (IDNA encoding) according to RFC 3490.
@return: (normed url, idna flag)
@rtype: tuple of length two
"""
if isinstance(url, unicode):
# try to decode the URL to ascii since urllib.unquote()
# handles non-unicode strings differently
try:
url = url.encode('ascii')
except UnicodeEncodeError:
pass
urlparts = list(urlparse.urlsplit(url))
# scheme
urlparts[0] = urllib.unquote(urlparts[0]).lower()
@ -270,7 +288,7 @@ def url_norm (url):
# host (with path or query side effects)
is_idn = url_fix_host(urlparts)
# query
urlparts[3] = url_parse_query(urlparts[3])
urlparts[3] = url_parse_query(urlparts[3], encoding=encoding)
is_hierarchical = urlparts[0] not in urlparse.non_hierarchical
if is_hierarchical:
# URL has a hierarchical path we should norm
@ -286,10 +304,10 @@ def url_norm (url):
# anchor
urlparts[4] = urllib.unquote(urlparts[4])
# quote parts again
urlparts[0] = url_quote_part(urlparts[0]) # scheme
urlparts[1] = url_quote_part(urlparts[1], '@:') # host
urlparts[2] = url_quote_part(urlparts[2], _nopathquote_chars) # path
urlparts[4] = url_quote_part(urlparts[4]) # anchor
urlparts[0] = url_quote_part(urlparts[0], encoding=encoding) # scheme
urlparts[1] = url_quote_part(urlparts[1], safechars='@:', encoding=encoding) # host
urlparts[2] = url_quote_part(urlparts[2], safechars=_nopathquote_chars, encoding=encoding) # path
urlparts[4] = url_quote_part(urlparts[4], encoding=encoding) # anchor
res = urlparse.urlunsplit(urlparts)
if url.endswith('#') and not urlparts[4]:
# re-append trailing empty fragment
@ -362,12 +380,13 @@ def url_quote (url):
return urlparse.urlunsplit(urlparts)
def url_quote_part (s, safechars='/'):
def url_quote_part (s, safechars='/', encoding=None):
"""Wrap urllib.quote() to support unicode strings. A unicode string
is first converted to ISO-8859-1, invalid characters are ignored.
After that urllib.quote() is called."""
is first converted to UTF-8. After that urllib.quote() is called."""
if isinstance(s, unicode):
s = s.encode("iso-8859-1", "ignore")
if encoding is None:
encoding = url_encoding
s = s.encode(encoding, 'ignore')
return urllib.quote(s, safechars)
def document_quote (document):

View file

@ -37,29 +37,20 @@ import linkcheck.url
# (Latin capital letter C + Combining cedilla U+0327)
def url_norm (url):
return linkcheck.url.url_norm(url)[0]
def url_norm (url, encoding=None):
return linkcheck.url.url_norm(url, encoding=encoding)[0]
class TestUrl (unittest.TestCase):
"""Test url norming and quoting."""
def urlnormtest (self, url, nurl):
self.assertFalse(linkcheck.url.url_needs_quoting(nurl))
nurl1 = url_norm(url)
self.assertFalse(linkcheck.url.url_needs_quoting(nurl1))
def urlnormtest (self, url, nurl, encoding=None):
self.assertFalse(linkcheck.url.url_needs_quoting(nurl),
"Result URL %r must not need quoting" % nurl)
nurl1 = url_norm(url, encoding=encoding)
self.assertFalse(linkcheck.url.url_needs_quoting(nurl1),
"Normed URL %r needs quoting" % nurl)
self.assertEquals(nurl1, nurl)
# Test with non-Unicode URLs
try:
cs = "iso8859-1"
url = url.decode(cs)
nurl = nurl.decode(cs)
nurl1 = url_norm(url)
self.assertFalse(linkcheck.url.url_needs_quoting(nurl1))
self.assertEquals(nurl1, nurl)
except UnicodeEncodeError:
# Ignore non-Latin1 URLs
pass
def test_pathattack (self):
# Windows winamp path attack prevention.
@ -147,7 +138,7 @@ class TestUrl (unittest.TestCase):
self.urlnormtest(url, nurl)
url = "http://localhost:8001/?quoted=ü"
nurl = "http://localhost:8001/?quoted=%FC"
self.urlnormtest(url, nurl)
self.urlnormtest(url, nurl, encoding="iso-8859-1")
url = "http://host/?a=b/c+d="
nurl = "http://host/?a=b%2Fc%20d%3D"
self.urlnormtest(url, nurl)
@ -367,8 +358,8 @@ class TestUrl (unittest.TestCase):
url = 'nntp:'
nurl = 'nntp://'
self.urlnormtest(url, nurl)
url = "news:§$%&/´%"
nurl = 'news:%A7%24%25%26/%B4%60%A7%25'
url = "news:!$%&/()="
nurl = 'news:!%24%25%26/()='
self.urlnormtest(url, nurl)
url = "news:comp.infosystems.www.servers.unix"
nurl = url
@ -410,10 +401,21 @@ class TestUrl (unittest.TestCase):
nurl = "file://c%7C/a/b.txt"
self.urlnormtest(url, nurl)
def test_norm_file_unicode (self):
url = u"file:///a/b.txt"
nurl = url
self.urlnormtest(url, nurl)
url = u"file:///a/ä.txt"
nurl = u"file:///a/%E4.txt"
self.urlnormtest(url, nurl, encoding="iso-8859-1")
#url = u"file:///\u041c\u043e\u0448\u043a\u043e\u0432\u0430.bin"
#nurl = u"file:///a.bin" # XXX
#self.urlnormtest(url, nurl)
def test_norm_invalid (self):
url = u"äöü?:"
nurl = u"%E4%F6%FC?:"
self.urlnormtest(url, nurl)
self.urlnormtest(url, nurl, encoding="iso-8859-1")
def test_fixing (self):
# Test url fix method.