mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-22 17:00:25 +00:00
Work around a urlsplit() regression in Python >2.6
This commit is contained in:
parent
7b33cfac7b
commit
415c87e6cf
1 changed file with 51 additions and 1 deletion
|
|
@ -19,6 +19,7 @@ Functions for parsing and matching URL strings.
|
|||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
import urlparse
|
||||
import urllib
|
||||
|
|
@ -27,7 +28,56 @@ import socket
|
|||
from . import httplib2 as httplib
|
||||
from . import log, LOG_CHECK
|
||||
|
||||
# Register extra schemes that carry a network location (//host part) so
# urlparse splits them correctly.  Guard against duplicate registration
# (e.g. on module re-import): a blind extend() would append the scheme a
# second time to the module-global uses_netloc list.
for scheme in ('ldap', 'irc'):
    if scheme not in urlparse.uses_netloc:
        urlparse.uses_netloc.append(scheme)
|
||||
|
||||
# Monkey-patch urlparse.urlsplit on interpreters newer than 2.6 to undo a
# stdlib regression; see http://bugs.python.org/issue11467
# NOTE(review): the guard also fires on any Python 3.x ([0] > 2) — presumably
# intended, since this file targets Python 2; confirm against project support.
if sys.version_info[0] > 2 or sys.version_info[1] > 6:
    def urlsplit_26(url, scheme='', allow_fragments=True):
        """Parse a URL into 5 components:
        <scheme>://<netloc>/<path>?<query>#<fragment>
        Return a 5-tuple: (scheme, netloc, path, query, fragment).
        Note that we don't break the components up in smaller bits
        (e.g. netloc is a single string) and we don't expand % escapes."""
        allow_fragments = bool(allow_fragments)
        # Cache key includes the argument types so str and unicode inputs
        # do not collide in the shared parse cache.
        key = url, scheme, allow_fragments, type(url), type(scheme)
        cached = urlparse._parse_cache.get(key, None)
        if cached:
            return cached
        if len(urlparse._parse_cache) >= urlparse.MAX_CACHE_SIZE: # avoid runaway growth
            urlparse.clear_cache()
        netloc = query = fragment = ''
        i = url.find(':')
        if i > 0:
            if url[:i] == 'http': # optimize the common case
                scheme = url[:i].lower()
                url = url[i+1:]
                if url[:2] == '//':
                    netloc, url = urlparse._splitnetloc(url, 2)
                # http always supports fragments and queries, so skip the
                # uses_fragment/uses_query membership tests done below.
                if allow_fragments and '#' in url:
                    url, fragment = url.split('#', 1)
                if '?' in url:
                    url, query = url.split('?', 1)
                v = urlparse.SplitResult(scheme, netloc, url, query, fragment)
                urlparse._parse_cache[key] = v
                return v
            # Accept the text before ':' as a scheme only if every
            # character is a valid scheme character; otherwise leave the
            # URL untouched (relative reference with a ':' in the path).
            for c in url[:i]:
                if c not in urlparse.scheme_chars:
                    break
            else:
                scheme, url = url[:i].lower(), url[i+1:]
        if url[:2] == '//':
            netloc, url = urlparse._splitnetloc(url, 2)
        if allow_fragments and scheme in urlparse.uses_fragment and '#' in url:
            url, fragment = url.split('#', 1)
        if scheme in urlparse.uses_query and '?' in url:
            url, query = url.split('?', 1)
        v = urlparse.SplitResult(scheme, netloc, url, query, fragment)
        urlparse._parse_cache[key] = v
        return v

    urlparse.urlsplit = urlsplit_26
|
||||
|
||||
# The character set to encode non-ASCII characters in a URL. See also
|
||||
# http://tools.ietf.org/html/rfc2396#section-2.1
|
||||
|
|
|
|||
Loading…
Reference in a new issue