Fix UNC path handling on Windows.

2026-05-08 14:44:46 +00:00 · 2012-06-24 10:30:54 +02:00 · 2012-06-24 10:30:54 +02:00 · 5c045fef44
commit 5c045fef44
parent b550a9dcb5
5 changed files with 28 additions and 14 deletions
--- a/doc/changelog.txt
+++ b/doc/changelog.txt
@ -12,6 +12,7 @@ Fixes:
 - gui: Fix saving of the debugmemory option.
 - checking: Do not handle <object codebase="..."> attribute as parent
  URL but as normal URL to be checked.
+- checking: Fix UNC path handling on Windows.


 7.9 "The Dark Knight" (released 10.6.2012)
--- a/linkcheck/checker/fileurl.py
+++ b/linkcheck/checker/fileurl.py
@ -25,7 +25,7 @@ import urllib
 import urllib2

 from . import urlbase, get_index_html, get_url_from
-from .. import log, LOG_CHECK, fileutil, LinkCheckerError
+from .. import log, LOG_CHECK, fileutil, LinkCheckerError, url as urlutil
 from ..bookmarks import firefox
 from .const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH

@ -137,7 +137,7 @@ class FileUrl (urlbase.UrlBase):
            urlparts = list(urlparse.urlsplit(base_url))
            # ignore query part for filesystem urls
            urlparts[3] = ''
-            self.base_url = urlparse.urlunsplit(urlparts)
+            self.base_url = urlutil.urlunsplit(urlparts)
        super(FileUrl, self).build_url()
        # ignore query and fragment url parts for filesystem urls
        self.urlparts[3] = self.urlparts[4] = ''
@ -145,7 +145,7 @@ class FileUrl (urlbase.UrlBase):
            self.add_warning(_("Added trailing slash to directory."),
                           tag=WARN_FILE_MISSING_SLASH)
            self.urlparts[2] += '/'
-        self.url = urlparse.urlunsplit(self.urlparts)
+        self.url = urlutil.urlunsplit(self.urlparts)

    def add_size_info (self):
        """Get size of file content from filename path."""
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@ -161,7 +161,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
                            "a GET request was used instead.") %
                            {"name": server})
        # redirections might have changed the URL
-        self.url = urlparse.urlunsplit(self.urlparts)
+        self.url = urlutil.urlunsplit(self.urlparts)
        # check response
        if response:
            self.check_response(response)
@ -305,7 +305,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
            num, response = self.follow_redirection(response, set_result, redirected)
            if num == -1:
                return num, response
-            redirected = urlparse.urlunsplit(self.urlparts)
+            redirected = urlutil.urlunsplit(self.urlparts)
            tries += num
        return tries, response

@ -528,10 +528,10 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
        # http://tools.ietf.org/html/rfc2616#section-3.2.2
        anchor = ''
        if self.proxy:
-            path = urlparse.urlunsplit((self.urlparts[0], self.urlparts[1],
+            path = urlutil.urlunsplit((self.urlparts[0], self.urlparts[1],
                                 self.urlparts[2], self.urlparts[3], anchor))
        else:
-            path = urlparse.urlunsplit(('', '', self.urlparts[2],
+            path = urlutil.urlunsplit(('', '', self.urlparts[2],
                                        self.urlparts[3], anchor))
        self.url_connection.putrequest(self.method, path, skip_host=True,
                                       skip_accept_encoding=True)
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@ -355,7 +355,7 @@ class UrlBase (object):
        """
        # remove anchor from content cache key since we assume
        # URLs with different anchors to have the same content
-        self.cache_content_key = urlparse.urlunsplit(self.urlparts[:4]+[u''])
+        self.cache_content_key = urlutil.urlunsplit(self.urlparts[:4]+[u''])
        assert isinstance(self.cache_content_key, unicode), self
        log.debug(LOG_CACHE, "Content cache key %r", self.cache_content_key)
        # construct cache key
@ -380,7 +380,7 @@ class UrlBase (object):
        try:
            self.build_url()
            # check url warnings
-            effectiveurl = urlparse.urlunsplit(self.urlparts)
+            effectiveurl = urlutil.urlunsplit(self.urlparts)
            if self.url != effectiveurl:
                self.add_warning(_("Effective URL %(url)r.") %
                                 {"url": effectiveurl},
@ -409,7 +409,7 @@ class UrlBase (object):
            # strip the parent url query and anchor
            urlparts = list(urlparse.urlsplit(self.parent_url))
            urlparts[4] = ""
-            parent_url = urlparse.urlunsplit(urlparts)
+            parent_url = urlutil.urlunsplit(urlparts)
            self.url = urljoin(parent_url, base_url)
        else:
            self.url = base_url
@ -417,11 +417,11 @@ class UrlBase (object):
        urlparts = list(urlparse.urlsplit(self.url))
        if urlparts[2]:
            urlparts[2] = urlutil.collapse_segments(urlparts[2])
-        self.url = urlparse.urlunsplit(urlparts)
+        self.url = urlutil.urlunsplit(urlparts)
        # split into (modifiable) list
        self.urlparts = strformat.url_unicode_split(self.url)
        # and unsplit again
-        self.url = urlparse.urlunsplit(self.urlparts)
+        self.url = urlutil.urlunsplit(self.urlparts)
        # check userinfo@host:port syntax
        self.userinfo, host = urllib.splituser(self.urlparts[1])
        # set host lowercase
--- a/linkcheck/url.py
+++ b/linkcheck/url.py
@ -269,6 +269,19 @@ def url_parse_query (query, encoding=None):
    return ''.join(l) + append


+def urlunsplit (urlparts):
+    """Same as urlparse.urlunsplit(urlparts), but on Windows file: URLs
+    without a '|' in the path get the extra slashes UNC paths need."""
+    res = urlparse.urlunsplit(urlparts)
+    if os.name == 'nt' and urlparts[0] == 'file' and '|' not in urlparts[2]:
+        # UNC paths must have 4 slashes: 'file:////server/path', but
+        # urlparse.urlunsplit() leaves only two or three. Pad the scheme
+        # prefix only (count=1): a later 'file:' in the URL must not change.
+        repl = 'file://' if urlparts[2].startswith('//') else 'file:/'
+        res = res.replace('file:', repl, 1)
+    return res
+
+
 def url_norm (url, encoding=None):
    """Normalize the given URL which must be quoted. Supports unicode
    hostnames (IDNA encoding) according to RFC 3490.
@ -315,7 +328,7 @@ def url_norm (url, encoding=None):
    urlparts[1] = url_quote_part(urlparts[1], safechars='@:', encoding=encoding) # host
    urlparts[2] = url_quote_part(urlparts[2], safechars=_nopathquote_chars, encoding=encoding) # path
    urlparts[4] = url_quote_part(urlparts[4], encoding=encoding) # anchor
-    res = urlparse.urlunsplit(urlparts)
+    res = urlunsplit(urlparts)
    if url.endswith('#') and not urlparts[4]:
        # re-append trailing empty fragment
        res += '#'
@ -384,7 +397,7 @@ def url_quote (url):
            l.append("%s%s" % (k, sep))
    urlparts[3] = ''.join(l)
    urlparts[4] = url_quote_part(urlparts[4]) # anchor
-    return urlparse.urlunsplit(urlparts)
+    return urlunsplit(urlparts)


 def url_quote_part (s, safechars='/', encoding=None):