diff --git a/doc/changelog.txt b/doc/changelog.txt index 40f55dbf..59805f01 100644 --- a/doc/changelog.txt +++ b/doc/changelog.txt @@ -12,6 +12,7 @@ Fixes: - gui: Fix saving of the debugmemory option. - checking: Do not handle attribute as parent URL but as normal URL to be checked. +- checking: Fix UNC path handling on Windows. 7.9 "The Dark Knight" (released 10.6.2012) diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py index c25cea23..cbbff605 100644 --- a/linkcheck/checker/fileurl.py +++ b/linkcheck/checker/fileurl.py @@ -25,7 +25,7 @@ import urllib import urllib2 from . import urlbase, get_index_html, get_url_from -from .. import log, LOG_CHECK, fileutil, LinkCheckerError +from .. import log, LOG_CHECK, fileutil, LinkCheckerError, url as urlutil from ..bookmarks import firefox from .const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH @@ -137,7 +137,7 @@ class FileUrl (urlbase.UrlBase): urlparts = list(urlparse.urlsplit(base_url)) # ignore query part for filesystem urls urlparts[3] = '' - self.base_url = urlparse.urlunsplit(urlparts) + self.base_url = urlutil.urlunsplit(urlparts) super(FileUrl, self).build_url() # ignore query and fragment url parts for filesystem urls self.urlparts[3] = self.urlparts[4] = '' @@ -145,7 +145,7 @@ class FileUrl (urlbase.UrlBase): self.add_warning(_("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH) self.urlparts[2] += '/' - self.url = urlparse.urlunsplit(self.urlparts) + self.url = urlutil.urlunsplit(self.urlparts) def add_size_info (self): """Get size of file content from filename path.""" diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index 039050c6..d1acb671 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -161,7 +161,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): "a GET request was used instead.") % {"name": server}) # redirections might have changed the URL - self.url = 
urlparse.urlunsplit(self.urlparts) + self.url = urlutil.urlunsplit(self.urlparts) # check response if response: self.check_response(response) @@ -305,7 +305,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): num, response = self.follow_redirection(response, set_result, redirected) if num == -1: return num, response - redirected = urlparse.urlunsplit(self.urlparts) + redirected = urlutil.urlunsplit(self.urlparts) tries += num return tries, response @@ -528,10 +528,10 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): # http://tools.ietf.org/html/rfc2616#section-3.2.2 anchor = '' if self.proxy: - path = urlparse.urlunsplit((self.urlparts[0], self.urlparts[1], + path = urlutil.urlunsplit((self.urlparts[0], self.urlparts[1], self.urlparts[2], self.urlparts[3], anchor)) else: - path = urlparse.urlunsplit(('', '', self.urlparts[2], + path = urlutil.urlunsplit(('', '', self.urlparts[2], self.urlparts[3], anchor)) self.url_connection.putrequest(self.method, path, skip_host=True, skip_accept_encoding=True) diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index 73f0f382..94bd9b05 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -355,7 +355,7 @@ class UrlBase (object): """ # remove anchor from content cache key since we assume # URLs with different anchors to have the same content - self.cache_content_key = urlparse.urlunsplit(self.urlparts[:4]+[u'']) + self.cache_content_key = urlutil.urlunsplit(self.urlparts[:4]+[u'']) assert isinstance(self.cache_content_key, unicode), self log.debug(LOG_CACHE, "Content cache key %r", self.cache_content_key) # construct cache key @@ -380,7 +380,7 @@ class UrlBase (object): try: self.build_url() # check url warnings - effectiveurl = urlparse.urlunsplit(self.urlparts) + effectiveurl = urlutil.urlunsplit(self.urlparts) if self.url != effectiveurl: self.add_warning(_("Effective URL %(url)r.") % {"url": effectiveurl}, @@ -409,7 +409,7 @@ 
class UrlBase (object):
             # strip the parent url query and anchor
             urlparts = list(urlparse.urlsplit(self.parent_url))
             urlparts[4] = ""
-            parent_url = urlparse.urlunsplit(urlparts)
+            parent_url = urlutil.urlunsplit(urlparts)
             self.url = urljoin(parent_url, base_url)
         else:
             self.url = base_url
@@ -417,11 +417,11 @@ class UrlBase (object):
         urlparts = list(urlparse.urlsplit(self.url))
         if urlparts[2]:
             urlparts[2] = urlutil.collapse_segments(urlparts[2])
-        self.url = urlparse.urlunsplit(urlparts)
+        self.url = urlutil.urlunsplit(urlparts)
         # split into (modifiable) list
         self.urlparts = strformat.url_unicode_split(self.url)
         # and unsplit again
-        self.url = urlparse.urlunsplit(self.urlparts)
+        self.url = urlutil.urlunsplit(self.urlparts)
         # check userinfo@host:port syntax
         self.userinfo, host = urllib.splituser(self.urlparts[1])
         # set host lowercase
diff --git a/linkcheck/url.py b/linkcheck/url.py
index 119dd7c2..c097d6e3 100644
--- a/linkcheck/url.py
+++ b/linkcheck/url.py
@@ -269,6 +269,19 @@ def url_parse_query (query, encoding=None):
     return ''.join(l) + append
 
 
+def urlunsplit (urlparts):
+    """Same as urlparse.urlunsplit but with extra UNC path handling
+    for Windows OS. Only the leading 'file:' scheme prefix is changed."""
+    res = urlparse.urlunsplit(urlparts)
+    if os.name == 'nt' and urlparts[0] == 'file' and '|' not in urlparts[2]:
+        # UNC paths must have 4 slashes: 'file:////server/path', but
+        # urlparse.urlunsplit() leaves only two or three. Replace just the
+        # first 'file:' so one inside path, query or anchor is untouched.
+        repl = 'file://' if urlparts[2].startswith('//') else 'file:/'
+        res = res.replace('file:', repl, 1)
+    return res
+
+
 def url_norm (url, encoding=None):
     """Normalize the given URL which must be quoted. Supports unicode
     hostnames (IDNA encoding) according to RFC 3490.
@@ -315,7 +328,7 @@ def url_norm (url, encoding=None): urlparts[1] = url_quote_part(urlparts[1], safechars='@:', encoding=encoding) # host urlparts[2] = url_quote_part(urlparts[2], safechars=_nopathquote_chars, encoding=encoding) # path urlparts[4] = url_quote_part(urlparts[4], encoding=encoding) # anchor - res = urlparse.urlunsplit(urlparts) + res = urlunsplit(urlparts) if url.endswith('#') and not urlparts[4]: # re-append trailing empty fragment res += '#' @@ -384,7 +397,7 @@ def url_quote (url): l.append("%s%s" % (k, sep)) urlparts[3] = ''.join(l) urlparts[4] = url_quote_part(urlparts[4]) # anchor - return urlparse.urlunsplit(urlparts) + return urlunsplit(urlparts) def url_quote_part (s, safechars='/', encoding=None):