diff --git a/doc/changelog.txt b/doc/changelog.txt index 3515aa25..024fb55d 100644 --- a/doc/changelog.txt +++ b/doc/changelog.txt @@ -4,6 +4,9 @@ Fixes: - checking: HTML parser detects and handles stray "<" characters before end tags. - checking: Reset content type setting after loading HTTP headers again. +- checking: Remove query and fragment parts of file URLs. Fixes false + errors checking sites on local file systems. + Closes: SF bug #3308753 7.0 "Plots with a View" (released 28.5.2011) diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py index 3be48aaf..516cf9a8 100644 --- a/linkcheck/checker/fileurl.py +++ b/linkcheck/checker/fileurl.py @@ -123,6 +123,18 @@ class FileUrl (urlbase.UrlBase): """ Calls super.build_url() and adds a trailing slash to directories. """ + if self.parent_url is not None: + # URL joining with the parent URL only works if the query + # of the base URL is removed first. + # Otherwise the join function thinks the query is part of + # the file name. 
+ from .urlbase import url_norm + # norm base url - can raise UnicodeError from url.idna_encode() + base_url, is_idn = url_norm(self.base_url, self.encoding) + urlparts = list(urlparse.urlsplit(base_url)) + # ignore query part for filesystem urls + urlparts[3] = '' + self.base_url = urlparse.urlunsplit(urlparts) super(FileUrl, self).build_url() # ignore query and fragment url parts for filesystem urls self.urlparts[3] = self.urlparts[4] = '' diff --git a/tests/test_urlbuild.py b/tests/test_urlbuild.py index efa0e4b5..f2c8e098 100644 --- a/tests/test_urlbuild.py +++ b/tests/test_urlbuild.py @@ -56,6 +56,16 @@ class TestUrlBuild (unittest.TestCase): res = linkcheck.checker.urlbase.urljoin(parent_url, base_url, scheme) self.assertEqual(res, 'http://localhost:8001/;param=value') + def test_urljoin_file (self): + parent_url = "file:///a/b.html" + base_url = "?c=d" + recursion_level = 0 + aggregate = get_test_aggregate() + o = linkcheck.checker.fileurl.FileUrl(base_url, recursion_level, + aggregate, parent_url=parent_url) + o.build_url() + self.assertEqual(o.url, parent_url) + def test_http_build2 (self): parent_url = u'http://example.org/test?a=b&c=d' base_url = u'#usemap'