Remove query part from file links.

This commit is contained in:
Bastian Kleineidam 2011-05-29 17:49:01 +02:00
parent 430e1db78d
commit 2550e16040
3 changed files with 25 additions and 0 deletions

View file

@ -4,6 +4,9 @@ Fixes:
- checking: HTML parser detects and handles stray "<" characters before
end tags.
- checking: Reset content type setting after loading HTTP headers again.
- checking: Remove query and fragment parts of file URLs. Fixes false
errors when checking sites on local file systems.
Closes: SF bug #3308753
7.0 "Plots with a View" (released 28.5.2011)

View file

@ -123,6 +123,18 @@ class FileUrl (urlbase.UrlBase):
"""
Calls super.build_url() and adds a trailing slash to directories.
"""
if self.parent_url is not None:
# URL joining with the parent URL only works if the query
# part of the base URL is removed first.
# Otherwise the join function thinks the query is part of
# the file name.
from .urlbase import url_norm
# norm base url - can raise UnicodeError from url.idna_encode()
base_url, is_idn = url_norm(self.base_url, self.encoding)
urlparts = list(urlparse.urlsplit(base_url))
# ignore query part for filesystem urls
urlparts[3] = ''
self.base_url = urlparse.urlunsplit(urlparts)
super(FileUrl, self).build_url()
# ignore query and fragment url parts for filesystem urls
self.urlparts[3] = self.urlparts[4] = ''

View file

@ -56,6 +56,16 @@ class TestUrlBuild (unittest.TestCase):
res = linkcheck.checker.urlbase.urljoin(parent_url, base_url, scheme)
self.assertEqual(res, 'http://localhost:8001/;param=value')
def test_urljoin_file (self):
    """A query-only link joined onto a file URL builds the parent URL."""
    parent = "file:///a/b.html"
    # The link consists solely of a query string; recursion level is 0.
    file_url = linkcheck.checker.fileurl.FileUrl(
        "?c=d", 0, get_test_aggregate(), parent_url=parent)
    file_url.build_url()
    self.assertEqual(file_url.url, parent)
def test_http_build2 (self):
parent_url = u'http://example.org/test?a=b&c=d'
base_url = u'#usemap'