diff --git a/doc/src/man/linkcheckerrc.rst b/doc/src/man/linkcheckerrc.rst index 6c85393a..197499bf 100644 --- a/doc/src/man/linkcheckerrc.rst +++ b/doc/src/man/linkcheckerrc.rst @@ -474,10 +474,6 @@ AnchorCheck Checks validity of HTML anchors. -.. note:: - - The AnchorCheck plugin is currently broken and is disabled. - LocationInfo ^^^^^^^^^^^^ diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py index 59820060..032a6542 100644 --- a/linkcheck/checker/fileurl.py +++ b/linkcheck/checker/fileurl.py @@ -122,6 +122,14 @@ class FileUrl(urlbase.UrlBase): ) self.scheme = 'file' + def reset(self): + super().reset() + # the local file URI + self.url_without_anchor = None + # including the anchor in self.url allows the AnchorCheck plugin to be + # used when checking files. The anchor is stripped in UrlBase.set_cache_url() + # if AnchorCheck is not being used. + def build_base_url(self): """The URL is normed according to the platform: - the base URL is made an absolute *file://* URL @@ -162,23 +170,20 @@ class FileUrl(urlbase.UrlBase): # of the base URL are removed first. # Otherwise the join function thinks the query is part of # the file name. - from .urlbase import url_norm - - # norm base url - can raise UnicodeError from url.idna_encode() - base_url, is_idn = url_norm(self.base_url, self.encoding) - urlparts = list(urllib.parse.urlsplit(base_url)) + urlparts = list(urllib.parse.urlsplit(self.base_url)) # ignore query part for filesystem urls urlparts[3] = '' self.base_url = urlutil.urlunsplit(urlparts) super().build_url() - # ignore query and fragment url parts for filesystem urls - self.urlparts[3] = self.urlparts[4] = '' + # ignore query url part for filesystem urls + self.urlparts[3] = '' if self.is_directory() and not self.urlparts[2].endswith('/'): self.add_warning( _("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH ) self.urlparts[2] += '/' self.url = urlutil.urlunsplit(self.urlparts) + self.url_without_anchor = urlutil.urlunsplit(self.urlparts[:4] + ['']) def add_size_info(self): """Get size of file content and modification time from filename path.""" @@ -204,7 +209,7 @@ class FileUrl(urlbase.UrlBase): if self.is_directory(): self.set_result(_("directory")) else: - url = fileutil.path_safe(self.url) + url = fileutil.path_safe(self.url_without_anchor) self.url_connection = urllib.request.urlopen(url) self.check_case_sensitivity() @@ -270,7 +275,7 @@ class FileUrl(urlbase.UrlBase): """ if self.is_directory(): return True - if firefox.has_sqlite and firefox.extension.search(self.url): + if firefox.has_sqlite and firefox.extension.search(self.url_without_anchor): return True return self.is_content_type_parseable() @@ -278,7 +283,8 @@ class FileUrl(urlbase.UrlBase): """Return URL content type, or an empty string if content type could not be found.""" if self.url: - self.content_type = mimeutil.guess_mimetype(self.url, read=self.get_content) + self.content_type = mimeutil.guess_mimetype( + self.url_without_anchor, read=self.get_content) else: self.content_type = "" diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index 9e9e16b9..d556b6db 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -458,7 +458,7 @@ class UrlBase: self.base_ref = urljoin(self.parent_url, self.base_ref) self.url = urljoin(self.base_ref, base_url) elif self.parent_url: - # strip the parent url query and anchor + # strip the parent url anchor urlparts = list(urllib.parse.urlsplit(self.parent_url)) urlparts[4] = "" parent_url = urlutil.urlunsplit(urlparts) @@ -512,7 +512,7 @@ class UrlBase: urlparts[1] = "%s@%s" % (self.userinfo, host) else: urlparts[1] = host - # safe anchor for later checking + # save anchor for later checking self.anchor = split.fragment if self.anchor is not None: assert isinstance(self.anchor, str), repr(self.anchor) diff --git a/linkcheck/plugins/anchorcheck.py b/linkcheck/plugins/anchorcheck.py index 70f52d33..f7576b36 100644 --- a/linkcheck/plugins/anchorcheck.py +++ b/linkcheck/plugins/anchorcheck.py @@ -26,25 +26,25 @@ from ..htmlutil import linkparse class AnchorCheck(_ContentPlugin): """Checks validity of HTML anchors.""" - def __init__(self, config): - """Initialize plugin.""" - super().__init__(config) - log.warn( - LOG_PLUGIN, _("Anchor check plugin is broken. Fixes welcome.") - ) - def applies_to(self, url_data): """Check for HTML anchor existence.""" - return False # XXX Plugin disabled return url_data.is_html() and url_data.anchor def check(self, url_data): """Check content for invalid anchors.""" log.debug(LOG_PLUGIN, "checking content for invalid anchors") + url_anchor_check = UrlAnchorCheck() + linkparse.find_links( + url_data.get_soup(), url_anchor_check.add_anchor, linkparse.AnchorTags) + url_anchor_check.check_anchor(url_data) + + +class UrlAnchorCheck: + """Class to thread-safely handle collecting anchors for a URL""" + + def __init__(self): # list of parsed anchors self.anchors = [] - linkparse.find_links(url_data.get_soup(), self.add_anchor, linkparse.AnchorTags) - self.check_anchor(url_data) def add_anchor(self, url, line, column, name, base): """Add anchor URL.""" @@ -54,17 +54,20 @@ class AnchorCheck(_ContentPlugin): """If URL is valid, parseable and has an anchor, check it. A warning is logged and True is returned if the anchor is not found. """ - log.debug(LOG_PLUGIN, "checking anchor %r in %s", url_data.anchor, self.anchors) - if any(x for x in self.anchors if urllib.parse.quote(x[0]) == url_data.anchor): + decoded_anchor = urllib.parse.unquote( + url_data.anchor, encoding=url_data.encoding) + log.debug(LOG_PLUGIN, "checking anchor %r (decoded: %r) in %s", + url_data.anchor, decoded_anchor, self.anchors) + if any(x for x in self.anchors if x[0] == decoded_anchor): return if self.anchors: anchornames = sorted(set("`%s'" % x[0] for x in self.anchors)) anchors = ", ".join(anchornames) else: anchors = "-" - args = {"name": url_data.anchor, "anchors": anchors} + args = {"name": url_data.anchor, "decoded": decoded_anchor, "anchors": anchors} msg = "%s %s" % ( - _("Anchor `%(name)s' not found.") % args, + _("Anchor `%(name)s' (decoded: `%(decoded)s') not found.") % args, _("Available anchors: %(anchors)s.") % args, ) url_data.add_warning(msg) diff --git a/tests/checker/data/anchor1.html b/tests/checker/data/anchor1.html new file mode 100644 index 00000000..bc23a11e --- /dev/null +++ b/tests/checker/data/anchor1.html @@ -0,0 +1,10 @@ + +one one +one two +one three +one four + + +anchor1 one from 1 +anchor2 two from 1 +anchor2 three from 1 diff --git a/tests/checker/data/anchor1.html.result b/tests/checker/data/anchor1.html.result new file mode 100644 index 00000000..13902ab3 --- /dev/null +++ b/tests/checker/data/anchor1.html.result @@ -0,0 +1,76 @@ +url #twofour +cache key file://%(curdir)s/%(datadir)s/anchor2.html#twofour +real url file://%(curdir)s/%(datadir)s/anchor2.html#twofour +name anchor2 four from 2 +valid + +url #threefour +cache key file://%(curdir)s/%(datadir)s/anchor3.html#threefour +real url file://%(curdir)s/%(datadir)s/anchor3.html#threefour +name anchor3 four from 3 +valid + +url #oneone +cache key file://%(curdir)s/%(datadir)s/anchor1.html#oneone +real url file://%(curdir)s/%(datadir)s/anchor1.html#oneone +name anchor1 one from 1 +valid + +url #twoone +cache key file://%(curdir)s/%(datadir)s/anchor2.html#twoone +real url file://%(curdir)s/%(datadir)s/anchor2.html#twoone +name anchor2 one from 2 +valid + +url #threeone +cache key file://%(curdir)s/%(datadir)s/anchor3.html#threeone +real url file://%(curdir)s/%(datadir)s/anchor3.html#threeone +name anchor3 one from 3 +valid + +url anchor1.html#onefour +cache key file://%(curdir)s/%(datadir)s/anchor1.html#onefour +real url file://%(curdir)s/%(datadir)s/anchor1.html#onefour +name anchor1 four from 3 +valid + +url anchor1.html#onethree +cache key file://%(curdir)s/%(datadir)s/anchor1.html#onethree +real url file://%(curdir)s/%(datadir)s/anchor1.html#onethree +name anchor1 three from 3 +valid + +url anchor1.html#onetwo +cache key file://%(curdir)s/%(datadir)s/anchor1.html#onetwo +real url file://%(curdir)s/%(datadir)s/anchor1.html#onetwo +name anchor1 two from 3 +valid + +url anchor2.html#twothree +cache key file://%(curdir)s/%(datadir)s/anchor2.html#twothree +real url file://%(curdir)s/%(datadir)s/anchor2.html#twothree +name anchor2 three from 1 +valid + +url anchor2.html#twotwo +cache key file://%(curdir)s/%(datadir)s/anchor2.html#twotwo +real url file://%(curdir)s/%(datadir)s/anchor2.html#twotwo +name anchor2 two from 1 +valid + +url anchor3.html#threethree +cache key file://%(curdir)s/%(datadir)s/anchor3.html#threethree +real url file://%(curdir)s/%(datadir)s/anchor3.html#threethree +name anchor3 three from 2 +valid + +url anchor3.html#threetwo +cache key file://%(curdir)s/%(datadir)s/anchor3.html#threetwo +real url file://%(curdir)s/%(datadir)s/anchor3.html#threetwo +name anchor3 two from 2 +valid + +url file://%(curdir)s/%(datadir)s/anchor1.html +cache key file://%(curdir)s/%(datadir)s/anchor1.html +real url file://%(curdir)s/%(datadir)s/anchor1.html +valid diff --git a/tests/checker/data/anchor2.html b/tests/checker/data/anchor2.html new file mode 100644 index 00000000..781bc029 --- /dev/null +++ b/tests/checker/data/anchor2.html @@ -0,0 +1,13 @@ + +two one +two two +two three +two four + +<-- links --> +anchor2 one from 2 +anchor3 two from 2 +anchor3 three from 2 + +anchor2 four from 2 +anchor1 one from 2 diff --git a/tests/checker/data/anchor3.html b/tests/checker/data/anchor3.html new file mode 100644 index 00000000..a6fd1546 --- /dev/null +++ b/tests/checker/data/anchor3.html @@ -0,0 +1,13 @@ + +three one +three two +three three +three four + +<-- links --> +anchor3 one from 3 +anchor1 two from 3 +anchor1 three from 3 + +anchor3 four from 3 +anchor1 four from 3 diff --git a/tests/checker/data/http_anchor.html.result b/tests/checker/data/http_anchor.html.result index b9ebf0c7..d2fb5c25 100644 --- a/tests/checker/data/http_anchor.html.result +++ b/tests/checker/data/http_anchor.html.result @@ -7,7 +7,7 @@ url http_anchor.html#bad cache key http://localhost:%(port)d/%(datadir)s/http_anchor.html#bad real url http://localhost:%(port)d/%(datadir)s/http_anchor.html#bad name a_bad -warning Anchor `bad' not found. Available anchors: `good'. +warning Anchor `bad' (decoded: `bad') not found. Available anchors: `good'. valid url http_anchor.html#good diff --git a/tests/checker/data/urlencoding_anchor.html b/tests/checker/data/urlencoding_anchor.html new file mode 100644 index 00000000..c8b059c4 --- /dev/null +++ b/tests/checker/data/urlencoding_anchor.html @@ -0,0 +1,25 @@ + +Good target +Needs-encoded target #1 +Needs-encoded target #2 +Target for partially-encoded testing + + +No anchor, will not be tested +Good anchor +Bad anchor + + +Non-encoded anchor +Partially-encoded version of the same anchor +Fully-encoded version of the same anchor + + +Distinct encoded anchor + + +Bad encoded anchor, partially encoded +Bad encoded anchor, fully encoded + + +Partially-encoded anchor, for testing diff --git a/tests/checker/data/urlencoding_anchor.html.file.result b/tests/checker/data/urlencoding_anchor.html.file.result new file mode 100644 index 00000000..88f86c13 --- /dev/null +++ b/tests/checker/data/urlencoding_anchor.html.file.result @@ -0,0 +1,42 @@ +url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html +cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html +real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html +valid + +url urlencoding_anchor.html#another_with_pipes_%%7C_and_parentheses_%%28and%%E2%%80%%94em-dashes%%29 +cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#another_with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes) +real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#another_with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes) +name Distinct encoded anchor +valid + +url urlencoding_anchor.html#bad +cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#bad +real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#bad +name Bad anchor +warning Anchor `bad' (decoded: `bad') not found. Available anchors: `another_with_pipes_|_and_parentheses_(and—em-dashes)', `good', `partially_|_(encoded_em—dash)', `with_pipes_|_and_parentheses_(and—em-dashes)'. +valid + +url urlencoding_anchor.html#bad_|_%%28and%%E2%%80%%94em-dashes%%29 +cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#bad_%%7C_(and%%E2%%80%%94em-dashes) +real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#bad_%%7C_(and%%E2%%80%%94em-dashes) +name Bad encoded anchor, partially encoded +warning Anchor `bad_%%7C_(and%%E2%%80%%94em-dashes)' (decoded: `bad_|_(and—em-dashes)') not found. Available anchors: `another_with_pipes_|_and_parentheses_(and—em-dashes)', `good', `partially_|_(encoded_em—dash)', `with_pipes_|_and_parentheses_(and—em-dashes)'. +valid + +url urlencoding_anchor.html#good +cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#good +real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#good +name Good anchor +valid + +url urlencoding_anchor.html#partially_%%7C_(encoded_em—dash) +cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#partially_%%7C_(encoded_em%%E2%%80%%94dash) +real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#partially_%%7C_(encoded_em%%E2%%80%%94dash) +name Partially-encoded anchor, for testing +valid + +url urlencoding_anchor.html#with_pipes_|_and_parentheses_(and—em-dashes) +cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes) +real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes) +name Non-encoded anchor +valid diff --git a/tests/checker/data/urlencoding_anchor.html.result b/tests/checker/data/urlencoding_anchor.html.result new file mode 100644 index 00000000..4793690c --- /dev/null +++ b/tests/checker/data/urlencoding_anchor.html.result @@ -0,0 +1,42 @@ +url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html +cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html +real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html +valid + +url urlencoding_anchor.html#another_with_pipes_%%7C_and_parentheses_%%28and%%E2%%80%%94em-dashes%%29 +cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#another_with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes) +real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#another_with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes) +name Distinct encoded anchor +valid + +url urlencoding_anchor.html#bad +cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#bad +real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#bad +name Bad anchor +warning Anchor `bad' (decoded: `bad') not found. Available anchors: `another_with_pipes_|_and_parentheses_(and—em-dashes)', `good', `partially_|_(encoded_em—dash)', `with_pipes_|_and_parentheses_(and—em-dashes)'. +valid + +url urlencoding_anchor.html#bad_|_%%28and%%E2%%80%%94em-dashes%%29 +cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#bad_%%7C_(and%%E2%%80%%94em-dashes) +real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#bad_%%7C_(and%%E2%%80%%94em-dashes) +name Bad encoded anchor, partially encoded +warning Anchor `bad_%%7C_(and%%E2%%80%%94em-dashes)' (decoded: `bad_|_(and—em-dashes)') not found. Available anchors: `another_with_pipes_|_and_parentheses_(and—em-dashes)', `good', `partially_|_(encoded_em—dash)', `with_pipes_|_and_parentheses_(and—em-dashes)'. +valid + +url urlencoding_anchor.html#good +cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#good +real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#good +name Good anchor +valid + +url urlencoding_anchor.html#partially_%%7C_(encoded_em—dash) +cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#partially_%%7C_(encoded_em%%E2%%80%%94dash) +real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#partially_%%7C_(encoded_em%%E2%%80%%94dash) +name Partially-encoded anchor, for testing +valid + +url urlencoding_anchor.html#with_pipes_|_and_parentheses_(and—em-dashes) +cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes) +real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes) +name Non-encoded anchor +valid diff --git a/tests/checker/httpserver.py b/tests/checker/httpserver.py index 44f527c4..4a28d5e4 100644 --- a/tests/checker/httpserver.py +++ b/tests/checker/httpserver.py @@ -161,9 +161,10 @@ class HttpServerTest(LinkCheckTest): self.port = None self.handler = NoQueryHttpRequestHandler - def setUp(self): + def setUp(self, https=False): """Start a new HTTP server in a new thread.""" - self.port = start_server(self.handler) + super().setUp() + self.port = start_server(self.handler, https) assert self.port is not None def tearDown(self): @@ -182,8 +183,7 @@ class HttpsServerTest(HttpServerTest): def setUp(self): """Start a new HTTPS server in a new thread.""" - self.port = start_server(self.handler, https=True) - assert self.port is not None + super().setUp(https=True) def tearDown(self): """Send QUIT request to http server.""" diff --git a/tests/checker/test_anchor.py b/tests/checker/test_anchor.py index a354674f..097e6a2d 100644 --- a/tests/checker/test_anchor.py +++ b/tests/checker/test_anchor.py @@ -16,32 +16,30 @@ """ Test html anchor parsing and checking. """ -import pytest - from . import LinkCheckTest from .httpserver import HttpServerTest -class TestAnchor(LinkCheckTest): +class TestFileAnchor(LinkCheckTest): """ - Test anchor checking of HTML pages. + Test anchor checking of HTML files. """ - @pytest.mark.xfail(strict=True) - def test_anchor(self): + def test_anchor_file(self): confargs = {"enabledplugins": ["AnchorCheck"]} - url = "file://%(curdir)s/%(datadir)s/anchor.html" % self.get_attrs() - nurl = self.norm(url) anchor = "broken" - urlanchor = url + "#" + anchor + url = "file://%(curdir)s/%(datadir)s/anchor.html#%(anchor)s" % self.get_attrs( + anchor=anchor) + nurl = self.norm(url) resultlines = [ - "url %s" % urlanchor, - "cache key %s" % nurl, - "real url %s" % nurl, - "warning Anchor `%s' not found. Available anchors: `myid:'." % anchor, + f"url {url}", + f"cache key {nurl}", + f"real url {nurl}", + f"warning Anchor `{anchor}' (decoded: `{anchor}') not found." + " Available anchors: `myid:'.", "valid", ] - self.direct(urlanchor, resultlines, confargs=confargs) + self.direct(url, resultlines, confargs=confargs) class TestHttpAnchor(HttpServerTest): @@ -49,7 +47,45 @@ class TestHttpAnchor(HttpServerTest): Test checking of HTML pages containing links to anchors served over http. """ - @pytest.mark.xfail(strict=True) - def test_anchor_html(self): + def test_anchor_http(self): confargs = dict(enabledplugins=["AnchorCheck"], recursionlevel=1) self.file_test("http_anchor.html", confargs=confargs) + + +class TestEncodedAnchors(HttpServerTest): + """Test HTML pages containing urlencoded links to anchors""" + + def test_anchor_encoded_http(self): + """ + http:// + """ + confargs = dict(enabledplugins=["AnchorCheck"], recursionlevel=1) + self.file_test("urlencoding_anchor.html", confargs=confargs) + + def test_anchor_encoded_file(self): + """ + file:// + This should have identical behavior as http:// + """ + filename = "urlencoding_anchor.html" + confargs = {"enabledplugins": ["AnchorCheck"]} + url = "file://%(curdir)s/%(datadir)s/%(filename)s" % self.get_attrs( + filename=filename) + # get results from the special result file that has `.file.` in its name + resultlines = self.get_resultlines(f"{filename}.file") + self.direct(url, resultlines, recursionlevel=1, confargs=confargs) + + +class TestAnchorsAcrossMultipleFiles(LinkCheckTest): + """Test anchors when there are multiple files""" + + def test_anchor1_file(self): + """ + Test a network of files that reference each other, starting with anchor1.html + """ + filename = "anchor1.html" + confargs = {"enabledplugins": ["AnchorCheck"]} + url = "file://%(curdir)s/%(datadir)s/%(filename)s" % self.get_attrs( + filename=filename) + resultlines = self.get_resultlines(filename) + self.direct(url, resultlines, recursionlevel=4, confargs=confargs)