From 54bcefd7d79bacdb68d7514709d70d61a4dfa794 Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Mon, 3 Oct 2022 19:33:05 +0100 Subject: [PATCH 01/10] Revert "Disable AnchorCheck plugin" This reverts commit 035652436949c7729e1957b9e2f013f0b868589a. --- linkcheck/plugins/anchorcheck.py | 8 -------- tests/checker/test_anchor.py | 4 ---- 2 files changed, 12 deletions(-) diff --git a/linkcheck/plugins/anchorcheck.py b/linkcheck/plugins/anchorcheck.py index 70f52d33..096fc638 100644 --- a/linkcheck/plugins/anchorcheck.py +++ b/linkcheck/plugins/anchorcheck.py @@ -26,16 +26,8 @@ from ..htmlutil import linkparse class AnchorCheck(_ContentPlugin): """Checks validity of HTML anchors.""" - def __init__(self, config): - """Initialize plugin.""" - super().__init__(config) - log.warn( - LOG_PLUGIN, _("Anchor check plugin is broken. Fixes welcome.") - ) - def applies_to(self, url_data): """Check for HTML anchor existence.""" - return False # XXX Plugin disabled return url_data.is_html() and url_data.anchor def check(self, url_data): diff --git a/tests/checker/test_anchor.py b/tests/checker/test_anchor.py index a354674f..7848e9e1 100644 --- a/tests/checker/test_anchor.py +++ b/tests/checker/test_anchor.py @@ -16,8 +16,6 @@ """ Test html anchor parsing and checking. """ -import pytest - from . import LinkCheckTest from .httpserver import HttpServerTest @@ -27,7 +25,6 @@ class TestAnchor(LinkCheckTest): Test anchor checking of HTML pages. """ - @pytest.mark.xfail(strict=True) def test_anchor(self): confargs = {"enabledplugins": ["AnchorCheck"]} url = "file://%(curdir)s/%(datadir)s/anchor.html" % self.get_attrs() @@ -49,7 +46,6 @@ class TestHttpAnchor(HttpServerTest): Test checking of HTML pages containing links to anchors served over http. 
""" - @pytest.mark.xfail(strict=True) def test_anchor_html(self): confargs = dict(enabledplugins=["AnchorCheck"], recursionlevel=1) self.file_test("http_anchor.html", confargs=confargs) From 8b2fb868958888615533e818ce198783b410636c Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Mon, 3 Oct 2022 19:33:05 +0100 Subject: [PATCH 02/10] Remove AnchorCheck disabled note in linkcheckerrc(5) A partial revert of: fe6dea12 ("Update documentation for disabled plugins", 2021-11-29) --- doc/src/man/linkcheckerrc.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/doc/src/man/linkcheckerrc.rst b/doc/src/man/linkcheckerrc.rst index 6c85393a..197499bf 100644 --- a/doc/src/man/linkcheckerrc.rst +++ b/doc/src/man/linkcheckerrc.rst @@ -474,10 +474,6 @@ AnchorCheck Checks validity of HTML anchors. -.. note:: - - The AnchorCheck plugin is currently broken and is disabled. - LocationInfo ^^^^^^^^^^^^ From 2cbff4922116f9b3b9d74ea4cd5dda6fac176fd5 Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Mon, 3 Oct 2022 19:33:05 +0100 Subject: [PATCH 03/10] Fix http tests failing with pytest due to missing _() TypeError: 'NoneType' object is not callable Ensure LinkCheckTest.setUp() is called to initialise translations. 
--- tests/checker/httpserver.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/checker/httpserver.py b/tests/checker/httpserver.py index 44f527c4..4a28d5e4 100644 --- a/tests/checker/httpserver.py +++ b/tests/checker/httpserver.py @@ -161,9 +161,10 @@ class HttpServerTest(LinkCheckTest): self.port = None self.handler = NoQueryHttpRequestHandler - def setUp(self): + def setUp(self, https=False): """Start a new HTTP server in a new thread.""" - self.port = start_server(self.handler) + super().setUp() + self.port = start_server(self.handler, https) assert self.port is not None def tearDown(self): @@ -182,8 +183,7 @@ class HttpsServerTest(HttpServerTest): def setUp(self): """Start a new HTTPS server in a new thread.""" - self.port = start_server(self.handler, https=True) - assert self.port is not None + super().setUp(https=True) def tearDown(self): """Send QUIT request to http server.""" From a29750c57f6213c39d01b11de9cbfae4cb3081b5 Mon Sep 17 00:00:00 2001 From: Nathan Arthur Date: Mon, 3 Oct 2022 19:33:05 +0100 Subject: [PATCH 04/10] Fix anchor comments in UrlBase Parent url query not stripped since: 4a0c63aa ("Fix joining of URLs when parent URL has CGI parameter.", 2011-02-08) --- linkcheck/checker/urlbase.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index 9e9e16b9..d556b6db 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -458,7 +458,7 @@ class UrlBase: self.base_ref = urljoin(self.parent_url, self.base_ref) self.url = urljoin(self.base_ref, base_url) elif self.parent_url: - # strip the parent url query and anchor + # strip the parent url anchor urlparts = list(urllib.parse.urlsplit(self.parent_url)) urlparts[4] = "" parent_url = urlutil.urlunsplit(urlparts) @@ -512,7 +512,7 @@ class UrlBase: urlparts[1] = "%s@%s" % (self.userinfo, host) else: urlparts[1] = host - # safe anchor for later checking + # save anchor 
for later checking self.anchor = split.fragment if self.anchor is not None: assert isinstance(self.anchor, str), repr(self.anchor) From c221afdab59341785ed88843d93b6832fb2bee81 Mon Sep 17 00:00:00 2001 From: Nathan Arthur Date: Mon, 3 Oct 2022 19:33:05 +0100 Subject: [PATCH 05/10] Enable AnchorCheck to be used with local files [I] discovered that fileurl.py was stripping the anchors from url_data, which breaks AnchorCheck. So I stopped it from doing that, and tried to fix up all the places that were assuming the url would map to a filesystem file. The tests all pass, but I'm not 100% sure I caught all the cases, or fixed them correctly. --- linkcheck/checker/fileurl.py | 20 +++++++++++++++----- tests/checker/test_anchor.py | 10 +++++----- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py index 59820060..d27b6d4a 100644 --- a/linkcheck/checker/fileurl.py +++ b/linkcheck/checker/fileurl.py @@ -122,6 +122,14 @@ class FileUrl(urlbase.UrlBase): ) self.scheme = 'file' + def reset(self): + super().reset() + # the local file URI + self.url_without_anchor = None + # including the anchor in self.url allows the AnchorCheck plugin to be + # used when checking files. The anchor is stripped in UrlBase.set_cache_url() + # if AnchorCheck is not being used. 
+ def build_base_url(self): """The URL is normed according to the platform: - the base URL is made an absolute *file://* URL @@ -171,14 +179,15 @@ class FileUrl(urlbase.UrlBase): urlparts[3] = '' self.base_url = urlutil.urlunsplit(urlparts) super().build_url() - # ignore query and fragment url parts for filesystem urls - self.urlparts[3] = self.urlparts[4] = '' + # ignore query url part for filesystem urls + self.urlparts[3] = '' if self.is_directory() and not self.urlparts[2].endswith('/'): self.add_warning( _("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH ) self.urlparts[2] += '/' self.url = urlutil.urlunsplit(self.urlparts) + self.url_without_anchor = urlutil.urlunsplit(self.urlparts[:4] + ['']) def add_size_info(self): """Get size of file content and modification time from filename path.""" @@ -204,7 +213,7 @@ class FileUrl(urlbase.UrlBase): if self.is_directory(): self.set_result(_("directory")) else: - url = fileutil.path_safe(self.url) + url = fileutil.path_safe(self.url_without_anchor) self.url_connection = urllib.request.urlopen(url) self.check_case_sensitivity() @@ -270,7 +279,7 @@ class FileUrl(urlbase.UrlBase): """ if self.is_directory(): return True - if firefox.has_sqlite and firefox.extension.search(self.url): + if firefox.has_sqlite and firefox.extension.search(self.url_without_anchor): return True return self.is_content_type_parseable() @@ -278,7 +287,8 @@ class FileUrl(urlbase.UrlBase): """Return URL content type, or an empty string if content type could not be found.""" if self.url: - self.content_type = mimeutil.guess_mimetype(self.url, read=self.get_content) + self.content_type = mimeutil.guess_mimetype( + self.url_without_anchor, read=self.get_content) else: self.content_type = "" diff --git a/tests/checker/test_anchor.py b/tests/checker/test_anchor.py index 7848e9e1..186ca05d 100644 --- a/tests/checker/test_anchor.py +++ b/tests/checker/test_anchor.py @@ -27,18 +27,18 @@ class TestAnchor(LinkCheckTest): def 
test_anchor(self): confargs = {"enabledplugins": ["AnchorCheck"]} - url = "file://%(curdir)s/%(datadir)s/anchor.html" % self.get_attrs() - nurl = self.norm(url) anchor = "broken" - urlanchor = url + "#" + anchor + url = "file://%(curdir)s/%(datadir)s/anchor.html#%(anchor)s" % self.get_attrs( + anchor=anchor) + nurl = self.norm(url) resultlines = [ - "url %s" % urlanchor, + "url %s" % url, "cache key %s" % nurl, "real url %s" % nurl, "warning Anchor `%s' not found. Available anchors: `myid:'." % anchor, "valid", ] - self.direct(urlanchor, resultlines, confargs=confargs) + self.direct(url, resultlines, confargs=confargs) class TestHttpAnchor(HttpServerTest): From 5398fd240658fee2841af73cb4917c9bc24cf9ec Mon Sep 17 00:00:00 2001 From: Nathan Arthur Date: Mon, 3 Oct 2022 19:33:05 +0100 Subject: [PATCH 06/10] Add an anchor test for multiple inter-connected files --- tests/checker/data/anchor1.html | 10 ++++ tests/checker/data/anchor1.html.result | 76 ++++++++++++++++++++++++++ tests/checker/data/anchor2.html | 13 +++++ tests/checker/data/anchor3.html | 13 +++++ tests/checker/test_anchor.py | 15 +++++ 5 files changed, 127 insertions(+) create mode 100644 tests/checker/data/anchor1.html create mode 100644 tests/checker/data/anchor1.html.result create mode 100644 tests/checker/data/anchor2.html create mode 100644 tests/checker/data/anchor3.html diff --git a/tests/checker/data/anchor1.html b/tests/checker/data/anchor1.html new file mode 100644 index 00000000..bc23a11e --- /dev/null +++ b/tests/checker/data/anchor1.html @@ -0,0 +1,10 @@ + +one one +one two +one three +one four + + +anchor1 one from 1 +anchor2 two from 1 +anchor2 three from 1 diff --git a/tests/checker/data/anchor1.html.result b/tests/checker/data/anchor1.html.result new file mode 100644 index 00000000..13902ab3 --- /dev/null +++ b/tests/checker/data/anchor1.html.result @@ -0,0 +1,76 @@ +url #twofour +cache key file://%(curdir)s/%(datadir)s/anchor2.html#twofour +real url 
file://%(curdir)s/%(datadir)s/anchor2.html#twofour +name anchor2 four from 2 +valid + +url #threefour +cache key file://%(curdir)s/%(datadir)s/anchor3.html#threefour +real url file://%(curdir)s/%(datadir)s/anchor3.html#threefour +name anchor3 four from 3 +valid + +url #oneone +cache key file://%(curdir)s/%(datadir)s/anchor1.html#oneone +real url file://%(curdir)s/%(datadir)s/anchor1.html#oneone +name anchor1 one from 1 +valid + +url #twoone +cache key file://%(curdir)s/%(datadir)s/anchor2.html#twoone +real url file://%(curdir)s/%(datadir)s/anchor2.html#twoone +name anchor2 one from 2 +valid + +url #threeone +cache key file://%(curdir)s/%(datadir)s/anchor3.html#threeone +real url file://%(curdir)s/%(datadir)s/anchor3.html#threeone +name anchor3 one from 3 +valid + +url anchor1.html#onefour +cache key file://%(curdir)s/%(datadir)s/anchor1.html#onefour +real url file://%(curdir)s/%(datadir)s/anchor1.html#onefour +name anchor1 four from 3 +valid + +url anchor1.html#onethree +cache key file://%(curdir)s/%(datadir)s/anchor1.html#onethree +real url file://%(curdir)s/%(datadir)s/anchor1.html#onethree +name anchor1 three from 3 +valid + +url anchor1.html#onetwo +cache key file://%(curdir)s/%(datadir)s/anchor1.html#onetwo +real url file://%(curdir)s/%(datadir)s/anchor1.html#onetwo +name anchor1 two from 3 +valid + +url anchor2.html#twothree +cache key file://%(curdir)s/%(datadir)s/anchor2.html#twothree +real url file://%(curdir)s/%(datadir)s/anchor2.html#twothree +name anchor2 three from 1 +valid + +url anchor2.html#twotwo +cache key file://%(curdir)s/%(datadir)s/anchor2.html#twotwo +real url file://%(curdir)s/%(datadir)s/anchor2.html#twotwo +name anchor2 two from 1 +valid + +url anchor3.html#threethree +cache key file://%(curdir)s/%(datadir)s/anchor3.html#threethree +real url file://%(curdir)s/%(datadir)s/anchor3.html#threethree +name anchor3 three from 2 +valid + +url anchor3.html#threetwo +cache key file://%(curdir)s/%(datadir)s/anchor3.html#threetwo +real url 
file://%(curdir)s/%(datadir)s/anchor3.html#threetwo +name anchor3 two from 2 +valid + +url file://%(curdir)s/%(datadir)s/anchor1.html +cache key file://%(curdir)s/%(datadir)s/anchor1.html +real url file://%(curdir)s/%(datadir)s/anchor1.html +valid diff --git a/tests/checker/data/anchor2.html b/tests/checker/data/anchor2.html new file mode 100644 index 00000000..781bc029 --- /dev/null +++ b/tests/checker/data/anchor2.html @@ -0,0 +1,13 @@ + +two one +two two +two three +two four + +<-- links --> +anchor2 one from 2 +anchor3 two from 2 +anchor3 three from 2 + +anchor2 four from 2 +anchor1 one from 2 diff --git a/tests/checker/data/anchor3.html b/tests/checker/data/anchor3.html new file mode 100644 index 00000000..a6fd1546 --- /dev/null +++ b/tests/checker/data/anchor3.html @@ -0,0 +1,13 @@ + +three one +three two +three three +three four + +<-- links --> +anchor3 one from 3 +anchor1 two from 3 +anchor1 three from 3 + +anchor3 four from 3 +anchor1 four from 3 diff --git a/tests/checker/test_anchor.py b/tests/checker/test_anchor.py index 186ca05d..df981cec 100644 --- a/tests/checker/test_anchor.py +++ b/tests/checker/test_anchor.py @@ -49,3 +49,18 @@ class TestHttpAnchor(HttpServerTest): def test_anchor_html(self): confargs = dict(enabledplugins=["AnchorCheck"], recursionlevel=1) self.file_test("http_anchor.html", confargs=confargs) + + +class TestAnchorsAcrossMultipleFiles(LinkCheckTest): + """Test anchors when there are multiple files""" + + def test_anchor1_file(self): + """ + Test a network of files that reference each other, starting with anchor1.html + """ + filename = "anchor1.html" + confargs = {"enabledplugins": ["AnchorCheck"]} + url = "file://%(curdir)s/%(datadir)s/%(filename)s" % self.get_attrs( + filename=filename) + resultlines = self.get_resultlines(filename) + self.direct(url, resultlines, recursionlevel=4, confargs=confargs) From 6499b7b2334a499fdede9d58aa9f67adf0ce4ac9 Mon Sep 17 00:00:00 2001 From: Nathan Arthur Date: Mon, 3 Oct 2022 19:33:05 +0100 
Subject: [PATCH 07/10] Fix a major thread-safety bug in AnchorCheck The threading issue has been there for years, but I didn't notice it until after I thought I was done, while I was doing manual testing (with threads re-enabled). The problem was with storing URL-specific state (.anchors) on the AnchorCheck object itself, because there's only one global AnchorCheck object, so all the threads are competing to use that one single variable (self.anchors). The solution was to create a new object to hold .anchors, for each processed URL. --- linkcheck/plugins/anchorcheck.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/linkcheck/plugins/anchorcheck.py b/linkcheck/plugins/anchorcheck.py index 096fc638..eda21535 100644 --- a/linkcheck/plugins/anchorcheck.py +++ b/linkcheck/plugins/anchorcheck.py @@ -33,10 +33,18 @@ class AnchorCheck(_ContentPlugin): def check(self, url_data): """Check content for invalid anchors.""" log.debug(LOG_PLUGIN, "checking content for invalid anchors") + url_anchor_check = UrlAnchorCheck() + linkparse.find_links( + url_data.get_soup(), url_anchor_check.add_anchor, linkparse.AnchorTags) + url_anchor_check.check_anchor(url_data) + + +class UrlAnchorCheck: + """Class to thread-safely handle collecting anchors for a URL""" + + def __init__(self): # list of parsed anchors self.anchors = [] - linkparse.find_links(url_data.get_soup(), self.add_anchor, linkparse.AnchorTags) - self.check_anchor(url_data) def add_anchor(self, url, line, column, name, base): """Add anchor URL.""" From 4cdaa59fcc58454ac4a5b96fed66780894dded1b Mon Sep 17 00:00:00 2001 From: Nathan Arthur Date: Mon, 3 Oct 2022 19:33:05 +0100 Subject: [PATCH 08/10] Fix AnchorCheck mismatching encoded anchors Problem identified by Christian Kirchhof. 
--- linkcheck/plugins/anchorcheck.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/linkcheck/plugins/anchorcheck.py b/linkcheck/plugins/anchorcheck.py index eda21535..f7576b36 100644 --- a/linkcheck/plugins/anchorcheck.py +++ b/linkcheck/plugins/anchorcheck.py @@ -54,17 +54,20 @@ class UrlAnchorCheck: """If URL is valid, parseable and has an anchor, check it. A warning is logged and True is returned if the anchor is not found. """ - log.debug(LOG_PLUGIN, "checking anchor %r in %s", url_data.anchor, self.anchors) - if any(x for x in self.anchors if urllib.parse.quote(x[0]) == url_data.anchor): + decoded_anchor = urllib.parse.unquote( + url_data.anchor, encoding=url_data.encoding) + log.debug(LOG_PLUGIN, "checking anchor %r (decoded: %r) in %s", + url_data.anchor, decoded_anchor, self.anchors) + if any(x for x in self.anchors if x[0] == decoded_anchor): return if self.anchors: anchornames = sorted(set("`%s'" % x[0] for x in self.anchors)) anchors = ", ".join(anchornames) else: anchors = "-" - args = {"name": url_data.anchor, "anchors": anchors} + args = {"name": url_data.anchor, "decoded": decoded_anchor, "anchors": anchors} msg = "%s %s" % ( - _("Anchor `%(name)s' not found.") % args, + _("Anchor `%(name)s' (decoded: `%(decoded)s') not found.") % args, _("Available anchors: %(anchors)s.") % args, ) url_data.add_warning(msg) From 33036803b08ca7caa9eab1218afd0abd1152bc4d Mon Sep 17 00:00:00 2001 From: Nathan Arthur Date: Mon, 3 Oct 2022 19:33:05 +0100 Subject: [PATCH 09/10] Fix a difference in anchor quoting between http and file "I added a test for file:// processing, and it was showing different results for when the URL anchor was and wasn't quoted. I tracked it down to code in fileurl.py that was calling url_norm, and I'm pretty sure the code is unnecessary at this point. But I made a minimally-invasive change, to be as safe as possible." 
UrlBase.build_url() in line 174 also calls url_norm() --- linkcheck/checker/fileurl.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py index d27b6d4a..032a6542 100644 --- a/linkcheck/checker/fileurl.py +++ b/linkcheck/checker/fileurl.py @@ -170,11 +170,7 @@ class FileUrl(urlbase.UrlBase): # of the base URL are removed first. # Otherwise the join function thinks the query is part of # the file name. - from .urlbase import url_norm - - # norm base url - can raise UnicodeError from url.idna_encode() - base_url, is_idn = url_norm(self.base_url, self.encoding) - urlparts = list(urllib.parse.urlsplit(base_url)) + urlparts = list(urllib.parse.urlsplit(self.base_url)) # ignore query part for filesystem urls urlparts[3] = '' self.base_url = urlutil.urlunsplit(urlparts) From 2d1bf6ef986c9f5ed4028e5510eaabda73e35755 Mon Sep 17 00:00:00 2001 From: Nathan Arthur Date: Mon, 3 Oct 2022 19:33:05 +0100 Subject: [PATCH 10/10] Add tests for encoded anchors for file: and http: I started with a test of urlencoded anchors, assuming that the URL might have a urlencoded anchor, but the actual anchor in the HTML would NOT be urlencoded. 
--- tests/checker/data/http_anchor.html.result | 2 +- tests/checker/data/urlencoding_anchor.html | 25 +++++++++++ .../data/urlencoding_anchor.html.file.result | 42 +++++++++++++++++++ .../data/urlencoding_anchor.html.result | 42 +++++++++++++++++++ tests/checker/test_anchor.py | 41 ++++++++++++++---- 5 files changed, 143 insertions(+), 9 deletions(-) create mode 100644 tests/checker/data/urlencoding_anchor.html create mode 100644 tests/checker/data/urlencoding_anchor.html.file.result create mode 100644 tests/checker/data/urlencoding_anchor.html.result diff --git a/tests/checker/data/http_anchor.html.result b/tests/checker/data/http_anchor.html.result index b9ebf0c7..d2fb5c25 100644 --- a/tests/checker/data/http_anchor.html.result +++ b/tests/checker/data/http_anchor.html.result @@ -7,7 +7,7 @@ url http_anchor.html#bad cache key http://localhost:%(port)d/%(datadir)s/http_anchor.html#bad real url http://localhost:%(port)d/%(datadir)s/http_anchor.html#bad name a_bad -warning Anchor `bad' not found. Available anchors: `good'. +warning Anchor `bad' (decoded: `bad') not found. Available anchors: `good'. 
valid url http_anchor.html#good diff --git a/tests/checker/data/urlencoding_anchor.html b/tests/checker/data/urlencoding_anchor.html new file mode 100644 index 00000000..c8b059c4 --- /dev/null +++ b/tests/checker/data/urlencoding_anchor.html @@ -0,0 +1,25 @@ + +Good target +Needs-encoded target #1 +Needs-encoded target #2 +Target for partially-encoded testing + + +No anchor, will not be tested +Good anchor +Bad anchor + + +Non-encoded anchor +Partially-encoded version of the same anchor +Fully-encoded version of the same anchor + + +Distinct encoded anchor + + +Bad encoded anchor, partially encoded +Bad encoded anchor, fully encoded + + +Partially-encoded anchor, for testing diff --git a/tests/checker/data/urlencoding_anchor.html.file.result b/tests/checker/data/urlencoding_anchor.html.file.result new file mode 100644 index 00000000..88f86c13 --- /dev/null +++ b/tests/checker/data/urlencoding_anchor.html.file.result @@ -0,0 +1,42 @@ +url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html +cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html +real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html +valid + +url urlencoding_anchor.html#another_with_pipes_%%7C_and_parentheses_%%28and%%E2%%80%%94em-dashes%%29 +cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#another_with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes) +real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#another_with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes) +name Distinct encoded anchor +valid + +url urlencoding_anchor.html#bad +cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#bad +real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#bad +name Bad anchor +warning Anchor `bad' (decoded: `bad') not found. Available anchors: `another_with_pipes_|_and_parentheses_(and—em-dashes)', `good', `partially_|_(encoded_em—dash)', `with_pipes_|_and_parentheses_(and—em-dashes)'. 
+valid + +url urlencoding_anchor.html#bad_|_%%28and%%E2%%80%%94em-dashes%%29 +cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#bad_%%7C_(and%%E2%%80%%94em-dashes) +real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#bad_%%7C_(and%%E2%%80%%94em-dashes) +name Bad encoded anchor, partially encoded +warning Anchor `bad_%%7C_(and%%E2%%80%%94em-dashes)' (decoded: `bad_|_(and—em-dashes)') not found. Available anchors: `another_with_pipes_|_and_parentheses_(and—em-dashes)', `good', `partially_|_(encoded_em—dash)', `with_pipes_|_and_parentheses_(and—em-dashes)'. +valid + +url urlencoding_anchor.html#good +cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#good +real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#good +name Good anchor +valid + +url urlencoding_anchor.html#partially_%%7C_(encoded_em—dash) +cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#partially_%%7C_(encoded_em%%E2%%80%%94dash) +real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#partially_%%7C_(encoded_em%%E2%%80%%94dash) +name Partially-encoded anchor, for testing +valid + +url urlencoding_anchor.html#with_pipes_|_and_parentheses_(and—em-dashes) +cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes) +real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes) +name Non-encoded anchor +valid diff --git a/tests/checker/data/urlencoding_anchor.html.result b/tests/checker/data/urlencoding_anchor.html.result new file mode 100644 index 00000000..4793690c --- /dev/null +++ b/tests/checker/data/urlencoding_anchor.html.result @@ -0,0 +1,42 @@ +url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html +cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html +real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html +valid + +url 
urlencoding_anchor.html#another_with_pipes_%%7C_and_parentheses_%%28and%%E2%%80%%94em-dashes%%29 +cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#another_with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes) +real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#another_with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes) +name Distinct encoded anchor +valid + +url urlencoding_anchor.html#bad +cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#bad +real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#bad +name Bad anchor +warning Anchor `bad' (decoded: `bad') not found. Available anchors: `another_with_pipes_|_and_parentheses_(and—em-dashes)', `good', `partially_|_(encoded_em—dash)', `with_pipes_|_and_parentheses_(and—em-dashes)'. +valid + +url urlencoding_anchor.html#bad_|_%%28and%%E2%%80%%94em-dashes%%29 +cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#bad_%%7C_(and%%E2%%80%%94em-dashes) +real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#bad_%%7C_(and%%E2%%80%%94em-dashes) +name Bad encoded anchor, partially encoded +warning Anchor `bad_%%7C_(and%%E2%%80%%94em-dashes)' (decoded: `bad_|_(and—em-dashes)') not found. Available anchors: `another_with_pipes_|_and_parentheses_(and—em-dashes)', `good', `partially_|_(encoded_em—dash)', `with_pipes_|_and_parentheses_(and—em-dashes)'. 
+valid + +url urlencoding_anchor.html#good +cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#good +real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#good +name Good anchor +valid + +url urlencoding_anchor.html#partially_%%7C_(encoded_em—dash) +cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#partially_%%7C_(encoded_em%%E2%%80%%94dash) +real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#partially_%%7C_(encoded_em%%E2%%80%%94dash) +name Partially-encoded anchor, for testing +valid + +url urlencoding_anchor.html#with_pipes_|_and_parentheses_(and—em-dashes) +cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes) +real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes) +name Non-encoded anchor +valid diff --git a/tests/checker/test_anchor.py b/tests/checker/test_anchor.py index df981cec..097e6a2d 100644 --- a/tests/checker/test_anchor.py +++ b/tests/checker/test_anchor.py @@ -20,22 +20,23 @@ from . import LinkCheckTest from .httpserver import HttpServerTest -class TestAnchor(LinkCheckTest): +class TestFileAnchor(LinkCheckTest): """ - Test anchor checking of HTML pages. + Test anchor checking of HTML files. """ - def test_anchor(self): + def test_anchor_file(self): confargs = {"enabledplugins": ["AnchorCheck"]} anchor = "broken" url = "file://%(curdir)s/%(datadir)s/anchor.html#%(anchor)s" % self.get_attrs( anchor=anchor) nurl = self.norm(url) resultlines = [ - "url %s" % url, - "cache key %s" % nurl, - "real url %s" % nurl, - "warning Anchor `%s' not found. Available anchors: `myid:'." % anchor, + f"url {url}", + f"cache key {nurl}", + f"real url {nurl}", + f"warning Anchor `{anchor}' (decoded: `{anchor}') not found." 
+ " Available anchors: `myid:'.", "valid", ] self.direct(url, resultlines, confargs=confargs) @@ -46,11 +47,35 @@ class TestHttpAnchor(HttpServerTest): Test checking of HTML pages containing links to anchors served over http. """ - def test_anchor_html(self): + def test_anchor_http(self): confargs = dict(enabledplugins=["AnchorCheck"], recursionlevel=1) self.file_test("http_anchor.html", confargs=confargs) +class TestEncodedAnchors(HttpServerTest): + """Test HTML pages containing urlencoded links to anchors""" + + def test_anchor_encoded_http(self): + """ + http:// + """ + confargs = dict(enabledplugins=["AnchorCheck"], recursionlevel=1) + self.file_test("urlencoding_anchor.html", confargs=confargs) + + def test_anchor_encoded_file(self): + """ + file:// + This should have identical behavior as http:// + """ + filename = "urlencoding_anchor.html" + confargs = {"enabledplugins": ["AnchorCheck"]} + url = "file://%(curdir)s/%(datadir)s/%(filename)s" % self.get_attrs( + filename=filename) + # get results from the special result file that has `.file.` in its name + resultlines = self.get_resultlines(f"{filename}.file") + self.direct(url, resultlines, recursionlevel=1, confargs=confargs) + + class TestAnchorsAcrossMultipleFiles(LinkCheckTest): """Test anchors when there are multiple files"""