From a7eacd620011dbed3422a62e8c6f3fce9d1bf7df Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Mon, 27 Jul 2020 19:22:32 +0100 Subject: [PATCH 1/2] Add a test for a page with links to anchors Query and fragment URL parts for filesystem URLs are ignored, therefore test over http. --- tests/checker/data/http_anchor.html | 4 ++++ tests/checker/data/http_anchor.html.result | 17 +++++++++++++++++ tests/checker/test_anchor.py | 11 +++++++++++ 3 files changed, 32 insertions(+) create mode 100644 tests/checker/data/http_anchor.html create mode 100644 tests/checker/data/http_anchor.html.result diff --git a/tests/checker/data/http_anchor.html b/tests/checker/data/http_anchor.html new file mode 100644 index 00000000..8ba01be3 --- /dev/null +++ b/tests/checker/data/http_anchor.html @@ -0,0 +1,4 @@ +

+a +a_good +a_bad diff --git a/tests/checker/data/http_anchor.html.result b/tests/checker/data/http_anchor.html.result new file mode 100644 index 00000000..b9ebf0c7 --- /dev/null +++ b/tests/checker/data/http_anchor.html.result @@ -0,0 +1,17 @@ +url http://localhost:%(port)d/%(datadir)s/http_anchor.html +cache key http://localhost:%(port)d/%(datadir)s/http_anchor.html +real url http://localhost:%(port)d/%(datadir)s/http_anchor.html +valid + +url http_anchor.html#bad +cache key http://localhost:%(port)d/%(datadir)s/http_anchor.html#bad +real url http://localhost:%(port)d/%(datadir)s/http_anchor.html#bad +name a_bad +warning Anchor `bad' not found. Available anchors: `good'. +valid + +url http_anchor.html#good +cache key http://localhost:%(port)d/%(datadir)s/http_anchor.html#good +real url http://localhost:%(port)d/%(datadir)s/http_anchor.html#good +name a_good +valid diff --git a/tests/checker/test_anchor.py b/tests/checker/test_anchor.py index 8b980a15..7848e9e1 100644 --- a/tests/checker/test_anchor.py +++ b/tests/checker/test_anchor.py @@ -17,6 +17,7 @@ Test html anchor parsing and checking. """ from . import LinkCheckTest +from .httpserver import HttpServerTest class TestAnchor(LinkCheckTest): @@ -38,3 +39,13 @@ class TestAnchor(LinkCheckTest): "valid", ] self.direct(urlanchor, resultlines, confargs=confargs) + + +class TestHttpAnchor(HttpServerTest): + """ + Test checking of HTML pages containing links to anchors served over http. + """ + + def test_anchor_html(self): + confargs = dict(enabledplugins=["AnchorCheck"], recursionlevel=1) + self.file_test("http_anchor.html", confargs=confargs) From 0912e8a2c1ccc0e1a901f27f4176ef9e0644f06a Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Mon, 27 Jul 2020 19:25:30 +0100 Subject: [PATCH 2/2] Don't strip the URL fragment from cache key if using AnchorCheck Else once one URL for a page has been checked, URLs with different fragments are skipped and not passed to AnchorCheck. eaa538c ("don't check one url multiple times", 2016-11-09) --- linkcheck/checker/urlbase.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index daa3135c..500d7aba 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -363,11 +363,13 @@ class UrlBase: def set_cache_url(self): """Set the URL to be used for caching.""" - # remove anchor from cached target url since we assume - # URLs with different anchors to have the same content - self.cache_url = urlutil.urlunsplit(self.urlparts[:4] + ['']) - if self.cache_url is not None: - assert isinstance(self.cache_url, str), repr(self.cache_url) + if "AnchorCheck" in self.aggregate.config["enabledplugins"]: + self.cache_url = self.url + else: + # remove anchor from cached target url since we assume + # URLs with different anchors to have the same content + self.cache_url = urlutil.urlunsplit(self.urlparts[:4] + ['']) + log.debug(LOG_CHECK, "cache_url '%s'", self.cache_url) def check_syntax(self): """