Merge pull request #669 from cjmayo/anchorcheck

Re-enable AnchorCheck plugin
This commit is contained in:
Chris Mayo 2022-10-03 19:36:08 +01:00 committed by GitHub
commit d9265bb71c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 313 additions and 51 deletions

View file

@ -474,10 +474,6 @@ AnchorCheck
Checks validity of HTML anchors.
.. note::
The AnchorCheck plugin is currently broken and is disabled.
LocationInfo
^^^^^^^^^^^^

View file

@ -122,6 +122,14 @@ class FileUrl(urlbase.UrlBase):
)
self.scheme = 'file'
def reset(self):
super().reset()
# the local file URI
self.url_without_anchor = None
# including the anchor in self.url allows the AnchorCheck plugin to be
# used when checking files. The anchor is stripped in UrlBase.set_cache_url()
# if AnchorCheck is not being used.
def build_base_url(self):
"""The URL is normed according to the platform:
- the base URL is made an absolute *file://* URL
@ -162,23 +170,20 @@ class FileUrl(urlbase.UrlBase):
# of the base URL are removed first.
# Otherwise the join function thinks the query is part of
# the file name.
from .urlbase import url_norm
# norm base url - can raise UnicodeError from url.idna_encode()
base_url, is_idn = url_norm(self.base_url, self.encoding)
urlparts = list(urllib.parse.urlsplit(base_url))
urlparts = list(urllib.parse.urlsplit(self.base_url))
# ignore query part for filesystem urls
urlparts[3] = ''
self.base_url = urlutil.urlunsplit(urlparts)
super().build_url()
# ignore query and fragment url parts for filesystem urls
self.urlparts[3] = self.urlparts[4] = ''
# ignore query url part for filesystem urls
self.urlparts[3] = ''
if self.is_directory() and not self.urlparts[2].endswith('/'):
self.add_warning(
_("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH
)
self.urlparts[2] += '/'
self.url = urlutil.urlunsplit(self.urlparts)
self.url_without_anchor = urlutil.urlunsplit(self.urlparts[:4] + [''])
def add_size_info(self):
"""Get size of file content and modification time from filename path."""
@ -204,7 +209,7 @@ class FileUrl(urlbase.UrlBase):
if self.is_directory():
self.set_result(_("directory"))
else:
url = fileutil.path_safe(self.url)
url = fileutil.path_safe(self.url_without_anchor)
self.url_connection = urllib.request.urlopen(url)
self.check_case_sensitivity()
@ -270,7 +275,7 @@ class FileUrl(urlbase.UrlBase):
"""
if self.is_directory():
return True
if firefox.has_sqlite and firefox.extension.search(self.url):
if firefox.has_sqlite and firefox.extension.search(self.url_without_anchor):
return True
return self.is_content_type_parseable()
@ -278,7 +283,8 @@ class FileUrl(urlbase.UrlBase):
"""Return URL content type, or an empty string if content
type could not be found."""
if self.url:
self.content_type = mimeutil.guess_mimetype(self.url, read=self.get_content)
self.content_type = mimeutil.guess_mimetype(
self.url_without_anchor, read=self.get_content)
else:
self.content_type = ""

View file

@ -458,7 +458,7 @@ class UrlBase:
self.base_ref = urljoin(self.parent_url, self.base_ref)
self.url = urljoin(self.base_ref, base_url)
elif self.parent_url:
# strip the parent url query and anchor
# strip the parent url anchor
urlparts = list(urllib.parse.urlsplit(self.parent_url))
urlparts[4] = ""
parent_url = urlutil.urlunsplit(urlparts)
@ -512,7 +512,7 @@ class UrlBase:
urlparts[1] = "%s@%s" % (self.userinfo, host)
else:
urlparts[1] = host
# safe anchor for later checking
# save anchor for later checking
self.anchor = split.fragment
if self.anchor is not None:
assert isinstance(self.anchor, str), repr(self.anchor)

View file

@ -26,25 +26,25 @@ from ..htmlutil import linkparse
class AnchorCheck(_ContentPlugin):
"""Checks validity of HTML anchors."""
def __init__(self, config):
"""Initialize plugin."""
super().__init__(config)
log.warn(
LOG_PLUGIN, _("Anchor check plugin is broken. Fixes welcome.")
)
def applies_to(self, url_data):
"""Check for HTML anchor existence."""
return False # XXX Plugin disabled
return url_data.is_html() and url_data.anchor
def check(self, url_data):
"""Check content for invalid anchors."""
log.debug(LOG_PLUGIN, "checking content for invalid anchors")
url_anchor_check = UrlAnchorCheck()
linkparse.find_links(
url_data.get_soup(), url_anchor_check.add_anchor, linkparse.AnchorTags)
url_anchor_check.check_anchor(url_data)
class UrlAnchorCheck:
"""Class to thread-safely handle collecting anchors for a URL"""
def __init__(self):
# list of parsed anchors
self.anchors = []
linkparse.find_links(url_data.get_soup(), self.add_anchor, linkparse.AnchorTags)
self.check_anchor(url_data)
def add_anchor(self, url, line, column, name, base):
"""Add anchor URL."""
@ -54,17 +54,20 @@ class AnchorCheck(_ContentPlugin):
"""If URL is valid, parseable and has an anchor, check it.
A warning is added to url_data if the anchor is not found.
"""
log.debug(LOG_PLUGIN, "checking anchor %r in %s", url_data.anchor, self.anchors)
if any(x for x in self.anchors if urllib.parse.quote(x[0]) == url_data.anchor):
decoded_anchor = urllib.parse.unquote(
url_data.anchor, encoding=url_data.encoding)
log.debug(LOG_PLUGIN, "checking anchor %r (decoded: %r) in %s",
url_data.anchor, decoded_anchor, self.anchors)
if any(x for x in self.anchors if x[0] == decoded_anchor):
return
if self.anchors:
anchornames = sorted(set("`%s'" % x[0] for x in self.anchors))
anchors = ", ".join(anchornames)
else:
anchors = "-"
args = {"name": url_data.anchor, "anchors": anchors}
args = {"name": url_data.anchor, "decoded": decoded_anchor, "anchors": anchors}
msg = "%s %s" % (
_("Anchor `%(name)s' not found.") % args,
_("Anchor `%(name)s' (decoded: `%(decoded)s') not found.") % args,
_("Available anchors: %(anchors)s.") % args,
)
url_data.add_warning(msg)

View file

@ -0,0 +1,10 @@
<!-- targets -->
<a name="oneone">one one</a>
<a name="onetwo">one two</a>
<a name="onethree">one three</a>
<a name="onefour">one four</a>
<!-- links -->
<a href="#oneone">anchor1 one from 1</a>
<a href="anchor2.html#twotwo">anchor2 two from 1</a>
<a href="anchor2.html#twothree">anchor2 three from 1</a>

View file

@ -0,0 +1,76 @@
url #twofour
cache key file://%(curdir)s/%(datadir)s/anchor2.html#twofour
real url file://%(curdir)s/%(datadir)s/anchor2.html#twofour
name anchor2 four from 2
valid
url #threefour
cache key file://%(curdir)s/%(datadir)s/anchor3.html#threefour
real url file://%(curdir)s/%(datadir)s/anchor3.html#threefour
name anchor3 four from 3
valid
url #oneone
cache key file://%(curdir)s/%(datadir)s/anchor1.html#oneone
real url file://%(curdir)s/%(datadir)s/anchor1.html#oneone
name anchor1 one from 1
valid
url #twoone
cache key file://%(curdir)s/%(datadir)s/anchor2.html#twoone
real url file://%(curdir)s/%(datadir)s/anchor2.html#twoone
name anchor2 one from 2
valid
url #threeone
cache key file://%(curdir)s/%(datadir)s/anchor3.html#threeone
real url file://%(curdir)s/%(datadir)s/anchor3.html#threeone
name anchor3 one from 3
valid
url anchor1.html#onefour
cache key file://%(curdir)s/%(datadir)s/anchor1.html#onefour
real url file://%(curdir)s/%(datadir)s/anchor1.html#onefour
name anchor1 four from 3
valid
url anchor1.html#onethree
cache key file://%(curdir)s/%(datadir)s/anchor1.html#onethree
real url file://%(curdir)s/%(datadir)s/anchor1.html#onethree
name anchor1 three from 3
valid
url anchor1.html#onetwo
cache key file://%(curdir)s/%(datadir)s/anchor1.html#onetwo
real url file://%(curdir)s/%(datadir)s/anchor1.html#onetwo
name anchor1 two from 3
valid
url anchor2.html#twothree
cache key file://%(curdir)s/%(datadir)s/anchor2.html#twothree
real url file://%(curdir)s/%(datadir)s/anchor2.html#twothree
name anchor2 three from 1
valid
url anchor2.html#twotwo
cache key file://%(curdir)s/%(datadir)s/anchor2.html#twotwo
real url file://%(curdir)s/%(datadir)s/anchor2.html#twotwo
name anchor2 two from 1
valid
url anchor3.html#threethree
cache key file://%(curdir)s/%(datadir)s/anchor3.html#threethree
real url file://%(curdir)s/%(datadir)s/anchor3.html#threethree
name anchor3 three from 2
valid
url anchor3.html#threetwo
cache key file://%(curdir)s/%(datadir)s/anchor3.html#threetwo
real url file://%(curdir)s/%(datadir)s/anchor3.html#threetwo
name anchor3 two from 2
valid
url file://%(curdir)s/%(datadir)s/anchor1.html
cache key file://%(curdir)s/%(datadir)s/anchor1.html
real url file://%(curdir)s/%(datadir)s/anchor1.html
valid

View file

@ -0,0 +1,13 @@
<!-- targets -->
<a name="twoone">two one</a>
<a name="twotwo">two two</a>
<a name="twothree">two three</a>
<a name="twofour">two four</a>
<!-- links -->
<a href="#twoone">anchor2 one from 2</a>
<a href="anchor3.html#threetwo">anchor3 two from 2</a>
<a href="anchor3.html#threethree">anchor3 three from 2</a>
<a href="#twofour">anchor2 four from 2</a>
<a href="anchor1.html#oneone">anchor1 one from 2</a>

View file

@ -0,0 +1,13 @@
<!-- targets -->
<a name="threeone">three one</a>
<a name="threetwo">three two</a>
<a name="threethree">three three</a>
<a name="threefour">three four</a>
<!-- links -->
<a href="#threeone">anchor3 one from 3</a>
<a href="anchor1.html#onetwo">anchor1 two from 3</a>
<a href="anchor1.html#onethree">anchor1 three from 3</a>
<a href="#threefour">anchor3 four from 3</a>
<a href="anchor1.html#onefour">anchor1 four from 3</a>

View file

@ -7,7 +7,7 @@ url http_anchor.html#bad
cache key http://localhost:%(port)d/%(datadir)s/http_anchor.html#bad
real url http://localhost:%(port)d/%(datadir)s/http_anchor.html#bad
name a_bad
warning Anchor `bad' not found. Available anchors: `good'.
warning Anchor `bad' (decoded: `bad') not found. Available anchors: `good'.
valid
url http_anchor.html#good

View file

@ -0,0 +1,25 @@
<!-- targets -->
<a name="good">Good target</a>
<a name="with_pipes_|_and_parentheses_(and—em-dashes)">Needs-encoded target #1</a>
<a name="another_with_pipes_|_and_parentheses_(and—em-dashes)">Needs-encoded target #2</a>
<a name="partially_|_(encoded_em—dash)">Target for partially-encoded testing</a>
<!-- simple anchors -->
<a href="urlencoding_anchor.html">No anchor, will not be tested</a>
<a href="urlencoding_anchor.html#good">Good anchor</a>
<a href="urlencoding_anchor.html#bad">Bad anchor</a>
<!-- different encodings of the same complex anchor -->
<a href="urlencoding_anchor.html#with_pipes_|_and_parentheses_(and—em-dashes)">Non-encoded anchor</a>
<a href="urlencoding_anchor.html#with_pipes_|_and_parentheses_%28and%E2%80%94em-dashes%29">Partially-encoded version of the same anchor</a>
<a href="urlencoding_anchor.html#with_pipes_%7C_and_parentheses_%28and%E2%80%94em-dashes%29">Fully-encoded version of the same anchor</a>
<!-- just another complex anchor -->
<a href="urlencoding_anchor.html#another_with_pipes_%7C_and_parentheses_%28and%E2%80%94em-dashes%29">Distinct encoded anchor</a>
<!-- bad anchor, encoded two ways -->
<a href="urlencoding_anchor.html#bad_|_%28and%E2%80%94em-dashes%29">Bad encoded anchor, partially encoded</a>
<a href="urlencoding_anchor.html#bad_%7C_%28and%E2%80%94em-dashes%29">Bad encoded anchor, fully encoded</a>
<!-- stand-alone test for partially-encoded anchor -->
<a href="urlencoding_anchor.html#partially_%7C_(encoded_em—dash)">Partially-encoded anchor, for testing</a>

View file

@ -0,0 +1,42 @@
url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html
cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html
real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html
valid
url urlencoding_anchor.html#another_with_pipes_%%7C_and_parentheses_%%28and%%E2%%80%%94em-dashes%%29
cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#another_with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes)
real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#another_with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes)
name Distinct encoded anchor
valid
url urlencoding_anchor.html#bad
cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#bad
real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#bad
name Bad anchor
warning Anchor `bad' (decoded: `bad') not found. Available anchors: `another_with_pipes_|_and_parentheses_(and—em-dashes)', `good', `partially_|_(encoded_em—dash)', `with_pipes_|_and_parentheses_(and—em-dashes)'.
valid
url urlencoding_anchor.html#bad_|_%%28and%%E2%%80%%94em-dashes%%29
cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#bad_%%7C_(and%%E2%%80%%94em-dashes)
real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#bad_%%7C_(and%%E2%%80%%94em-dashes)
name Bad encoded anchor, partially encoded
warning Anchor `bad_%%7C_(and%%E2%%80%%94em-dashes)' (decoded: `bad_|_(and—em-dashes)') not found. Available anchors: `another_with_pipes_|_and_parentheses_(and—em-dashes)', `good', `partially_|_(encoded_em—dash)', `with_pipes_|_and_parentheses_(and—em-dashes)'.
valid
url urlencoding_anchor.html#good
cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#good
real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#good
name Good anchor
valid
url urlencoding_anchor.html#partially_%%7C_(encoded_em—dash)
cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#partially_%%7C_(encoded_em%%E2%%80%%94dash)
real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#partially_%%7C_(encoded_em%%E2%%80%%94dash)
name Partially-encoded anchor, for testing
valid
url urlencoding_anchor.html#with_pipes_|_and_parentheses_(and—em-dashes)
cache key file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes)
real url file://%(curdir)s/%(datadir)s/urlencoding_anchor.html#with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes)
name Non-encoded anchor
valid

View file

@ -0,0 +1,42 @@
url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html
cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html
real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html
valid
url urlencoding_anchor.html#another_with_pipes_%%7C_and_parentheses_%%28and%%E2%%80%%94em-dashes%%29
cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#another_with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes)
real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#another_with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes)
name Distinct encoded anchor
valid
url urlencoding_anchor.html#bad
cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#bad
real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#bad
name Bad anchor
warning Anchor `bad' (decoded: `bad') not found. Available anchors: `another_with_pipes_|_and_parentheses_(and—em-dashes)', `good', `partially_|_(encoded_em—dash)', `with_pipes_|_and_parentheses_(and—em-dashes)'.
valid
url urlencoding_anchor.html#bad_|_%%28and%%E2%%80%%94em-dashes%%29
cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#bad_%%7C_(and%%E2%%80%%94em-dashes)
real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#bad_%%7C_(and%%E2%%80%%94em-dashes)
name Bad encoded anchor, partially encoded
warning Anchor `bad_%%7C_(and%%E2%%80%%94em-dashes)' (decoded: `bad_|_(and—em-dashes)') not found. Available anchors: `another_with_pipes_|_and_parentheses_(and—em-dashes)', `good', `partially_|_(encoded_em—dash)', `with_pipes_|_and_parentheses_(and—em-dashes)'.
valid
url urlencoding_anchor.html#good
cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#good
real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#good
name Good anchor
valid
url urlencoding_anchor.html#partially_%%7C_(encoded_em—dash)
cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#partially_%%7C_(encoded_em%%E2%%80%%94dash)
real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#partially_%%7C_(encoded_em%%E2%%80%%94dash)
name Partially-encoded anchor, for testing
valid
url urlencoding_anchor.html#with_pipes_|_and_parentheses_(and—em-dashes)
cache key http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes)
real url http://localhost:%(port)d/%(datadir)s/urlencoding_anchor.html#with_pipes_%%7C_and_parentheses_(and%%E2%%80%%94em-dashes)
name Non-encoded anchor
valid

View file

@ -161,9 +161,10 @@ class HttpServerTest(LinkCheckTest):
self.port = None
self.handler = NoQueryHttpRequestHandler
def setUp(self):
def setUp(self, https=False):
"""Start a new HTTP server in a new thread."""
self.port = start_server(self.handler)
super().setUp()
self.port = start_server(self.handler, https)
assert self.port is not None
def tearDown(self):
@ -182,8 +183,7 @@ class HttpsServerTest(HttpServerTest):
def setUp(self):
"""Start a new HTTPS server in a new thread."""
self.port = start_server(self.handler, https=True)
assert self.port is not None
super().setUp(https=True)
def tearDown(self):
"""Send QUIT request to http server."""

View file

@ -16,32 +16,30 @@
"""
Test html anchor parsing and checking.
"""
import pytest
from . import LinkCheckTest
from .httpserver import HttpServerTest
class TestAnchor(LinkCheckTest):
class TestFileAnchor(LinkCheckTest):
"""
Test anchor checking of HTML pages.
Test anchor checking of HTML files.
"""
@pytest.mark.xfail(strict=True)
def test_anchor(self):
def test_anchor_file(self):
confargs = {"enabledplugins": ["AnchorCheck"]}
url = "file://%(curdir)s/%(datadir)s/anchor.html" % self.get_attrs()
nurl = self.norm(url)
anchor = "broken"
urlanchor = url + "#" + anchor
url = "file://%(curdir)s/%(datadir)s/anchor.html#%(anchor)s" % self.get_attrs(
anchor=anchor)
nurl = self.norm(url)
resultlines = [
"url %s" % urlanchor,
"cache key %s" % nurl,
"real url %s" % nurl,
"warning Anchor `%s' not found. Available anchors: `myid:'." % anchor,
f"url {url}",
f"cache key {nurl}",
f"real url {nurl}",
f"warning Anchor `{anchor}' (decoded: `{anchor}') not found."
" Available anchors: `myid:'.",
"valid",
]
self.direct(urlanchor, resultlines, confargs=confargs)
self.direct(url, resultlines, confargs=confargs)
class TestHttpAnchor(HttpServerTest):
@ -49,7 +47,45 @@ class TestHttpAnchor(HttpServerTest):
Test checking of HTML pages containing links to anchors served over http.
"""
@pytest.mark.xfail(strict=True)
def test_anchor_html(self):
def test_anchor_http(self):
confargs = dict(enabledplugins=["AnchorCheck"], recursionlevel=1)
self.file_test("http_anchor.html", confargs=confargs)
class TestEncodedAnchors(HttpServerTest):
"""Test HTML pages containing urlencoded links to anchors"""
def test_anchor_encoded_http(self):
"""
http://
"""
confargs = dict(enabledplugins=["AnchorCheck"], recursionlevel=1)
self.file_test("urlencoding_anchor.html", confargs=confargs)
def test_anchor_encoded_file(self):
"""
file://
This should behave identically to http://
"""
filename = "urlencoding_anchor.html"
confargs = {"enabledplugins": ["AnchorCheck"]}
url = "file://%(curdir)s/%(datadir)s/%(filename)s" % self.get_attrs(
filename=filename)
# get results from the special result file that has `.file.` in its name
resultlines = self.get_resultlines(f"{filename}.file")
self.direct(url, resultlines, recursionlevel=1, confargs=confargs)
class TestAnchorsAcrossMultipleFiles(LinkCheckTest):
"""Test anchors when there are multiple files"""
def test_anchor1_file(self):
"""
Test a network of files that reference each other, starting with anchor1.html
"""
filename = "anchor1.html"
confargs = {"enabledplugins": ["AnchorCheck"]}
url = "file://%(curdir)s/%(datadir)s/%(filename)s" % self.get_attrs(
filename=filename)
resultlines = self.get_resultlines(filename)
self.direct(url, resultlines, recursionlevel=4, confargs=confargs)