Move AnchorCheck local file handling into a new class

When checking local files with AnchorCheck, anchors in URLs
like "example/#anchor" are not supported.

Without AnchorCheck enabled, the Real URL reported for such URLs had been
changed to include the anchor when local file checking was added to
AnchorCheck, even though it is the directory that is checked.
The same URL was then also used as the Parent URL when checking each
of the contents of that directory.

For FileUrl this is a revert of:
c221afda ("Enable AnchorCheck to be used with local files", 2022-10-03)
This commit is contained in:
Chris Mayo 2022-10-24 19:30:56 +01:00
parent b66ca30e84
commit 16bee50068
4 changed files with 88 additions and 17 deletions

View file

@ -475,7 +475,9 @@ options in their section.
AnchorCheck
^^^^^^^^^^^
Checks validity of HTML anchors.
Checks validity of HTML anchors. When checking local files, URLs with anchors
that link to directories e.g. "example/#anchor" are not supported. There is no
such limitation when using http(s).
LocationInfo
^^^^^^^^^^^^
@ -568,6 +570,8 @@ WARNINGS
The following warnings are recognized in the 'ignorewarnings' config
file entry:
**file-anchorcheck-directory**
A local directory with an anchor, not supported by AnchorCheck.
**file-missing-slash**
The file: URL is missing a trailing slash.
**file-system-path**

View file

@ -131,6 +131,9 @@ def get_url_from(
else:
assume_local_file = recursion_level == 0
klass = get_urlclass_from(scheme, assume_local_file=assume_local_file)
if "AnchorCheck" in aggregate.config["enabledplugins"] and \
klass == fileurl.FileUrl:
klass = fileurl.AnchorCheckFileUrl
log.debug(LOG_CHECK, "%s handles url %s", klass.__name__, base_url)
return klass(
base_url,

View file

@ -91,6 +91,7 @@ WARN_URL_TOO_LONG = "url-too-long"
WARN_URL_WHITESPACE = "url-whitespace"
WARN_FILE_MISSING_SLASH = "file-missing-slash"
WARN_FILE_SYSTEM_PATH = "file-system-path"
WARN_FILE_ANCHORCHECK_DIRECTORY = "file-anchorcheck-directory"
WARN_FTP_MISSING_SLASH = "ftp-missing-slash"
WARN_HTTP_EMPTY_CONTENT = "http-empty-content"
WARN_HTTP_COOKIE_STORE_ERROR = "http-cookie-store-error"
@ -113,6 +114,8 @@ Warnings = {
WARN_FILE_SYSTEM_PATH: _(
"The file: path is not the same as the system specific path."
),
WARN_FILE_ANCHORCHECK_DIRECTORY: _(
"A local directory with an anchor, not supported by AnchorCheck."),
WARN_FTP_MISSING_SLASH: _("The ftp: URL is missing a trailing slash."),
WARN_HTTP_EMPTY_CONTENT: _("The URL had no content."),
WARN_HTTP_COOKIE_STORE_ERROR: _("An error occurred while storing a cookie."),

View file

@ -26,7 +26,8 @@ from datetime import datetime
from . import urlbase, get_index_html
from .. import log, LOG_CHECK, fileutil, mimeutil, LinkCheckerError, url as urlutil
from ..bookmarks import firefox
from .const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH
from .const import (WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH,
WARN_FILE_ANCHORCHECK_DIRECTORY)
def get_files(dirname):
@ -122,14 +123,6 @@ class FileUrl(urlbase.UrlBase):
)
self.scheme = 'file'
def reset(self):
super().reset()
# the local file URI
self.url_without_anchor = None
# including the anchor in self.url allows the AnchorCheck plugin to be
# used when checking files. The anchor is stripped in UrlBase.set_cache_url()
# if AnchorCheck is not being used.
def build_base_url(self):
"""The URL is normed according to the platform:
- the base URL is made an absolute *file://* URL
@ -175,15 +168,14 @@ class FileUrl(urlbase.UrlBase):
urlparts[3] = ''
self.base_url = urlutil.urlunsplit(urlparts)
super().build_url()
# ignore query url part for filesystem urls
self.urlparts[3] = ''
# ignore query and fragment url parts for filesystem urls
self.urlparts[3] = self.urlparts[4] = ''
if self.is_directory() and not self.urlparts[2].endswith('/'):
self.add_warning(
_("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH
)
self.urlparts[2] += '/'
self.url = urlutil.urlunsplit(self.urlparts)
self.url_without_anchor = urlutil.urlunsplit(self.urlparts[:4] + [''])
def add_size_info(self):
"""Get size of file content and modification time from filename path."""
@ -209,7 +201,7 @@ class FileUrl(urlbase.UrlBase):
if self.is_directory():
self.set_result(_("directory"))
else:
url = fileutil.path_safe(self.url_without_anchor)
url = fileutil.path_safe(self.url)
self.url_connection = urllib.request.urlopen(url)
self.check_case_sensitivity()
@ -275,7 +267,7 @@ class FileUrl(urlbase.UrlBase):
"""
if self.is_directory():
return True
if firefox.has_sqlite and firefox.extension.search(self.url_without_anchor):
if firefox.has_sqlite and firefox.extension.search(self.url):
return True
return self.is_content_type_parseable()
@ -283,8 +275,7 @@ class FileUrl(urlbase.UrlBase):
"""Set URL content type, or an empty string if content
type could not be found."""
if self.url:
self.content_type = mimeutil.guess_mimetype(
self.url_without_anchor, read=self.get_content)
self.content_type = mimeutil.guess_mimetype(self.url, read=self.get_content)
else:
self.content_type = ""
log.debug(LOG_CHECK, "MIME type: %s", self.content_type)
@ -314,3 +305,73 @@ class FileUrl(urlbase.UrlBase):
url = webroot + url[1:]
log.debug(LOG_CHECK, "Applied local webroot `%s' to `%s'.", webroot, url)
super().add_url(url, line=line, column=column, page=page, name=name, base=base)
class AnchorCheckFileUrl(FileUrl):
    """
    FileUrl variant that keeps the anchor in self.url.

    Including the anchor in self.url allows the AnchorCheck plugin to be
    used when checking local files. The URL with the anchor removed is kept
    in self.url_without_anchor and is used for opening the file and for
    guessing its MIME type.

    Anchors on local directories are not supported; they are reported with
    the WARN_FILE_ANCHORCHECK_DIRECTORY warning.
    """

    def reset(self):
        """Run the inherited reset and initialise url_without_anchor."""
        super().reset()
        # the local file URI
        self.url_without_anchor = None

    def build_url(self):
        """
        Calls UrlBase.build_url() and adds a trailing slash to directories.

        Sets self.url (anchor included) and self.url_without_anchor.
        """
        self.build_base_url()
        if self.parent_url is not None:
            # URL joining with the parent URL only works if the query
            # of the base URL is removed first.
            # Otherwise the join function thinks the query is part of
            # the file name.
            urlparts = list(urllib.parse.urlsplit(self.base_url))
            # ignore query part for filesystem urls
            urlparts[3] = ''
            self.base_url = urlutil.urlunsplit(urlparts)
        # Deliberately skip FileUrl.build_url(), which also clears the
        # fragment; the anchor has to be kept here.
        super(FileUrl, self).build_url()
        # ignore query url part for filesystem urls
        self.urlparts[3] = ''
        if self.is_directory() and not self.urlparts[2].endswith('/'):
            self.add_warning(
                _("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH
            )
            self.urlparts[2] += '/'
        self.url = urlutil.urlunsplit(self.urlparts)
        # same URL with an empty fragment, for accessing the file itself
        self.url_without_anchor = urlutil.urlunsplit(self.urlparts[:4] + [''])

    def check_connection(self):
        """
        Try to open the local file. Under NT systems the case sensitivity
        is checked.

        A directory with an anchor gets a WARN_FILE_ANCHORCHECK_DIRECTORY
        warning and is otherwise treated like any other directory.

        Raises LinkCheckerError if there is a parent URL that is not a
        file: URL.
        """
        if self.parent_url is not None and not self.parent_url.startswith("file:"):
            msg = _(
                "local files are only checked without parent URL or when"
                " the parent URL is also a file"
            )
            raise LinkCheckerError(msg)
        if self.is_directory():
            if self.anchor:
                # Translate the constant message first and interpolate the
                # URL afterwards: an f-string inside _() is formatted before
                # the catalogue lookup, so it could never be translated.
                self.add_warning(
                    _(
                        "URL `%s' is a directory with an anchor."
                        " When checking local files AnchorCheck does not support"
                        " anchors for directories."
                    )
                    % self.url,
                    tag=WARN_FILE_ANCHORCHECK_DIRECTORY,
                )
            self.set_result(_("directory"))
        else:
            # the anchor is not part of the file name
            url = fileutil.path_safe(self.url_without_anchor)
            self.url_connection = urllib.request.urlopen(url)
            self.check_case_sensitivity()

    def set_content_type(self):
        """Set URL content type, or an empty string if content
        type could not be found."""
        if self.url:
            # guess from the URL without the anchor
            self.content_type = mimeutil.guess_mimetype(
                self.url_without_anchor, read=self.get_content)
        else:
            self.content_type = ""
        log.debug(LOG_CHECK, "MIME type: %s", self.content_type)