Merge pull request #683 from cjmayo/anchorcheckfileurl

Move AnchorCheck local file handling into a new class
This commit is contained in:
Chris Mayo 2022-10-31 19:23:27 +00:00 committed by GitHub
commit b796cec346
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 88 additions and 17 deletions

View file

@ -475,7 +475,9 @@ options in their section.
AnchorCheck
^^^^^^^^^^^
Checks validity of HTML anchors.
Checks validity of HTML anchors. When checking local files, URLs with anchors
that link to directories, e.g. "example/#anchor", are not supported. There is
no such limitation when using http(s).
LocationInfo
^^^^^^^^^^^^
@ -568,6 +570,8 @@ WARNINGS
The following warnings are recognized in the 'ignorewarnings' config
file entry:
**file-anchorcheck-directory**
A local directory with an anchor, not supported by AnchorCheck.
**file-missing-slash**
The file: URL is missing a trailing slash.
**file-system-path**

View file

@ -131,6 +131,9 @@ def get_url_from(
else:
assume_local_file = recursion_level == 0
klass = get_urlclass_from(scheme, assume_local_file=assume_local_file)
if "AnchorCheck" in aggregate.config["enabledplugins"] and \
klass == fileurl.FileUrl:
klass = fileurl.AnchorCheckFileUrl
log.debug(LOG_CHECK, "%s handles url %s", klass.__name__, base_url)
return klass(
base_url,

View file

@ -91,6 +91,7 @@ WARN_URL_TOO_LONG = "url-too-long"
WARN_URL_WHITESPACE = "url-whitespace"
WARN_FILE_MISSING_SLASH = "file-missing-slash"
WARN_FILE_SYSTEM_PATH = "file-system-path"
WARN_FILE_ANCHORCHECK_DIRECTORY = "file-anchorcheck-directory"
WARN_FTP_MISSING_SLASH = "ftp-missing-slash"
WARN_HTTP_EMPTY_CONTENT = "http-empty-content"
WARN_HTTP_COOKIE_STORE_ERROR = "http-cookie-store-error"
@ -113,6 +114,8 @@ Warnings = {
WARN_FILE_SYSTEM_PATH: _(
"The file: path is not the same as the system specific path."
),
WARN_FILE_ANCHORCHECK_DIRECTORY: _(
"A local directory with an anchor, not supported by AnchorCheck."),
WARN_FTP_MISSING_SLASH: _("The ftp: URL is missing a trailing slash."),
WARN_HTTP_EMPTY_CONTENT: _("The URL had no content."),
WARN_HTTP_COOKIE_STORE_ERROR: _("An error occurred while storing a cookie."),

View file

@ -26,7 +26,8 @@ from datetime import datetime
from . import urlbase, get_index_html
from .. import log, LOG_CHECK, fileutil, mimeutil, LinkCheckerError, url as urlutil
from ..bookmarks import firefox
from .const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH
from .const import (WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH,
WARN_FILE_ANCHORCHECK_DIRECTORY)
def get_files(dirname):
@ -122,14 +123,6 @@ class FileUrl(urlbase.UrlBase):
)
self.scheme = 'file'
def reset(self):
super().reset()
# the local file URI
self.url_without_anchor = None
# including the anchor in self.url allows the AnchorCheck plugin to be
# used when checking files. The anchor is stripped in UrlBase.set_cache_url()
# if AnchorCheck is not being used.
def build_base_url(self):
"""The URL is normed according to the platform:
- the base URL is made an absolute *file://* URL
@ -175,15 +168,14 @@ class FileUrl(urlbase.UrlBase):
urlparts[3] = ''
self.base_url = urlutil.urlunsplit(urlparts)
super().build_url()
# ignore query url part for filesystem urls
self.urlparts[3] = ''
# ignore query and fragment url parts for filesystem urls
self.urlparts[3] = self.urlparts[4] = ''
if self.is_directory() and not self.urlparts[2].endswith('/'):
self.add_warning(
_("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH
)
self.urlparts[2] += '/'
self.url = urlutil.urlunsplit(self.urlparts)
self.url_without_anchor = urlutil.urlunsplit(self.urlparts[:4] + [''])
def add_size_info(self):
"""Get size of file content and modification time from filename path."""
@ -209,7 +201,7 @@ class FileUrl(urlbase.UrlBase):
if self.is_directory():
self.set_result(_("directory"))
else:
url = fileutil.path_safe(self.url_without_anchor)
url = fileutil.path_safe(self.url)
self.url_connection = urllib.request.urlopen(url)
self.check_case_sensitivity()
@ -275,7 +267,7 @@ class FileUrl(urlbase.UrlBase):
"""
if self.is_directory():
return True
if firefox.has_sqlite and firefox.extension.search(self.url_without_anchor):
if firefox.has_sqlite and firefox.extension.search(self.url):
return True
return self.is_content_type_parseable()
@ -283,8 +275,7 @@ class FileUrl(urlbase.UrlBase):
"""Set URL content type, or an empty string if content
type could not be found."""
if self.url:
self.content_type = mimeutil.guess_mimetype(
self.url_without_anchor, read=self.get_content)
self.content_type = mimeutil.guess_mimetype(self.url, read=self.get_content)
else:
self.content_type = ""
log.debug(LOG_CHECK, "MIME type: %s", self.content_type)
@ -314,3 +305,73 @@ class FileUrl(urlbase.UrlBase):
url = webroot + url[1:]
log.debug(LOG_CHECK, "Applied local webroot `%s' to `%s'.", webroot, url)
super().add_url(url, line=line, column=column, page=page, name=name, base=base)
class AnchorCheckFileUrl(FileUrl):
    """
    FileUrl variant that keeps the anchor (fragment) in ``self.url``.

    Unlike the overridden methods' usual handling, build_url() here leaves
    the fragment part of the URL untouched and additionally stores an
    anchor-less copy in ``self.url_without_anchor``. That copy is what gets
    used whenever the URL is turned into something that must name the file
    itself (opening it, guessing its MIME type).
    """

    def reset(self):
        """Reset the instance state and clear the anchor-less URL."""
        super().reset()
        # the local file URI without its anchor; filled in by build_url()
        self.url_without_anchor = None

    def build_url(self):
        """
        Calls UrlBase.build_url() and adds a trailing slash to directories.

        The query part is dropped, the fragment is kept in ``self.url`` and
        a copy of the URL with an empty fragment is stored in
        ``self.url_without_anchor``.
        """
        self.build_base_url()
        if self.parent_url is not None:
            # URL joining with the parent URL only works if the query
            # of the base URL is removed first.
            # Otherwise the join function thinks the query is part of
            # the file name.
            urlparts = list(urllib.parse.urlsplit(self.base_url))
            # ignore query part for filesystem urls
            urlparts[3] = ''
            self.base_url = urlutil.urlunsplit(urlparts)
        # Deliberately skip FileUrl.build_url() in the MRO and call the
        # implementation of FileUrl's parent class directly.
        super(FileUrl, self).build_url()
        # ignore query url part for filesystem urls
        self.urlparts[3] = ''
        if self.is_directory() and not self.urlparts[2].endswith('/'):
            self.add_warning(
                _("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH
            )
            self.urlparts[2] += '/'
        self.url = urlutil.urlunsplit(self.urlparts)
        # same URL with the fragment (index 4) blanked out
        self.url_without_anchor = urlutil.urlunsplit(self.urlparts[:4] + [''])

    def check_connection(self):
        """
        Try to open the local file. Under NT systems the case sensitivity
        is checked.

        Directories are not opened; a directory URL that carries an anchor
        gets a WARN_FILE_ANCHORCHECK_DIRECTORY warning.

        Raises LinkCheckerError when the parent URL is set and is not
        itself a file: URL.
        """
        if self.parent_url is not None and not self.parent_url.startswith("file:"):
            msg = _(
                "local files are only checked without parent URL or when"
                " the parent URL is also a file"
            )
            raise LinkCheckerError(msg)
        if self.is_directory():
            if self.anchor:
                # The message id handed to _() must be a constant string:
                # an f-string would be interpolated before the translation
                # lookup, so no catalog entry could ever match it. Translate
                # first, then substitute the URL.
                self.add_warning(
                    _(
                        " URL `%s' is a directory with an anchor."
                        " When checking local files AnchorCheck does not support"
                        " anchors for directories."
                    )
                    % self.url,
                    tag=WARN_FILE_ANCHORCHECK_DIRECTORY,
                )
            self.set_result(_("directory"))
        else:
            # open the file itself, so use the URL without the anchor
            url = fileutil.path_safe(self.url_without_anchor)
            self.url_connection = urllib.request.urlopen(url)
            self.check_case_sensitivity()

    def set_content_type(self):
        """Set URL content type, or an empty string if content
        type could not be found."""
        if self.url:
            # guess from the anchor-less URL rather than from self.url
            self.content_type = mimeutil.guess_mimetype(
                self.url_without_anchor, read=self.get_content)
        else:
            self.content_type = ""
        log.debug(LOG_CHECK, "MIME type: %s", self.content_type)