mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-16 22:10:26 +00:00
Move AnchorCheck local file handling into a new class
When checking local files with AnchorCheck, anchors in URLs
like "example/#anchor" are not supported.
Since local file checking was added to AnchorCheck, the Real URL reported
for such URLs has included the anchor even when AnchorCheck is not enabled,
although it is the directory that is actually checked.
The same URL was also then used as the Parent URL for the check of each
of the contents of that directory.
For FileUrl this is a revert of:
c221afda ("Enable AnchorCheck to be used with local files", 2022-10-03)
This commit is contained in:
parent
b66ca30e84
commit
16bee50068
4 changed files with 88 additions and 17 deletions
|
|
@ -475,7 +475,9 @@ options in their section.
|
|||
AnchorCheck
|
||||
^^^^^^^^^^^
|
||||
|
||||
Checks validity of HTML anchors.
|
||||
Checks validity of HTML anchors. When checking local files, URLs with anchors
|
||||
that link to directories e.g. "example/#anchor" are not supported. There is no
|
||||
such limitation when using http(s).
|
||||
|
||||
LocationInfo
|
||||
^^^^^^^^^^^^
|
||||
|
|
@ -568,6 +570,8 @@ WARNINGS
|
|||
The following warnings are recognized in the 'ignorewarnings' config
|
||||
file entry:
|
||||
|
||||
**file-anchorcheck-directory**
|
||||
A local directory with an anchor, not supported by AnchorCheck.
|
||||
**file-missing-slash**
|
||||
The file: URL is missing a trailing slash.
|
||||
**file-system-path**
|
||||
|
|
|
|||
|
|
@ -131,6 +131,9 @@ def get_url_from(
|
|||
else:
|
||||
assume_local_file = recursion_level == 0
|
||||
klass = get_urlclass_from(scheme, assume_local_file=assume_local_file)
|
||||
if "AnchorCheck" in aggregate.config["enabledplugins"] and \
|
||||
klass == fileurl.FileUrl:
|
||||
klass = fileurl.AnchorCheckFileUrl
|
||||
log.debug(LOG_CHECK, "%s handles url %s", klass.__name__, base_url)
|
||||
return klass(
|
||||
base_url,
|
||||
|
|
|
|||
|
|
@ -91,6 +91,7 @@ WARN_URL_TOO_LONG = "url-too-long"
|
|||
WARN_URL_WHITESPACE = "url-whitespace"
|
||||
WARN_FILE_MISSING_SLASH = "file-missing-slash"
|
||||
WARN_FILE_SYSTEM_PATH = "file-system-path"
|
||||
WARN_FILE_ANCHORCHECK_DIRECTORY = "file-anchorcheck-directory"
|
||||
WARN_FTP_MISSING_SLASH = "ftp-missing-slash"
|
||||
WARN_HTTP_EMPTY_CONTENT = "http-empty-content"
|
||||
WARN_HTTP_COOKIE_STORE_ERROR = "http-cookie-store-error"
|
||||
|
|
@ -113,6 +114,8 @@ Warnings = {
|
|||
WARN_FILE_SYSTEM_PATH: _(
|
||||
"The file: path is not the same as the system specific path."
|
||||
),
|
||||
WARN_FILE_ANCHORCHECK_DIRECTORY: _(
|
||||
"A local directory with an anchor, not supported by AnchorCheck."),
|
||||
WARN_FTP_MISSING_SLASH: _("The ftp: URL is missing a trailing slash."),
|
||||
WARN_HTTP_EMPTY_CONTENT: _("The URL had no content."),
|
||||
WARN_HTTP_COOKIE_STORE_ERROR: _("An error occurred while storing a cookie."),
|
||||
|
|
|
|||
|
|
@ -26,7 +26,8 @@ from datetime import datetime
|
|||
from . import urlbase, get_index_html
|
||||
from .. import log, LOG_CHECK, fileutil, mimeutil, LinkCheckerError, url as urlutil
|
||||
from ..bookmarks import firefox
|
||||
from .const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH
|
||||
from .const import (WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH,
|
||||
WARN_FILE_ANCHORCHECK_DIRECTORY)
|
||||
|
||||
|
||||
def get_files(dirname):
|
||||
|
|
@ -122,14 +123,6 @@ class FileUrl(urlbase.UrlBase):
|
|||
)
|
||||
self.scheme = 'file'
|
||||
|
||||
def reset(self):
|
||||
super().reset()
|
||||
# the local file URI
|
||||
self.url_without_anchor = None
|
||||
# including the anchor in self.url allows the AnchorCheck plugin to be
|
||||
# used when checking files. The anchor is stripped in UrlBase.set_cache_url()
|
||||
# if AnchorCheck is not being used.
|
||||
|
||||
def build_base_url(self):
|
||||
"""The URL is normed according to the platform:
|
||||
- the base URL is made an absolute *file://* URL
|
||||
|
|
@ -175,15 +168,14 @@ class FileUrl(urlbase.UrlBase):
|
|||
urlparts[3] = ''
|
||||
self.base_url = urlutil.urlunsplit(urlparts)
|
||||
super().build_url()
|
||||
# ignore query url part for filesystem urls
|
||||
self.urlparts[3] = ''
|
||||
# ignore query and fragment url parts for filesystem urls
|
||||
self.urlparts[3] = self.urlparts[4] = ''
|
||||
if self.is_directory() and not self.urlparts[2].endswith('/'):
|
||||
self.add_warning(
|
||||
_("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH
|
||||
)
|
||||
self.urlparts[2] += '/'
|
||||
self.url = urlutil.urlunsplit(self.urlparts)
|
||||
self.url_without_anchor = urlutil.urlunsplit(self.urlparts[:4] + [''])
|
||||
|
||||
def add_size_info(self):
|
||||
"""Get size of file content and modification time from filename path."""
|
||||
|
|
@ -209,7 +201,7 @@ class FileUrl(urlbase.UrlBase):
|
|||
if self.is_directory():
|
||||
self.set_result(_("directory"))
|
||||
else:
|
||||
url = fileutil.path_safe(self.url_without_anchor)
|
||||
url = fileutil.path_safe(self.url)
|
||||
self.url_connection = urllib.request.urlopen(url)
|
||||
self.check_case_sensitivity()
|
||||
|
||||
|
|
@ -275,7 +267,7 @@ class FileUrl(urlbase.UrlBase):
|
|||
"""
|
||||
if self.is_directory():
|
||||
return True
|
||||
if firefox.has_sqlite and firefox.extension.search(self.url_without_anchor):
|
||||
if firefox.has_sqlite and firefox.extension.search(self.url):
|
||||
return True
|
||||
return self.is_content_type_parseable()
|
||||
|
||||
|
|
@ -283,8 +275,7 @@ class FileUrl(urlbase.UrlBase):
|
|||
"""Set URL content type, or an empty string if content
|
||||
type could not be found."""
|
||||
if self.url:
|
||||
self.content_type = mimeutil.guess_mimetype(
|
||||
self.url_without_anchor, read=self.get_content)
|
||||
self.content_type = mimeutil.guess_mimetype(self.url, read=self.get_content)
|
||||
else:
|
||||
self.content_type = ""
|
||||
log.debug(LOG_CHECK, "MIME type: %s", self.content_type)
|
||||
|
|
@ -314,3 +305,73 @@ class FileUrl(urlbase.UrlBase):
|
|||
url = webroot + url[1:]
|
||||
log.debug(LOG_CHECK, "Applied local webroot `%s' to `%s'.", webroot, url)
|
||||
super().add_url(url, line=line, column=column, page=page, name=name, base=base)
|
||||
|
||||
|
||||
class AnchorCheckFileUrl(FileUrl):
    """
    File URL variant that keeps the anchor (fragment) in ``self.url``.

    Only the query part of the URL is discarded; the fragment is retained
    in ``self.url``. The anchor-free form is stored separately in
    ``self.url_without_anchor`` and is what gets opened and what the
    content type is guessed from.

    Anchors on local directories are not supported: such URLs are
    reported with a ``WARN_FILE_ANCHORCHECK_DIRECTORY`` warning.
    """

    def reset(self):
        """Reset instance state, including the anchor-free URL."""
        super().reset()
        # the local file URI
        self.url_without_anchor = None

    def build_url(self):
        """
        Calls UrlBase.build_url() and adds a trailing slash to directories.
        """
        self.build_base_url()
        if self.parent_url is not None:
            # URL joining with the parent URL only works if the query
            # of the base URL is removed first.
            # Otherwise the join function thinks the query is part of
            # the file name.
            urlparts = list(urllib.parse.urlsplit(self.base_url))
            # ignore query part for filesystem urls
            urlparts[3] = ''
            self.base_url = urlutil.urlunsplit(urlparts)
        # Deliberately skip FileUrl.build_url() in the MRO and call the
        # next implementation after it; only the query is cleared below,
        # so the fragment stays in self.urlparts.
        super(FileUrl, self).build_url()
        # ignore query url part for filesystem urls
        self.urlparts[3] = ''
        if self.is_directory() and not self.urlparts[2].endswith('/'):
            self.add_warning(
                _("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH
            )
            self.urlparts[2] += '/'
        self.url = urlutil.urlunsplit(self.urlparts)
        # same URL with the fragment component blanked
        self.url_without_anchor = urlutil.urlunsplit(self.urlparts[:4] + [''])

    def check_connection(self):
        """
        Try to open the local file. Under NT systems the case sensitivity
        is checked.

        Raises LinkCheckerError if the parent URL exists and is not itself
        a file: URL. A directory with an anchor gets a warning, because
        anchors on local directories are not supported.
        """
        if self.parent_url is not None and not self.parent_url.startswith("file:"):
            msg = _(
                "local files are only checked without parent URL or when"
                " the parent URL is also a file"
            )
            raise LinkCheckerError(msg)
        if self.is_directory():
            if self.anchor:
                # The msgid must be a constant string so that it can be
                # extracted and looked up in the translation catalog;
                # the URL is interpolated after translation.
                self.add_warning(
                    _(
                        "URL `%s' is a directory with an anchor."
                        " When checking local files AnchorCheck does not support"
                        " anchors for directories."
                    )
                    % self.url,
                    tag=WARN_FILE_ANCHORCHECK_DIRECTORY,
                )

            self.set_result(_("directory"))
        else:
            # open the file itself; the anchor is not part of the path
            url = fileutil.path_safe(self.url_without_anchor)
            self.url_connection = urllib.request.urlopen(url)
            self.check_case_sensitivity()

    def set_content_type(self):
        """Set URL content type, or an empty string if content
        type could not be found."""
        if self.url:
            # guess from the anchor-free URL so the fragment does not
            # interfere with the file name
            self.content_type = mimeutil.guess_mimetype(
                self.url_without_anchor, read=self.get_content)
        else:
            self.content_type = ""
        log.debug(LOG_CHECK, "MIME type: %s", self.content_type)
|
||||
|
|
|
|||
Loading…
Reference in a new issue