diff --git a/doc/src/man/linkcheckerrc.rst b/doc/src/man/linkcheckerrc.rst index b9b4087f..cb3be90f 100644 --- a/doc/src/man/linkcheckerrc.rst +++ b/doc/src/man/linkcheckerrc.rst @@ -475,7 +475,9 @@ options in their section. AnchorCheck ^^^^^^^^^^^ -Checks validity of HTML anchors. +Checks validity of HTML anchors. When checking local files, URLs with anchors +that link to directories e.g. "example/#anchor" are not supported. There is no +such limitation when using http(s). LocationInfo ^^^^^^^^^^^^ @@ -568,6 +570,8 @@ WARNINGS The following warnings are recognized in the 'ignorewarnings' config file entry: +**file-anchorcheck-directory** + A local directory with an anchor, not supported by AnchorCheck. **file-missing-slash** The file: URL is missing a trailing slash. **file-system-path** diff --git a/linkcheck/checker/__init__.py b/linkcheck/checker/__init__.py index 075075ce..dd0a28f5 100644 --- a/linkcheck/checker/__init__.py +++ b/linkcheck/checker/__init__.py @@ -131,6 +131,9 @@ def get_url_from( else: assume_local_file = recursion_level == 0 klass = get_urlclass_from(scheme, assume_local_file=assume_local_file) + if "AnchorCheck" in aggregate.config["enabledplugins"] and \ + klass == fileurl.FileUrl: + klass = fileurl.AnchorCheckFileUrl log.debug(LOG_CHECK, "%s handles url %s", klass.__name__, base_url) return klass( base_url, diff --git a/linkcheck/checker/const.py b/linkcheck/checker/const.py index ac60dddd..bf4a8be6 100644 --- a/linkcheck/checker/const.py +++ b/linkcheck/checker/const.py @@ -91,6 +91,7 @@ WARN_URL_TOO_LONG = "url-too-long" WARN_URL_WHITESPACE = "url-whitespace" WARN_FILE_MISSING_SLASH = "file-missing-slash" WARN_FILE_SYSTEM_PATH = "file-system-path" +WARN_FILE_ANCHORCHECK_DIRECTORY = "file-anchorcheck-directory" WARN_FTP_MISSING_SLASH = "ftp-missing-slash" WARN_HTTP_EMPTY_CONTENT = "http-empty-content" WARN_HTTP_COOKIE_STORE_ERROR = "http-cookie-store-error" @@ -113,6 +114,8 @@ Warnings = { WARN_FILE_SYSTEM_PATH: _( "The file: path is not 
the same as the system specific path." ), + WARN_FILE_ANCHORCHECK_DIRECTORY: _( + "A local directory with an anchor, not supported by AnchorCheck."), WARN_FTP_MISSING_SLASH: _("The ftp: URL is missing a trailing slash."), WARN_HTTP_EMPTY_CONTENT: _("The URL had no content."), WARN_HTTP_COOKIE_STORE_ERROR: _("An error occurred while storing a cookie."), diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py index 868201d6..09a96139 100644 --- a/linkcheck/checker/fileurl.py +++ b/linkcheck/checker/fileurl.py @@ -26,7 +26,8 @@ from datetime import datetime from . import urlbase, get_index_html from .. import log, LOG_CHECK, fileutil, mimeutil, LinkCheckerError, url as urlutil from ..bookmarks import firefox -from .const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH +from .const import (WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH, + WARN_FILE_ANCHORCHECK_DIRECTORY) def get_files(dirname): @@ -122,14 +123,6 @@ class FileUrl(urlbase.UrlBase): ) self.scheme = 'file' - def reset(self): - super().reset() - # the local file URI - self.url_without_anchor = None - # including the anchor in self.url allows the AnchorCheck plugin to be - # used when checking files. The anchor is stripped in UrlBase.set_cache_url() - # if AnchorCheck is not being used. 
- def build_base_url(self): """The URL is normed according to the platform: - the base URL is made an absolute *file://* URL @@ -175,15 +168,14 @@ class FileUrl(urlbase.UrlBase): urlparts[3] = '' self.base_url = urlutil.urlunsplit(urlparts) super().build_url() - # ignore query url part for filesystem urls - self.urlparts[3] = '' + # ignore query and fragment url parts for filesystem urls + self.urlparts[3] = self.urlparts[4] = '' if self.is_directory() and not self.urlparts[2].endswith('/'): self.add_warning( _("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH ) self.urlparts[2] += '/' self.url = urlutil.urlunsplit(self.urlparts) - self.url_without_anchor = urlutil.urlunsplit(self.urlparts[:4] + ['']) def add_size_info(self): """Get size of file content and modification time from filename path.""" @@ -209,7 +201,7 @@ class FileUrl(urlbase.UrlBase): if self.is_directory(): self.set_result(_("directory")) else: - url = fileutil.path_safe(self.url_without_anchor) + url = fileutil.path_safe(self.url) self.url_connection = urllib.request.urlopen(url) self.check_case_sensitivity() @@ -275,7 +267,7 @@ class FileUrl(urlbase.UrlBase): """ if self.is_directory(): return True - if firefox.has_sqlite and firefox.extension.search(self.url_without_anchor): + if firefox.has_sqlite and firefox.extension.search(self.url): return True return self.is_content_type_parseable() @@ -283,8 +275,7 @@ class FileUrl(urlbase.UrlBase): """Set URL content type, or an empty string if content type could not be found.""" if self.url: - self.content_type = mimeutil.guess_mimetype( - self.url_without_anchor, read=self.get_content) + self.content_type = mimeutil.guess_mimetype(self.url, read=self.get_content) else: self.content_type = "" log.debug(LOG_CHECK, "MIME type: %s", self.content_type) @@ -314,3 +305,73 @@ class FileUrl(urlbase.UrlBase): url = webroot + url[1:] log.debug(LOG_CHECK, "Applied local webroot `%s' to `%s'.", webroot, url) super().add_url(url, line=line, 
class AnchorCheckFileUrl(FileUrl):
    """
    File URL variant that keeps the anchor (fragment) in ``self.url``.

    ``self.url`` retains the fragment part, while the fragment-free form
    used to open the file and to guess its MIME type is stored
    separately in ``self.url_without_anchor``.
    """

    def reset(self):
        """Reset all attributes and clear the anchor-less URL."""
        super().reset()
        # the local file URI
        self.url_without_anchor = None

    def build_url(self):
        """
        Calls UrlBase.build_url() and adds a trailing slash to directories.

        Unlike the query part, the fragment part is kept in ``self.url``;
        ``self.url_without_anchor`` gets the same URL with an empty
        fragment.
        """
        self.build_base_url()
        if self.parent_url is not None:
            # URL joining with the parent URL only works if the query
            # of the base URL are removed first.
            # Otherwise the join function thinks the query is part of
            # the file name.
            urlparts = list(urllib.parse.urlsplit(self.base_url))
            # ignore query part for filesystem urls
            urlparts[3] = ''
            self.base_url = urlutil.urlunsplit(urlparts)
        # Deliberately skip the direct parent's build_url() and call the
        # next implementation in the MRO, so that the fragment handling
        # below is the only one applied.
        super(FileUrl, self).build_url()
        # ignore query url part for filesystem urls
        self.urlparts[3] = ''
        if self.is_directory() and not self.urlparts[2].endswith('/'):
            self.add_warning(
                _("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH
            )
            self.urlparts[2] += '/'
        self.url = urlutil.urlunsplit(self.urlparts)
        self.url_without_anchor = urlutil.urlunsplit(self.urlparts[:4] + [''])

    def check_connection(self):
        """
        Try to open the local file and run the case sensitivity check.

        Directories are not opened; they get the result "directory", plus
        a WARN_FILE_ANCHORCHECK_DIRECTORY warning when the URL carries an
        anchor.

        Raises LinkCheckerError when a parent URL is set and it is not a
        file: URL.
        """
        if self.parent_url is not None and not self.parent_url.startswith("file:"):
            msg = _(
                "local files are only checked without parent URL or when"
                " the parent URL is also a file"
            )
            raise LinkCheckerError(msg)
        if self.is_directory():
            if self.anchor:
                # Keep the msgid constant and interpolate after the
                # translation lookup; an f-string inside _() would be
                # formatted first and never match a catalog entry.
                self.add_warning(
                    _(
                        "URL `%s' is a directory with an anchor."
                        " When checking local files AnchorCheck does not support"
                        " anchors for directories."
                    )
                    % self.url,
                    tag=WARN_FILE_ANCHORCHECK_DIRECTORY,
                )

            self.set_result(_("directory"))
        else:
            # open the file by its anchor-less URL
            url = fileutil.path_safe(self.url_without_anchor)
            self.url_connection = urllib.request.urlopen(url)
            self.check_case_sensitivity()

    def set_content_type(self):
        """Set URL content type, or an empty string if content
        type could not be found."""
        if self.url:
            # guess from the anchor-less URL
            self.content_type = mimeutil.guess_mimetype(
                self.url_without_anchor, read=self.get_content)
        else:
            self.content_type = ""
        log.debug(LOG_CHECK, "MIME type: %s", self.content_type)