From 16bee500687de01cf8aef6717adc5d4d5cfffc3c Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Mon, 24 Oct 2022 19:30:56 +0100 Subject: [PATCH] Move AnchorCheck local file handling into a new class When checking local files with AnchorCheck, anchors in URLs like "example/#anchor" are not supported. Without AnchorCheck enabled, the Real URL reported for such URLs was changed to include the anchor when local file checking was added to AnchorCheck, but it is the directory that is checked. The same URL was also then used as the Parent URL for the check of each of the contents of that directory. For FileUrl this is a revert of: c221afda ("Enable AnchorCheck to be used with local files", 2022-10-03) --- doc/src/man/linkcheckerrc.rst | 6 ++- linkcheck/checker/__init__.py | 3 ++ linkcheck/checker/const.py | 3 ++ linkcheck/checker/fileurl.py | 93 +++++++++++++++++++++++++++++------ 4 files changed, 88 insertions(+), 17 deletions(-) diff --git a/doc/src/man/linkcheckerrc.rst b/doc/src/man/linkcheckerrc.rst index b9b4087f..cb3be90f 100644 --- a/doc/src/man/linkcheckerrc.rst +++ b/doc/src/man/linkcheckerrc.rst @@ -475,7 +475,9 @@ options in their section. AnchorCheck ^^^^^^^^^^^ -Checks validity of HTML anchors. +Checks validity of HTML anchors. When checking local files, URLs with anchors +that link to directories e.g. "example/#anchor" are not supported. There is no +such limitation when using http(s). LocationInfo ^^^^^^^^^^^^ @@ -568,6 +570,8 @@ WARNINGS The following warnings are recognized in the 'ignorewarnings' config file entry: +**file-anchorcheck-directory** + A local directory with an anchor, not supported by AnchorCheck. **file-missing-slash** The file: URL is missing a trailing slash. 
**file-system-path** diff --git a/linkcheck/checker/__init__.py b/linkcheck/checker/__init__.py index 075075ce..dd0a28f5 100644 --- a/linkcheck/checker/__init__.py +++ b/linkcheck/checker/__init__.py @@ -131,6 +131,9 @@ def get_url_from( else: assume_local_file = recursion_level == 0 klass = get_urlclass_from(scheme, assume_local_file=assume_local_file) + if "AnchorCheck" in aggregate.config["enabledplugins"] and \ + klass == fileurl.FileUrl: + klass = fileurl.AnchorCheckFileUrl log.debug(LOG_CHECK, "%s handles url %s", klass.__name__, base_url) return klass( base_url, diff --git a/linkcheck/checker/const.py b/linkcheck/checker/const.py index ac60dddd..bf4a8be6 100644 --- a/linkcheck/checker/const.py +++ b/linkcheck/checker/const.py @@ -91,6 +91,7 @@ WARN_URL_TOO_LONG = "url-too-long" WARN_URL_WHITESPACE = "url-whitespace" WARN_FILE_MISSING_SLASH = "file-missing-slash" WARN_FILE_SYSTEM_PATH = "file-system-path" +WARN_FILE_ANCHORCHECK_DIRECTORY = "file-anchorcheck-directory" WARN_FTP_MISSING_SLASH = "ftp-missing-slash" WARN_HTTP_EMPTY_CONTENT = "http-empty-content" WARN_HTTP_COOKIE_STORE_ERROR = "http-cookie-store-error" @@ -113,6 +114,8 @@ Warnings = { WARN_FILE_SYSTEM_PATH: _( "The file: path is not the same as the system specific path." ), + WARN_FILE_ANCHORCHECK_DIRECTORY: _( + "A local directory with an anchor, not supported by AnchorCheck."), WARN_FTP_MISSING_SLASH: _("The ftp: URL is missing a trailing slash."), WARN_HTTP_EMPTY_CONTENT: _("The URL had no content."), WARN_HTTP_COOKIE_STORE_ERROR: _("An error occurred while storing a cookie."), diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py index 868201d6..09a96139 100644 --- a/linkcheck/checker/fileurl.py +++ b/linkcheck/checker/fileurl.py @@ -26,7 +26,8 @@ from datetime import datetime from . import urlbase, get_index_html from .. 
import log, LOG_CHECK, fileutil, mimeutil, LinkCheckerError, url as urlutil from ..bookmarks import firefox -from .const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH +from .const import (WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH, + WARN_FILE_ANCHORCHECK_DIRECTORY) def get_files(dirname): @@ -122,14 +123,6 @@ class FileUrl(urlbase.UrlBase): ) self.scheme = 'file' - def reset(self): - super().reset() - # the local file URI - self.url_without_anchor = None - # including the anchor in self.url allows the AnchorCheck plugin to be - # used when checking files. The anchor is stripped in UrlBase.set_cache_url() - # if AnchorCheck is not being used. - def build_base_url(self): """The URL is normed according to the platform: - the base URL is made an absolute *file://* URL @@ -175,15 +168,14 @@ class FileUrl(urlbase.UrlBase): urlparts[3] = '' self.base_url = urlutil.urlunsplit(urlparts) super().build_url() - # ignore query url part for filesystem urls - self.urlparts[3] = '' + # ignore query and fragment url parts for filesystem urls + self.urlparts[3] = self.urlparts[4] = '' if self.is_directory() and not self.urlparts[2].endswith('/'): self.add_warning( _("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH ) self.urlparts[2] += '/' self.url = urlutil.urlunsplit(self.urlparts) - self.url_without_anchor = urlutil.urlunsplit(self.urlparts[:4] + ['']) def add_size_info(self): """Get size of file content and modification time from filename path.""" @@ -209,7 +201,7 @@ class FileUrl(urlbase.UrlBase): if self.is_directory(): self.set_result(_("directory")) else: - url = fileutil.path_safe(self.url_without_anchor) + url = fileutil.path_safe(self.url) self.url_connection = urllib.request.urlopen(url) self.check_case_sensitivity() @@ -275,7 +267,7 @@ class FileUrl(urlbase.UrlBase): """ if self.is_directory(): return True - if firefox.has_sqlite and firefox.extension.search(self.url_without_anchor): + if firefox.has_sqlite and 
firefox.extension.search(self.url):
             return True
         return self.is_content_type_parseable()
 
@@ -283,8 +275,7 @@ class FileUrl(urlbase.UrlBase):
         """Set URL content type, or an empty string if content
         type could not be found."""
         if self.url:
-            self.content_type = mimeutil.guess_mimetype(
-                self.url_without_anchor, read=self.get_content)
+            self.content_type = mimeutil.guess_mimetype(self.url, read=self.get_content)
         else:
             self.content_type = ""
         log.debug(LOG_CHECK, "MIME type: %s", self.content_type)
@@ -314,3 +305,73 @@ class FileUrl(urlbase.UrlBase):
             url = webroot + url[1:]
             log.debug(LOG_CHECK, "Applied local webroot `%s' to `%s'.", webroot, url)
         super().add_url(url, line=line, column=column, page=page, name=name, base=base)
+
+
+class AnchorCheckFileUrl(FileUrl):
+    def reset(self):
+        super().reset()
+        # the local file URI without the anchor (fragment) part
+        self.url_without_anchor = None
+
+    def build_url(self):
+        """
+        Calls UrlBase.build_url() and adds a trailing slash to directories.
+        """
+        self.build_base_url()
+        if self.parent_url is not None:
+            # URL joining with the parent URL only works if the query
+            # of the base URL is removed first.
+            # Otherwise the join function thinks the query is part of
+            # the file name.
+            urlparts = list(urllib.parse.urlsplit(self.base_url))
+            # ignore query part for filesystem urls
+            urlparts[3] = ''
+            self.base_url = urlutil.urlunsplit(urlparts)
+        super(FileUrl, self).build_url()
+        # ignore only the query url part; the anchor is left for AnchorCheck
+        self.urlparts[3] = ''
+        if self.is_directory() and not self.urlparts[2].endswith('/'):
+            self.add_warning(
+                _("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH
+            )
+            self.urlparts[2] += '/'
+        self.url = urlutil.urlunsplit(self.urlparts)
+        self.url_without_anchor = urlutil.urlunsplit(self.urlparts[:4] + [''])
+
+    def check_connection(self):
+        """
+        Try to open the local file. Under NT systems the case sensitivity
+        is checked.
+        """
+        if self.parent_url is not None and not self.parent_url.startswith("file:"):
+            msg = _(
+                "local files are only checked without parent URL or when"
+                " the parent URL is also a file"
+            )
+            raise LinkCheckerError(msg)
+        if self.is_directory():
+            if self.anchor:
+                self.add_warning(
+                    _(
+                        " URL `%s' is a directory with an anchor."
+                        " When checking local files AnchorCheck does not support"
+                        " anchors for directories."
+                    ) % self.url,
+                    tag=WARN_FILE_ANCHORCHECK_DIRECTORY,
+                )
+
+            self.set_result(_("directory"))
+        else:
+            url = fileutil.path_safe(self.url_without_anchor)
+            self.url_connection = urllib.request.urlopen(url)
+            self.check_case_sensitivity()
+
+    def set_content_type(self):
+        """Set URL content type, or an empty string if content
+        type could not be found."""
+        if self.url:
+            self.content_type = mimeutil.guess_mimetype(
+                self.url_without_anchor, read=self.get_content)
+        else:
+            self.content_type = ""
+        log.debug(LOG_CHECK, "MIME type: %s", self.content_type)