mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-16 22:10:26 +00:00
Merge pull request #683 from cjmayo/anchorcheckfileurl
Move AnchorCheck local file handling into a new class
This commit is contained in:
commit
b796cec346
4 changed files with 88 additions and 17 deletions
|
|
@ -475,7 +475,9 @@ options in their section.
|
|||
AnchorCheck
|
||||
^^^^^^^^^^^
|
||||
|
||||
Checks validity of HTML anchors.
|
||||
Checks validity of HTML anchors. When checking local files, URLs with anchors
|
||||
that link to directories (e.g. "example/#anchor") are not supported. There is no
|
||||
such limitation when using http(s).
|
||||
|
||||
LocationInfo
|
||||
^^^^^^^^^^^^
|
||||
|
|
@ -568,6 +570,8 @@ WARNINGS
|
|||
The following warnings are recognized in the 'ignorewarnings' config
|
||||
file entry:
|
||||
|
||||
**file-anchorcheck-directory**
|
||||
A local directory with an anchor, not supported by AnchorCheck.
|
||||
**file-missing-slash**
|
||||
The file: URL is missing a trailing slash.
|
||||
**file-system-path**
|
||||
|
|
|
|||
|
|
@ -131,6 +131,9 @@ def get_url_from(
|
|||
else:
|
||||
assume_local_file = recursion_level == 0
|
||||
klass = get_urlclass_from(scheme, assume_local_file=assume_local_file)
|
||||
if "AnchorCheck" in aggregate.config["enabledplugins"] and \
|
||||
klass == fileurl.FileUrl:
|
||||
klass = fileurl.AnchorCheckFileUrl
|
||||
log.debug(LOG_CHECK, "%s handles url %s", klass.__name__, base_url)
|
||||
return klass(
|
||||
base_url,
|
||||
|
|
|
|||
|
|
@ -91,6 +91,7 @@ WARN_URL_TOO_LONG = "url-too-long"
|
|||
WARN_URL_WHITESPACE = "url-whitespace"
|
||||
WARN_FILE_MISSING_SLASH = "file-missing-slash"
|
||||
WARN_FILE_SYSTEM_PATH = "file-system-path"
|
||||
WARN_FILE_ANCHORCHECK_DIRECTORY = "file-anchorcheck-directory"
|
||||
WARN_FTP_MISSING_SLASH = "ftp-missing-slash"
|
||||
WARN_HTTP_EMPTY_CONTENT = "http-empty-content"
|
||||
WARN_HTTP_COOKIE_STORE_ERROR = "http-cookie-store-error"
|
||||
|
|
@ -113,6 +114,8 @@ Warnings = {
|
|||
WARN_FILE_SYSTEM_PATH: _(
|
||||
"The file: path is not the same as the system specific path."
|
||||
),
|
||||
WARN_FILE_ANCHORCHECK_DIRECTORY: _(
|
||||
"A local directory with an anchor, not supported by AnchorCheck."),
|
||||
WARN_FTP_MISSING_SLASH: _("The ftp: URL is missing a trailing slash."),
|
||||
WARN_HTTP_EMPTY_CONTENT: _("The URL had no content."),
|
||||
WARN_HTTP_COOKIE_STORE_ERROR: _("An error occurred while storing a cookie."),
|
||||
|
|
|
|||
|
|
@ -26,7 +26,8 @@ from datetime import datetime
|
|||
from . import urlbase, get_index_html
|
||||
from .. import log, LOG_CHECK, fileutil, mimeutil, LinkCheckerError, url as urlutil
|
||||
from ..bookmarks import firefox
|
||||
from .const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH
|
||||
from .const import (WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH,
|
||||
WARN_FILE_ANCHORCHECK_DIRECTORY)
|
||||
|
||||
|
||||
def get_files(dirname):
|
||||
|
|
@ -122,14 +123,6 @@ class FileUrl(urlbase.UrlBase):
|
|||
)
|
||||
self.scheme = 'file'
|
||||
|
||||
def reset(self):
|
||||
super().reset()
|
||||
# the local file URI
|
||||
self.url_without_anchor = None
|
||||
# including the anchor in self.url allows the AnchorCheck plugin to be
|
||||
# used when checking files. The anchor is stripped in UrlBase.set_cache_url()
|
||||
# if AnchorCheck is not being used.
|
||||
|
||||
def build_base_url(self):
|
||||
"""The URL is normed according to the platform:
|
||||
- the base URL is made an absolute *file://* URL
|
||||
|
|
@ -175,15 +168,14 @@ class FileUrl(urlbase.UrlBase):
|
|||
urlparts[3] = ''
|
||||
self.base_url = urlutil.urlunsplit(urlparts)
|
||||
super().build_url()
|
||||
# ignore query url part for filesystem urls
|
||||
self.urlparts[3] = ''
|
||||
# ignore query and fragment url parts for filesystem urls
|
||||
self.urlparts[3] = self.urlparts[4] = ''
|
||||
if self.is_directory() and not self.urlparts[2].endswith('/'):
|
||||
self.add_warning(
|
||||
_("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH
|
||||
)
|
||||
self.urlparts[2] += '/'
|
||||
self.url = urlutil.urlunsplit(self.urlparts)
|
||||
self.url_without_anchor = urlutil.urlunsplit(self.urlparts[:4] + [''])
|
||||
|
||||
def add_size_info(self):
|
||||
"""Get size of file content and modification time from filename path."""
|
||||
|
|
@ -209,7 +201,7 @@ class FileUrl(urlbase.UrlBase):
|
|||
if self.is_directory():
|
||||
self.set_result(_("directory"))
|
||||
else:
|
||||
url = fileutil.path_safe(self.url_without_anchor)
|
||||
url = fileutil.path_safe(self.url)
|
||||
self.url_connection = urllib.request.urlopen(url)
|
||||
self.check_case_sensitivity()
|
||||
|
||||
|
|
@ -275,7 +267,7 @@ class FileUrl(urlbase.UrlBase):
|
|||
"""
|
||||
if self.is_directory():
|
||||
return True
|
||||
if firefox.has_sqlite and firefox.extension.search(self.url_without_anchor):
|
||||
if firefox.has_sqlite and firefox.extension.search(self.url):
|
||||
return True
|
||||
return self.is_content_type_parseable()
|
||||
|
||||
|
|
@ -283,8 +275,7 @@ class FileUrl(urlbase.UrlBase):
|
|||
"""Set URL content type, or an empty string if content
|
||||
type could not be found."""
|
||||
if self.url:
|
||||
self.content_type = mimeutil.guess_mimetype(
|
||||
self.url_without_anchor, read=self.get_content)
|
||||
self.content_type = mimeutil.guess_mimetype(self.url, read=self.get_content)
|
||||
else:
|
||||
self.content_type = ""
|
||||
log.debug(LOG_CHECK, "MIME type: %s", self.content_type)
|
||||
|
|
@ -314,3 +305,73 @@ class FileUrl(urlbase.UrlBase):
|
|||
url = webroot + url[1:]
|
||||
log.debug(LOG_CHECK, "Applied local webroot `%s' to `%s'.", webroot, url)
|
||||
super().add_url(url, line=line, column=column, page=page, name=name, base=base)
|
||||
|
||||
|
||||
class AnchorCheckFileUrl(FileUrl):
    """
    File URL variant used when the AnchorCheck plugin is enabled.

    Unlike FileUrl, the fragment (anchor) is kept in ``self.url`` so that
    it remains available after the URL is built. The anchor-free form is
    stored in ``self.url_without_anchor`` and is what is used to open the
    file and to guess its MIME type.
    """

    def reset(self):
        """Reset instance state, including the anchor-free URL."""
        super().reset()
        # the local file URI without the fragment part
        self.url_without_anchor = None

    def build_url(self):
        """
        Calls UrlBase.build_url() and adds a trailing slash to directories.

        The query part is dropped, the fragment part is kept in self.url,
        and self.url_without_anchor is set to the URL with the fragment
        removed.
        """
        self.build_base_url()
        if self.parent_url is not None:
            # URL joining with the parent URL only works if the query
            # of the base URL is removed first.
            # Otherwise the join function thinks the query is part of
            # the file name.
            urlparts = list(urllib.parse.urlsplit(self.base_url))
            # ignore query part for filesystem urls
            urlparts[3] = ''
            self.base_url = urlutil.urlunsplit(urlparts)
        # Deliberately skip FileUrl.build_url(): it blanks the fragment as
        # well as the query, and the fragment must survive here.
        super(FileUrl, self).build_url()
        # ignore query url part for filesystem urls
        self.urlparts[3] = ''
        if self.is_directory() and not self.urlparts[2].endswith('/'):
            self.add_warning(
                _("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH
            )
            self.urlparts[2] += '/'
        self.url = urlutil.urlunsplit(self.urlparts)
        self.url_without_anchor = urlutil.urlunsplit(self.urlparts[:4] + [''])

    def check_connection(self):
        """
        Try to open the local file. Under NT systems the case sensitivity
        is checked.

        A directory is not opened; its result is set to "directory", and a
        warning is added first if the URL carries an anchor.

        Raises LinkCheckerError if there is a parent URL that is not a
        file: URL.
        """
        if self.parent_url is not None and not self.parent_url.startswith("file:"):
            msg = _(
                "local files are only checked without parent URL or when"
                " the parent URL is also a file"
            )
            raise LinkCheckerError(msg)
        if self.is_directory():
            if self.anchor:
                # Translate the constant template first and interpolate
                # afterwards. An f-string inside _() is formatted before the
                # catalogue lookup, so the message could never be translated
                # or extracted.
                self.add_warning(
                    _(
                        " URL `%s' is a directory with an anchor."
                        " When checking local files AnchorCheck does not support"
                        " anchors for directories."
                    )
                    % self.url,
                    tag=WARN_FILE_ANCHORCHECK_DIRECTORY,
                )

            self.set_result(_("directory"))
        else:
            # open the file itself; the anchor is not part of the path
            url = fileutil.path_safe(self.url_without_anchor)
            self.url_connection = urllib.request.urlopen(url)
            self.check_case_sensitivity()

    def set_content_type(self):
        """Set URL content type, or an empty string if content
        type could not be found."""
        if self.url:
            # guess from the anchor-free URL rather than self.url
            self.content_type = mimeutil.guess_mimetype(
                self.url_without_anchor, read=self.get_content)
        else:
            self.content_type = ""
        log.debug(LOG_CHECK, "MIME type: %s", self.content_type)
|
||||
|
|
|
|||
Loading…
Reference in a new issue