mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-16 22:10:26 +00:00
381 lines
13 KiB
Python
381 lines
13 KiB
Python
# Copyright (C) 2000-2014 Bastian Kleineidam
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License along
|
|
# with this program; if not, write to the Free Software Foundation, Inc.,
|
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
"""
|
|
Handle local file: links.
|
|
"""
|
|
|
|
import re
|
|
import os
|
|
import urllib.parse
|
|
import urllib.request
|
|
from datetime import datetime, timezone
|
|
|
|
from . import urlbase, get_index_html
|
|
from .. import log, LOG_CHECK, fileutil, mimeutil, LinkCheckerError, url as urlutil
|
|
from ..bookmarks import firefox
|
|
from .const import (WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH,
|
|
WARN_FILE_ANCHORCHECK_DIRECTORY)
|
|
|
|
|
|
def get_files(dirname):
    """Iterate over the names of the entries found in *dirname*.

    Symbolic links are skipped. Regular files are yielded by name,
    directories are yielded with a trailing slash appended. Anything
    else is ignored.
    """
    for name in os.listdir(dirname):
        location = os.path.join(dirname, name)
        if not os.path.islink(location):
            if os.path.isfile(location):
                yield name
            elif os.path.isdir(location):
                yield name + "/"
|
|
|
|
|
|
def prepare_urlpath_for_nt(path):
    """
    Rewrite a URL path so that urllib.url2pathname accepts it on NT.

    A URL such as *file://server/path/* yields the path */server/path*,
    whereas urllib.url2pathname expects *////server/path*. Paths that
    contain a ``|`` are returned unchanged.
    """
    if '|' in path:
        return path
    stripped = path.lstrip("/")
    return "////" + stripped
|
|
|
|
|
|
def get_nt_filename(path):
    """Return *path* spelled with the case the filesystem actually uses.

    The last path component is compared case-insensitively against the
    entries of its parent directory; the parent itself is resolved the
    same way recursively. If no entry matches, an error is logged and
    *path* is returned unchanged.
    """
    drive, remainder = os.path.splitdrive(path)
    parent, leaf = os.path.split(remainder)
    if not leaf:
        # nothing left to look up (e.g. the path is a bare root)
        return path
    parent_dir = drive + parent
    wanted = leaf.lower()
    candidates = (name for name in os.listdir(parent_dir) if name.lower() == wanted)
    match = next(candidates, None)
    if match is None:
        log.error(LOG_CHECK, "could not find %r in %r", leaf, parent)
        return path
    return os.path.join(get_nt_filename(parent_dir), match)
|
|
|
|
|
|
def get_os_filename(path):
    """Convert the path component of a URL into a filesystem path."""
    on_windows = os.name == 'nt'
    if on_windows:
        path = prepare_urlpath_for_nt(path)
    safe_path = fileutil.path_safe(path)
    filename = urllib.request.url2pathname(safe_path)
    if on_windows and len(filename) == 2 and filename.endswith(':'):
        # A two-character result ending in a colon gets the path separator
        # appended; work around https://bugs.python.org/issue11474
        return filename + os.sep
    return filename
|
|
|
|
|
|
def is_absolute_path(path):
    """Tell whether *path* is absolute.

    On Windows a path is absolute when it starts with a drive letter
    followed by a colon, or with a slash or backslash. On every other
    system it is absolute when it starts with a slash.
    """
    if os.name != 'nt':
        return path.startswith("/")
    if re.search(r"^[a-zA-Z]:", path):
        return True
    # treat backslashes like forward slashes before the final test
    return path.replace("\\", "/").startswith("/")
|
|
|
|
|
|
class FileUrl(urlbase.UrlBase):
    """
    URL handler for links that use the *file:* scheme.
    """

    def init(
        self,
        base_ref,
        base_url,
        parent_url,
        recursion_level,
        aggregate,
        line,
        column,
        page,
        name,
        url_encoding,
        extern,
    ):
        """Delegate to the parent ``init`` and set the scheme to 'file'."""
        super().init(
            base_ref, base_url, parent_url, recursion_level, aggregate,
            line, column, page, name, url_encoding, extern,
        )
        self.scheme = 'file'

    def build_base_url(self):
        """Norm the base URL according to the platform.

        - a plain path (no parent URL, no base reference, no *file:*
          prefix) is expanded and turned into an absolute *file://* URL
        - on Windows the drive specifier and the slashes are normed

        Does nothing when the base URL is None.
        """
        location = self.base_url
        if location is None:
            return
        already_url = (
            self.parent_url or self.base_ref or location.startswith("file:")
        )
        if not already_url:
            location = os.path.expanduser(location)
            if not is_absolute_path(location):
                try:
                    cwd = os.getcwd()
                except OSError as msg:
                    # occurs on stale remote filesystems (eg. NFS)
                    errmsg = _(
                        "Could not get current working directory: %(msg)s"
                    ) % {"msg": msg}
                    raise LinkCheckerError(errmsg)
                location = cwd + "/" + location
            if os.path.isdir(location):
                location += "/"
            location = "file://" + location
        if os.name == "nt":
            location = location.replace("\\", "/")
            # transform c:/windows into /c|/windows
            location = re.sub("^file://(/?)([a-zA-Z]):", r"file:///\2|", location)
            # transform file://path into file:///path
            location = re.sub("^file://([^/])", r"file:///\1", location)
        self.base_url = location

    def build_url(self):
        """
        Build the URL via the parent class, drop query and fragment, and
        append a trailing slash when the target is a directory.
        """
        self.build_base_url()
        if self.parent_url is not None:
            # Joining with the parent URL only works when the query of the
            # base URL is removed first; otherwise the join function treats
            # the query as part of the file name.
            scheme, netloc, path, _query, fragment = urllib.parse.urlsplit(
                self.base_url
            )
            self.base_url = urlutil.urlunsplit([scheme, netloc, path, '', fragment])
        super().build_url()
        # query and fragment are ignored for filesystem urls
        self.urlparts[3] = ''
        self.urlparts[4] = ''
        needs_slash = self.is_directory() and not self.urlparts[2].endswith('/')
        if needs_slash:
            self.add_warning(
                _("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH
            )
            self.urlparts[2] = self.urlparts[2] + '/'
        self.url = urlutil.urlunsplit(self.urlparts)

    def add_size_info(self):
        """Record size and modification time (UTC) of the file on disk.

        Directories are skipped: their content is a generated index page,
        so no size is calculated for them.
        """
        if not self.is_directory():
            os_path = self.get_os_filename()
            self.size = fileutil.get_size(os_path)
            mtime = fileutil.get_mtime(os_path)
            self.modified = datetime.fromtimestamp(mtime, tz=timezone.utc)

    def check_connection(self):
        """
        Open the local file, or mark the result as a directory. After a
        file was opened the case sensitivity check is run, which only has
        an effect on NT systems.

        Raises LinkCheckerError when a parent URL exists that is not a
        *file:* URL.
        """
        parent = self.parent_url
        if parent is not None and not parent.startswith("file:"):
            raise LinkCheckerError(
                _(
                    "local files are only checked without parent URL or when"
                    " the parent URL is also a file"
                )
            )
        if self.is_directory():
            self.set_result(_("directory"))
            return
        safe_url = fileutil.path_safe(self.url)
        self.url_connection = urllib.request.urlopen(safe_url)
        self.check_case_sensitivity()

    def check_case_sensitivity(self):
        """
        On NT, warn when the path taken from the URL differs from the
        name found on the filesystem. Such a mismatch may cause problems
        once the files are copied to a case sensitive web server.
        """
        if os.name != 'nt':
            return
        url_path = self.get_os_filename()
        system_path = get_nt_filename(url_path)
        if url_path == system_path:
            return
        message = _(
            "The URL path %(path)r is not the same as the"
            " system path %(realpath)r. You should always use"
            " the system path in URLs."
        ) % {"path": url_path, "realpath": system_path}
        self.add_warning(message, tag=WARN_FILE_SYSTEM_PATH)

    def read_content(self):
        """Return the file content; for a directory return generated
        index HTML that links to the entries of the directory."""
        if not self.is_directory():
            return super().read_content()
        entries = get_files(self.get_os_filename())
        return get_index_html(entries)

    def get_os_filename(self):
        """
        Build the OS specific file path from the path part of the URL.

        @return: file name
        @rtype: string
        """
        url_path = self.urlparts[2]
        return get_os_filename(url_path)

    def get_temp_filename(self):
        """Return the name of the file whose content gets parsed."""
        return self.get_os_filename()

    def is_directory(self):
        """
        Tell whether the URL refers to a directory that is not a symlink.

        @return: True iff file is a directory
        @rtype: bool
        """
        os_path = self.get_os_filename()
        if os.path.islink(os_path):
            return False
        return os.path.isdir(os_path)

    def is_parseable(self):
        """Tell whether the content can be parsed for recursion.

        @return: True if content is parseable
        @rtype: bool
        """
        if self.is_directory() or (
            firefox.has_sqlite and firefox.extension.search(self.url)
        ):
            return True
        return self.is_content_type_parseable()

    def set_content_type(self):
        """Store the guessed MIME type of the URL, or an empty string
        when there is no URL."""
        self.content_type = (
            mimeutil.guess_mimetype(self.url, read=self.get_content)
            if self.url
            else ""
        )
        log.debug(LOG_CHECK, "MIME type: %s", self.content_type)

    def get_intern_pattern(self, url=None):
        """Return the pattern used to match intern URLs.

        @return non-empty regex pattern or None
        @rtype String or None
        """
        target = self.url if url is None else url
        if not target:
            return None
        if target.startswith('file://'):
            last_slash = target.rindex('/')
            if last_slash > 6:
                # cut off the last filename so the whole directory is internal
                target = target[: last_slash + 1]
        return re.escape(target)

    def add_url(self, url, line=0, column=0, page=0, name="", base=None):
        """Queue *url* for checking. When a local webroot directory is
        configured, a URL starting with a slash is first rewritten to
        live below that webroot."""
        webroot = self.aggregate.config["localwebroot"]
        applies = webroot and url and url.startswith("/")
        if applies:
            url = webroot + url[1:]
            log.debug(LOG_CHECK, "Applied local webroot `%s' to `%s'.", webroot, url)
        super().add_url(url, line=line, column=column, page=page, name=name, base=base)
|
|
|
|
|
|
class AnchorCheckFileUrl(FileUrl):
    """
    Variant of the file URL handler used by the AnchorCheck plugin.
    """

    def reset(self):
        """Reset via the parent class and clear the anchor-less URL."""
        super().reset()
        # the local file URI; assigned by build_url()
        self.url_without_anchor = None

    def build_url(self):
        """
        Build the URL via UrlBase.build_url(), drop the query, and append
        a trailing slash when the target is a directory. The fragment is
        kept in ``self.url``; ``self.url_without_anchor`` holds the same
        URL with an empty fragment.
        """
        self.build_base_url()
        if self.parent_url is not None:
            # Joining with the parent URL only works when the query of the
            # base URL is removed first; otherwise the join function treats
            # the query as part of the file name.
            scheme, netloc, path, _query, fragment = urllib.parse.urlsplit(
                self.base_url
            )
            self.base_url = urlutil.urlunsplit([scheme, netloc, path, '', fragment])
        # start the lookup after FileUrl so that FileUrl.build_url() is skipped
        super(FileUrl, self).build_url()
        # the query part is ignored for filesystem urls
        self.urlparts[3] = ''
        needs_slash = self.is_directory() and not self.urlparts[2].endswith('/')
        if needs_slash:
            self.add_warning(
                _("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH
            )
            self.urlparts[2] = self.urlparts[2] + '/'
        self.url = urlutil.urlunsplit(self.urlparts)
        parts_without_anchor = self.urlparts[:4] + ['']
        self.url_without_anchor = urlutil.urlunsplit(parts_without_anchor)

    def check_connection(self):
        """
        Open the local file (addressed without its anchor), or mark the
        result as a directory. A directory URL that carries an anchor
        gets a warning. After a file was opened the case sensitivity
        check is run, which only has an effect on NT systems.

        Raises LinkCheckerError when a parent URL exists that is not a
        *file:* URL.
        """
        parent = self.parent_url
        if parent is not None and not parent.startswith("file:"):
            raise LinkCheckerError(
                _(
                    "local files are only checked without parent URL or when"
                    " the parent URL is also a file"
                )
            )
        if not self.is_directory():
            safe_url = fileutil.path_safe(self.url_without_anchor)
            self.url_connection = urllib.request.urlopen(safe_url)
            self.check_case_sensitivity()
            return
        if self.anchor:
            message = _(
                "URL `%s' is a directory with an anchor."
                " When checking local files AnchorCheck does not support"
                " anchors for directories."
            ) % self.url
            self.add_warning(message, tag=WARN_FILE_ANCHORCHECK_DIRECTORY)
        self.set_result(_("directory"))

    def set_content_type(self):
        """Store the MIME type guessed from the anchor-less URL, or an
        empty string when there is no URL."""
        self.content_type = (
            mimeutil.guess_mimetype(self.url_without_anchor, read=self.get_content)
            if self.url
            else ""
        )
        log.debug(LOG_CHECK, "MIME type: %s", self.content_type)
|