mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-21 16:30:28 +00:00
299 lines
10 KiB
Python
299 lines
10 KiB
Python
# -*- coding: iso-8859-1 -*-
|
|
# Copyright (C) 2000-2010 Bastian Kleineidam
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License along
|
|
# with this program; if not, write to the Free Software Foundation, Inc.,
|
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
"""
|
|
Handle local file: links.
|
|
"""
|
|
|
|
import re
|
|
import os
|
|
import urlparse
|
|
import urllib
|
|
import urllib2
|
|
|
|
from . import urlbase, get_index_html, absolute_url, get_url_from
|
|
from .. import log, LOG_CHECK, fileutil, strformat, url as urlutil
|
|
from .const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH
|
|
|
|
# Firefox bookmark databases can only be parsed when the optional sqlite3
# module is importable; remember the outcome in a module level flag.
try:
    import sqlite3
except ImportError:
    has_sqlite = False
else:
    has_sqlite = True
|
|
|
|
|
|
# Matches URLs whose last path component is the Firefox 3 bookmark
# database "places.sqlite", ignoring case.
# The case-insensitive flag is passed as an argument instead of an inline
# "(?i)" in the middle of the pattern: a global inline flag that is not at
# the start of the expression is deprecated and rejected by newer Python
# versions. The dot is escaped so that it only matches a literal ".".
firefox_extension = re.compile(r'/places\.sqlite$', re.IGNORECASE)
|
|
|
|
|
|
def get_files (dirname):
    """Iterate over the entries of the given directory.

    Names of regular files are yielded unchanged, names of directories
    are yielded with a trailing slash. Symbolic links and all other
    kinds of entries are left out."""
    for name in os.listdir(dirname):
        location = os.path.join(dirname, name)
        if not os.path.islink(location):
            if os.path.isfile(location):
                yield name
            elif os.path.isdir(location):
                yield name + "/"
|
|
|
|
|
|
def prepare_urlpath_for_nt (path):
    """
    Adjust a file: URL path so urllib.url2pathname can convert it on NT.

    URLs like 'file://server/path/' result in a path named '/server/path',
    but urllib.url2pathname expects '////server/path'. Paths containing
    a '|' (a drive specifier) are returned unchanged.
    """
    if '|' in path:
        return path
    stripped = path.lstrip("/")
    return "////" + stripped
|
|
|
|
|
|
def get_nt_filename (path):
    """
    Return the given NT path spelled with the case used on disk.

    Each path component is looked up case insensitively in the listing
    of its parent directory, recursing towards the root. If a component
    cannot be found an error is logged and the path is returned as given.
    """
    dirname, basename = os.path.split(path)
    if not basename:
        # nothing left to look up (root or path ending with a separator)
        return path
    wanted = basename.lower()
    for candidate in os.listdir(dirname):
        if candidate.lower() == wanted:
            return os.path.join(get_nt_filename(dirname), candidate)
    log.error(LOG_CHECK, "could not find %r in %r", basename, dirname)
    return path
|
|
|
|
|
|
def is_absolute_path (path):
    """Check if given path is absolute.

    On Windows a path is absolute if it starts with a drive letter
    followed by a colon, or if it is a UNC path (two leading slashes
    or backslashes). On all other systems absolute paths start with
    a slash.

    @param path: the filesystem path to check
    @type path: string
    @return: True if the path is absolute
    @rtype: bool
    """
    if os.name == 'nt':
        if re.match(r"[a-zA-Z]:", path):
            return True
        # UNC paths name a server and share and must not be treated as
        # relative to the current directory; accept both slash styles.
        return path.replace("\\", "/").startswith("//")
    return path.startswith("/")
|
|
|
|
|
|
class FileUrl (urlbase.UrlBase):
    """
    URL checker for links using the file: scheme.
    """

    def init (self, base_ref, base_url, parent_url, recursion_level,
              aggregate, line, column, name, url_encoding):
        """
        Initialize via the base class, then norm the base URL according
        to the platform:
          - a base URL given without parent URL, base reference or
            file: prefix is treated as a filesystem path and made an
            absolute file:// URL
          - under Windows backslashes and the drive specifier are normed
        """
        super(FileUrl, self).init(base_ref, base_url, parent_url,
            recursion_level, aggregate, line, column, name, url_encoding)
        if self.base_url is None:
            return
        location = self.base_url
        is_plain_path = (not parent_url and not base_ref and
                         not location.startswith("file:"))
        if is_plain_path:
            location = os.path.expanduser(location)
            if not is_absolute_path(location):
                location = os.getcwd() + "/" + location
            if os.path.isdir(location):
                location += "/"
            location = "file://" + location
        if os.name == "nt":
            location = location.replace("\\", "/")
            # transform c:/windows into /c|/windows
            location = re.sub("^file://(/?)([a-zA-Z]):", r"file:///\2|",
                              location)
        # transform file://path into file:///path
        location = re.sub("^file://([^/])", r"file:///\1", location)
        self.base_url = unicode(location)

    def build_url (self):
        """
        Build the URL with the base class implementation, drop query
        and fragment, and append a trailing slash to directory paths
        that lack one (adding a warning when doing so).
        """
        super(FileUrl, self).build_url()
        # query and fragment url parts have no meaning for filesystem urls
        for index in (3, 4):
            self.urlparts[index] = ''
        if self.is_directory():
            if not self.urlparts[2].endswith('/'):
                self.add_warning(_("Added trailing slash to directory."),
                                 tag=WARN_FILE_MISSING_SLASH)
                self.urlparts[2] += '/'
        self.url = urlparse.urlunsplit(self.urlparts)

    def add_size_info (self):
        """Store the size of the local file; directories are skipped."""
        if not self.is_directory():
            self.size = fileutil.get_size(self.get_os_filename())
            if self.dlsize == -1:
                self.dlsize = self.size
        # For directories no size is calculated: it would always differ
        # from the size of the index page that is generated for them.

    def check_connection (self):
        """
        Check the link target: directories just get a result message,
        files are opened. Under NT systems the case sensitivity of the
        path is checked as well after opening.
        """
        if self.is_directory():
            self.set_result(_("directory"))
            return
        encoded_url = fileutil.pathencode(self.url)
        self.url_connection = urllib2.urlopen(encoded_url)
        self.check_case_sensitivity()

    def check_case_sensitivity (self):
        """
        On NT, warn if the path from the URL is spelled with a
        different case than the name on the filesystem. Such URLs might
        cause problems when the files are copied to web servers that
        are case sensitive. Does nothing on other systems.
        """
        if os.name == 'nt':
            path = self.get_os_filename()
            realpath = get_nt_filename(path)
            if path != realpath:
                values = {"path": path, "realpath": realpath}
                self.add_warning(
                    _("The URL path %(path)r is not the same as the "
                      "system path %(realpath)r. You should always use "
                      "the system path in URLs.") % values,
                    tag=WARN_FILE_SYSTEM_PATH)

    def read_content (self):
        """Return a (data, size) tuple with the file content, or for
        directories a dummy HTML file with links to the files."""
        if not self.is_directory():
            data, size = super(FileUrl, self).read_content()
            return data, size
        data = get_index_html(get_files(self.get_os_filename()))
        if isinstance(data, unicode):
            data = data.encode("iso8859-1", "ignore")
        return data, len(data)

    def is_html (self):
        """Check if the guessed MIME type of the file maps to HTML."""
        content_kind = self.ContentMimetypes.get(
            fileutil.guess_mimetype(self.url, read=self.get_content))
        return content_kind == "html"

    def is_css (self):
        """
        Check if the guessed MIME type of the file maps to CSS.
        """
        content_kind = self.ContentMimetypes.get(
            fileutil.guess_mimetype(self.url, read=self.get_content))
        return content_kind == "css"

    def is_file (self):
        """
        Tell that this URL refers to a file.

        @return: True
        @rtype: bool
        """
        return True

    def get_os_filename (self):
        """
        Convert the path part of the file:// URL into an os specific
        file path.

        @return: file name
        @rtype: string
        """
        urlpath = self.urlparts[2]
        if os.name == 'nt':
            urlpath = prepare_urlpath_for_nt(urlpath)
        local_path = urllib.url2pathname(urlpath)
        return fileutil.pathencode(local_path)

    def get_temp_filename (self):
        """Get filename for content to parse; this is the local file
        itself."""
        return self.get_os_filename()

    def is_directory (self):
        """
        Check if the file is a directory. Symbolic links to directories
        do not count.

        @return: True iff file is a directory
        @rtype: bool
        """
        filename = self.get_os_filename()
        if not os.path.isdir(filename):
            return False
        return not os.path.islink(filename)

    def is_parseable (self):
        """
        Check if content is parseable for recursion. Directories,
        Firefox bookmark files (if sqlite is available) and files with
        a known content MIME type are parseable.

        @return: True if content is parseable
        @rtype: bool
        """
        if self.is_directory() or (
                has_sqlite and firefox_extension.search(self.url)):
            return True
        mime = fileutil.guess_mimetype(self.url, read=self.get_content)
        return mime in self.ContentMimetypes

    def parse_url (self):
        """
        Parse file contents for new links to check, picking the parser
        by kind of file.
        """
        if self.is_directory():
            # directories are parsed through their generated index page
            self.parse_html()
            return
        if has_sqlite and firefox_extension.search(self.url):
            self.parse_firefox()
            return
        mime = fileutil.guess_mimetype(self.url, read=self.get_content)
        parser_name = "parse_" + self.ContentMimetypes[mime]
        getattr(self, parser_name)()

    def parse_firefox (self):
        """Parse a Firefox3 bookmark file and queue every bookmarked
        URL for checking."""
        log.debug(LOG_CHECK, "Parsing Firefox bookmarks %s", self)
        query = ("SELECT mp.url, mb.title\n"
                 "FROM moz_places mp, moz_bookmarks mb\n"
                 "WHERE mp.hidden=0 AND mp.url NOT LIKE 'place:%' AND\n"
                 "mp.id=mb.fk")
        connection = sqlite3.connect(self.get_os_filename(), timeout=0.5)
        try:
            cursor = connection.cursor()
            try:
                cursor.execute(query)
                for url, title in cursor:
                    # bookmarks without title are named after their URL
                    url_data = get_url_from(url, self.recursion_level+1,
                        self.aggregate, parent_url=self.url,
                        name=(title or url))
                    self.aggregate.urlqueue.put(url_data)
            finally:
                cursor.close()
        finally:
            connection.close()

    def get_intern_pattern (self):
        """
        Get pattern for intern URL matching. The pattern is the escaped
        directory path of this URL prefixed with file://.

        @return: non-empty regex pattern or None
        @rtype: String or None
        """
        absolute = absolute_url(self.base_url, self.base_ref,
                                self.parent_url)
        if not absolute:
            return None
        urlpath = strformat.url_unicode_split(absolute)[2]
        segments = urlutil.splitparams(urlpath)[0].split('/')
        if not self.is_directory():
            # cut off filename to have a directory
            segments = segments[:-1]
        directory = "/".join(segments)
        return "file://%s" % re.escape(directory)
|