linkchecker/linkcheck/fileutil.py

# -*- coding: iso-8859-1 -*-
# Copyright (C) 2005-2010 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
File and path utilities.
"""

import os
import re
import locale
import stat
import fnmatch
import mimetypes


def write_file (filename, content, backup=False, callback=None):
    """Overwrite a possibly existing file with new content. Do this
    in a manner that does not leave truncated or broken files behind.
    @param filename: name of file to write
    @type filename: string
    @param content: file content to write
    @type content: string
    @param backup: if backup file should be left
    @type backup: bool
    @param callback: non-default storage function
    @type callback: None or function taking two parameters (fileobj, content)
    """
    # first write in a temp file
    f = file(filename+".tmp", 'wb')
    if callback is None:
        f.write(content)
    else:
        callback(f, content)
    f.close()
    # move orig file to backup
    if os.path.exists(filename):
        os.rename(filename, filename+".bak")
    # move temp file to orig
    os.rename(filename+".tmp", filename)
    # remove backup
    if not backup and os.path.exists(filename+".bak"):
        os.remove(filename+".bak")


def has_module (name):
    """Test if given module can be imported.
    @return: flag if import is successful
    @rtype: bool
    """
    try:
        exec "import %s" % name
        return True
    except ImportError:
        return False


class GlobDirectoryWalker (object):
    """A forward iterator that traverses a directory tree."""

    def __init__ (self, directory, pattern="*"):
        """Set start directory and pattern matcher."""
        self.stack = [directory]
        self.pattern = pattern
        self.files = []
        self.index = 0

    def __getitem__ (self, index):
        """Search for next filename."""
        while True:
            try:
                filename = self.files[self.index]
                self.index += 1
            except IndexError:
                # Pop next directory from stack. This effectively
                # stops the iteration if stack is empty.
                self.directory = self.stack.pop()
                self.files = os.listdir(self.directory)
                self.index = 0
            else:
                # got a filename
                fullname = os.path.join(self.directory, filename)
                if os.path.isdir(fullname) and not os.path.islink(fullname):
                    self.stack.append(fullname)
                if fnmatch.fnmatch(filename, self.pattern):
                    return fullname

# alias
rglob = GlobDirectoryWalker


class Buffer (object):
    """Holds buffered data"""

    def __init__ (self, empty=''):
        """Initialize buffer."""
        self.empty = self.buf = empty
        self.tmpbuf = []
        self.pos = 0

    def __len__ (self):
        """Buffer length."""
        return self.pos

    def write (self, data):
        """Write data to buffer."""
        self.tmpbuf.append(data)
        self.pos += len(data)

    def flush (self, overlap=0):
        """Flush buffered data and return it."""
        self.buf += self.empty.join(self.tmpbuf)
        self.tmpbuf = []
        if overlap and overlap < self.pos:
            data = self.buf[:-overlap]
            self.buf = self.buf[-overlap:]
        else:
            data = self.buf
            self.buf = self.empty
        return data


def get_mtime (filename):
    """Return modification time of filename or zero on errors."""
    try:
        return os.stat(filename)[stat.ST_MTIME]
    except os.error:
        return 0


def get_size (filename):
    """Return file size in Bytes, or -1 on error."""
    try:
        return os.stat(filename)[stat.ST_SIZE]
    except os.error:
        return -1


# http://developer.gnome.org/doc/API/2.0/glib/glib-running.html
if "G_FILENAME_ENCODING" in os.environ:
    FSCODING = os.environ["G_FILENAME_ENCODING"].split(",")[0]
    if FSCODING == "@locale":
        FSCODING = locale.getpreferredencoding()
elif "G_BROKEN_FILENAMES" in os.environ:
    FSCODING = locale.getpreferredencoding()
else:
    FSCODING = "utf-8"

def pathencode (path):
    if isinstance(path, unicode) and not os.path.supports_unicode_filenames:
        path = path.encode(FSCODING, "replace")
    return path


# cache for modified check {absolute filename -> mtime}
_mtime_cache = {}
def has_changed (filename):
    """Check if filename has changed since the last check. If this
    is the first check, assume the file is changed."""
    key = os.path.abspath(filename)
    mtime = get_mtime(key)
    if key not in _mtime_cache:
        _mtime_cache[key] = mtime
        return True
    return mtime > _mtime_cache[key]


mimedb = mimetypes.MimeTypes(strict=False)

# if file extension lookup was unsuccessful, look at the content
PARSE_CONTENTS = {
    "text/html": re.compile(r'^(?i)<(!DOCTYPE html|html|head|title)'),
    "text/plain+opera": re.compile(r'^Opera Hotlist'),
    "text/plain+linkchecker": re.compile(r'(?i)^# LinkChecker URL list'),
}

def guess_mimetype (filename, read=None):
    """Return MIME type of file, or 'application/octet-stream' if it could
    not be determined."""
    mime, encoding = mimedb.guess_type(filename, strict=False)
    # Mime type text/plain can be differentiated further with content reading.
    if (mime == "text/plain" or not mime) and read is not None:
        # try to read some content and do a poor man's file(1)
        # XXX replace with file(1) on Unix systems
        try:
            data = read()[:30]
            for mime, ro in PARSE_CONTENTS.items():
                if ro.search(data):
                    break
        except Exception:
            pass
    if not mime:
        mime = "application/octet-stream"
    elif ";" in mime:
        # split off not needed extension info
        mime = mime.split(';')[0]
    return mime.strip().lower()