mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-16 22:10:26 +00:00
Move mime stuff into own submodule.
This commit is contained in:
parent
9b794b936c
commit
4b28e6e860
4 changed files with 146 additions and 98 deletions
|
|
@ -19,17 +19,14 @@ File and path utilities.
|
|||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import locale
|
||||
import stat
|
||||
import fnmatch
|
||||
import mimetypes
|
||||
import tempfile
|
||||
import importlib
|
||||
from distutils.spawn import find_executable
|
||||
|
||||
from .decorators import memoized
|
||||
from . import log, LOG_CHECK
|
||||
|
||||
def write_file (filename, content, backup=False, callback=None):
|
||||
"""Overwrite a possibly existing file with new content. Do this
|
||||
|
|
@ -187,82 +184,6 @@ def has_changed (filename):
|
|||
return mtime > _mtime_cache[key]
|
||||
|
||||
|
||||
# Shared MimeTypes instance; stays None until init_mimedb() succeeds.
mimedb = None


def init_mimedb():
    """Initialize the local MIME database.

    Creates a mimetypes.MimeTypes instance, stores it in the module
    global ``mimedb`` and registers the additional extensions handled
    below. If the instance cannot be created, the error is logged and
    ``mimedb`` is left unchanged.
    """
    global mimedb
    try:
        mimedb = mimetypes.MimeTypes(strict=False)
    # StandardError exists only on Python 2; Exception is available on
    # both Python 2 and Python 3.
    except Exception as msg:
        log.error(LOG_CHECK, "could not initialize MIME database: %s" % msg)
        return
    # For Opera bookmark files (opera6.adr)
    add_mimetype(mimedb, 'text/plain', '.adr')
    # To recognize PHP files as HTML with content check.
    add_mimetype(mimedb, 'application/x-httpd-php', '.php')
    # To recognize WML files
    add_mimetype(mimedb, 'text/vnd.wap.wml', '.wml')
|
||||
|
||||
|
||||
def add_mimetype(mimedb, mimetype, extension):
    """Register mimetype for extension, replacing any existing mapping.

    @param mimedb: MimeTypes instance to modify
    @param mimetype: MIME type string to associate with the extension
    @param extension: file extension including the leading dot
    """
    # An extension that is already part of the strict (standard) map has
    # to be replaced there; everything else goes into the non-strict map.
    if extension in mimedb.types_map[True]:
        mimedb.add_type(mimetype, extension, strict=True)
    else:
        mimedb.add_type(mimetype, extension, strict=False)
|
||||
|
||||
|
||||
# Content signatures, keyed by the MIME type they identify.
# guess_mimetype_read() searches the start of a file with these patterns
# to refine a generic type obtained from the file extension.
# Case-insensitivity is requested with re.IGNORECASE instead of an inline
# "(?i)": an inline global flag that is not at the very start of the
# pattern (as in '^(?i)...') is rejected by newer Python versions.
PARSE_CONTENTS = {
    "text/html": re.compile(r'^<(!DOCTYPE html|html|head|title)', re.IGNORECASE),
    "text/plain+opera": re.compile(r'^Opera Hotlist'),
    "text/plain+chromium": re.compile(r'^{\s*"checksum":'),
    "text/plain+linkchecker": re.compile(r'^# LinkChecker URL list', re.IGNORECASE),
    "application/xml+sitemapindex": re.compile(r'<\?xml[^<]+<sitemapindex\s+', re.IGNORECASE),
    "application/xml+sitemap": re.compile(r'<\?xml[^<]+<urlset\s+', re.IGNORECASE),
}
|
||||
|
||||
def guess_mimetype (filename, read=None):
    """Guess the MIME type of a file.

    The file name is looked up in the module MIME database first. Two
    bookmark file names get special treatment, and generic text/XML
    types are refined by inspecting the content when a reader is given.

    @param filename: name or path of the file
    @param read: optional callable returning the file content
    @return: lowercased MIME type without parameters, or
      'application/octet-stream' if it could not be determined
    """
    guessed = None
    if mimedb:
        guessed, _encoding = mimedb.guess_type(filename, strict=False)
    if not guessed:
        name = os.path.basename(filename)
        # Special case for Safari Bookmark files
        if name == 'Bookmarks.plist':
            return 'application/x-plist+safari'
        # Special case for Google Chrome Bookmark files.
        if name == 'Bookmarks':
            guessed = 'text/plain'
    # Some mime types can be differentiated further with content reading.
    if read is not None and guessed in ("text/plain", "application/xml", "text/xml"):
        refined = guess_mimetype_read(read)
        if refined is not None:
            guessed = refined
    if not guessed:
        return "application/octet-stream"
    # Drop any ";parameter" suffix; only the bare type is wanted.
    return guessed.split(';')[0].strip().lower()
|
||||
|
||||
|
||||
def guess_mimetype_read(read):
    """Guess a MIME type from the beginning of some content.

    A poor man's file(1): the first 70 items of whatever read() returns
    are searched with the PARSE_CONTENTS patterns.

    @param read: callable without arguments returning the content
    @return: the MIME type of the first matching pattern, or None if
      reading failed or nothing matched
    """
    try:
        head = read()[:70]
    except Exception:
        # Best effort only: unreadable content simply yields no guess.
        return None
    for content_type, pattern in PARSE_CONTENTS.items():
        if pattern.search(head):
            return content_type
    return None
|
||||
|
||||
|
||||
def get_temp_file (mode='r', **kwargs):
|
||||
"""Return tuple (open file object, filename) pointing to a temporary
|
||||
file."""
|
||||
|
|
@ -304,5 +225,3 @@ def is_writable(filename):
|
|||
parentdir = os.path.dirname(filename)
|
||||
return os.path.isdir(parentdir) and os.access(parentdir, os.W_OK)
|
||||
return os.path.isfile(filename) and os.access(filename, os.W_OK)
|
||||
|
||||
# Populate the MIME database once, when this module is imported.
init_mimedb()
|
||||
|
|
|
|||
104
linkcheck/mimeutil.py
Normal file
104
linkcheck/mimeutil.py
Normal file
|
|
@ -0,0 +1,104 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2005-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
MIME type utilities.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import mimetypes
|
||||
|
||||
from . import log
|
||||
from .logconf import LOG_CHECK
|
||||
|
||||
# Shared MimeTypes instance; stays None until init_mimedb() succeeds.
mimedb = None


def init_mimedb():
    """Initialize the local MIME database.

    Creates a mimetypes.MimeTypes instance, stores it in the module
    global ``mimedb`` and registers the additional extensions handled
    below. If the instance cannot be created, the error is logged and
    ``mimedb`` is left unchanged.
    """
    global mimedb
    try:
        mimedb = mimetypes.MimeTypes(strict=False)
    # StandardError exists only on Python 2; Exception is available on
    # both Python 2 and Python 3.
    except Exception as msg:
        log.error(LOG_CHECK, "could not initialize MIME database: %s" % msg)
        return
    # For Opera bookmark files (opera6.adr)
    add_mimetype(mimedb, 'text/plain', '.adr')
    # To recognize PHP files as HTML with content check.
    add_mimetype(mimedb, 'application/x-httpd-php', '.php')
    # To recognize WML files
    add_mimetype(mimedb, 'text/vnd.wap.wml', '.wml')
|
||||
|
||||
|
||||
def add_mimetype(mimedb, mimetype, extension):
    """Register mimetype for extension, replacing any existing mapping.

    @param mimedb: MimeTypes instance to modify
    @param mimetype: MIME type string to associate with the extension
    @param extension: file extension including the leading dot
    """
    # An extension that is already part of the strict (standard) map has
    # to be replaced there; everything else goes into the non-strict map.
    if extension in mimedb.types_map[True]:
        mimedb.add_type(mimetype, extension, strict=True)
    else:
        mimedb.add_type(mimetype, extension, strict=False)
|
||||
|
||||
|
||||
# Content signatures, keyed by the MIME type they identify.
# guess_mimetype_read() searches the start of a file with these patterns
# to refine a generic type obtained from the file extension.
# Case-insensitivity is requested with re.IGNORECASE instead of an inline
# "(?i)": an inline global flag that is not at the very start of the
# pattern (as in '^(?i)...') is rejected by newer Python versions.
PARSE_CONTENTS = {
    "text/html": re.compile(r'^<(!DOCTYPE html|html|head|title)', re.IGNORECASE),
    "text/plain+opera": re.compile(r'^Opera Hotlist'),
    "text/plain+chromium": re.compile(r'^{\s*"checksum":'),
    "text/plain+linkchecker": re.compile(r'^# LinkChecker URL list', re.IGNORECASE),
    "application/xml+sitemapindex": re.compile(r'<\?xml[^<]+<sitemapindex\s+', re.IGNORECASE),
    "application/xml+sitemap": re.compile(r'<\?xml[^<]+<urlset\s+', re.IGNORECASE),
}
|
||||
|
||||
def guess_mimetype (filename, read=None):
    """Guess the MIME type of a file.

    The file name is looked up in the module MIME database first. Two
    bookmark file names get special treatment, and generic text/XML
    types are refined by inspecting the content when a reader is given.

    @param filename: name or path of the file
    @param read: optional callable returning the file content
    @return: lowercased MIME type without parameters, or
      'application/octet-stream' if it could not be determined
    """
    guessed = None
    if mimedb:
        guessed, _encoding = mimedb.guess_type(filename, strict=False)
    if not guessed:
        name = os.path.basename(filename)
        # Special case for Safari Bookmark files
        if name == 'Bookmarks.plist':
            return 'application/x-plist+safari'
        # Special case for Google Chrome Bookmark files.
        if name == 'Bookmarks':
            guessed = 'text/plain'
    # Some mime types can be differentiated further with content reading.
    if read is not None and guessed in ("text/plain", "application/xml", "text/xml"):
        refined = guess_mimetype_read(read)
        if refined is not None:
            guessed = refined
    if not guessed:
        return "application/octet-stream"
    # Drop any ";parameter" suffix; only the bare type is wanted.
    return guessed.split(';')[0].strip().lower()
|
||||
|
||||
|
||||
def guess_mimetype_read(read):
    """Guess a MIME type from the beginning of some content.

    A poor man's file(1): the first 70 items of whatever read() returns
    are searched with the PARSE_CONTENTS patterns.

    @param read: callable without arguments returning the content
    @return: the MIME type of the first matching pattern, or None if
      reading failed or nothing matched
    """
    try:
        head = read()[:70]
    except Exception:
        # Best effort only: unreadable content simply yields no guess.
        return None
    for content_type, pattern in PARSE_CONTENTS.items():
        if pattern.search(head):
            return content_type
    return None
|
||||
|
||||
|
||||
# Populate the MIME database once, when this module is imported.
init_mimedb()
|
||||
|
|
@ -19,8 +19,6 @@ Test file utility functions.
|
|||
"""
|
||||
|
||||
import unittest
|
||||
import os
|
||||
from . import get_file
|
||||
import linkcheck.fileutil
|
||||
|
||||
file_existing = __file__
|
||||
|
|
@ -37,18 +35,3 @@ class TestFileutil (unittest.TestCase):
|
|||
def test_mtime (self):
    """Check that get_mtime() is positive for file_existing and 0 for
    file_non_existing."""
    existing_mtime = linkcheck.fileutil.get_mtime(file_existing)
    self.assertTrue(existing_mtime > 0)
    missing_mtime = linkcheck.fileutil.get_mtime(file_non_existing)
    self.assertEqual(missing_mtime, 0)
|
||||
|
||||
def mime_test (self, filename, mime_expected):
    """Assert that guess_mimetype() reports mime_expected for the test
    data file with the given name."""
    path = get_file(filename)
    with open(path) as handle:
        detected = linkcheck.fileutil.guess_mimetype(path, read=handle.read)
        self.assertEqual(detected, mime_expected)
|
||||
|
||||
def test_mime (self):
    """Check MIME detection for bookmark, WML and sitemap test files."""
    expectations = (
        (os.path.join("plist_binary", "Bookmarks.plist"), "application/x-plist+safari"),
        (os.path.join("plist_xml", "Bookmarks.plist"), "application/x-plist+safari"),
        ("file.wml", "text/vnd.wap.wml"),
        ("sitemap.xml", "application/xml+sitemap"),
        ("sitemapindex.xml", "application/xml+sitemapindex"),
    )
    for filename, mime_expected in expectations:
        self.mime_test(filename, mime_expected)
|
||||
|
|
|
|||
42
tests/test_mimeutil.py
Normal file
42
tests/test_mimeutil.py
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2010-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
Test mime utility functions.
|
||||
"""
|
||||
|
||||
import unittest
|
||||
import os
|
||||
from . import get_file
|
||||
import linkcheck.mimeutil
|
||||
|
||||
class TestMiMeutil (unittest.TestCase):
    """Test mime utility functions."""

    def mime_test (self, filename, mime_expected):
        """Assert that guess_mimetype() reports mime_expected for the
        test data file with the given name."""
        path = get_file(filename)
        with open(path) as handle:
            detected = linkcheck.mimeutil.guess_mimetype(path, read=handle.read)
            self.assertEqual(detected, mime_expected)

    def test_mime (self):
        """Check MIME detection for bookmark, WML and sitemap test files."""
        expectations = (
            (os.path.join("plist_binary", "Bookmarks.plist"), "application/x-plist+safari"),
            (os.path.join("plist_xml", "Bookmarks.plist"), "application/x-plist+safari"),
            ("file.wml", "text/vnd.wap.wml"),
            ("sitemap.xml", "application/xml+sitemap"),
            ("sitemapindex.xml", "application/xml+sitemapindex"),
        )
        for filename, mime_expected in expectations:
            self.mime_test(filename, mime_expected)
|
||||
Loading…
Reference in a new issue