From 380f14453be48b7b0d7af625fa63d0714bfadaf4 Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Wed, 5 Mar 2014 19:23:58 +0100 Subject: [PATCH] Fix mimetype guessing from content. --- linkcheck/fileutil.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/linkcheck/fileutil.py b/linkcheck/fileutil.py index 6e796d29..35d15508 100644 --- a/linkcheck/fileutil.py +++ b/linkcheck/fileutil.py @@ -233,16 +233,11 @@ def guess_mimetype (filename, read=None): # Special case for Google Chrome Bookmark files. if not mime and basename == 'Bookmarks': mime = 'text/plain' - # Mime type text/plain can be differentiated further with content reading. - if mime == "text/plain" and read is not None: - # try to read some content and do a poor man's file(1) - try: - data = read()[:30] - for mime, ro in PARSE_CONTENTS.items(): - if ro.search(data): - break - except Exception: - pass + # Some mime types can be differentiated further with content reading. + if mime in ("text/plain", "application/xml", "text/xml") and read is not None: + read_mime = guess_mimetype_read(read) + if read_mime is not None: + mime = read_mime if not mime: mime = "application/octet-stream" elif ";" in mime: @@ -251,6 +246,21 @@ def guess_mimetype (filename, read=None): return mime.strip().lower() +def guess_mimetype_read(read): + """Try to read some content and do a poor man's file(1).""" + mime = None + try: + data = read()[:70] + except Exception: + pass + else: + for cmime, ro in PARSE_CONTENTS.items(): + if ro.search(data): + mime = cmime + break + return mime + + def get_temp_file (mode='r', **kwargs): """Return tuple (open file object, filename) pointing to a temporary file."""