From fab2c2da98aa707c5bfe3c24909e274039ea9a7a Mon Sep 17 00:00:00 2001
From: Bastian Kleineidam <bastian.kleineidam@web.de>
Date: Wed, 5 Mar 2014 20:12:19 +0100
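Subject: [PATCH] Improve content type setting.

Replace the lazy get_content_type() accessors in UrlBase, FileUrl,
FtpUrl and HttpUrl with set_content_type() methods. UrlBase now calls
set_content_type() right after check_connection(), and callers read
the content_type attribute directly. The attribute now defaults to an
empty string instead of None.
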
---
 linkcheck/checker/fileurl.py | 17 +++++++----------
 linkcheck/checker/ftpurl.py  | 13 +++++--------
 linkcheck/checker/httpurl.py | 19 +++++++------------
 linkcheck/checker/urlbase.py | 15 +++++++--------
 linkcheck/parser/__init__.py |  2 +-
 tests/checker/test_base.py   |  2 +-
 6 files changed, 28 insertions(+), 40 deletions(-)

diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py
index da839958..6c5d3187 100644
--- a/linkcheck/checker/fileurl.py
+++ b/linkcheck/checker/fileurl.py
@@ -242,21 +242,18 @@ class FileUrl (urlbase.UrlBase):
             return True
         if firefox.has_sqlite and firefox.extension.search(self.url):
             return True
-        ctype = self.get_content_type()
-        if ctype in self.ContentMimetypes:
+        if self.content_type in self.ContentMimetypes:
             return True
-        log.debug(LOG_CHECK, "File with content type %r is not parseable.", ctype)
+        log.debug(LOG_CHECK, "File with content type %r is not parseable.", self.content_type)
         return False
 
-    def get_content_type (self):
+    def set_content_type (self):
         """Return URL content type, or an empty string if content
         type could not be found."""
-        if self.content_type is None:
-            if self.url:
-                self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content)
-            else:
-                self.content_type = u""
-        return self.content_type
+        if self.url:
+            self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content)
+        else:
+            self.content_type = u""
 
     def get_intern_pattern (self, url=None):
         """Get pattern for intern URL matching.
diff --git a/linkcheck/checker/ftpurl.py b/linkcheck/checker/ftpurl.py
index af21b61b..36455463 100644
--- a/linkcheck/checker/ftpurl.py
+++ b/linkcheck/checker/ftpurl.py
@@ -165,10 +165,9 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         """See if URL target is parseable for recursion."""
         if self.is_directory():
             return True
-        ctype = self.get_content_type()
-        if ctype in self.ContentMimetypes:
+        if self.content_type in self.ContentMimetypes:
             return True
-        log.debug(LOG_CHECK, "URL with content type %r is not parseable.", ctype)
+        log.debug(LOG_CHECK, "URL with content type %r is not parseable.", self.content_type)
         return False
 
     def is_directory (self):
@@ -177,12 +176,10 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         path = self.urlparts[2]
         return (not path) or path.endswith('/')
 
-    def get_content_type (self):
-        """Return URL content type, or an empty string if content
+    def set_content_type (self):
+        """Set URL content type, or an empty string if content
         type could not be found."""
-        if self.content_type is None:
-            self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content)
-        return self.content_type
+        self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content)
 
     def read_content (self):
         """Return URL target content, or in case of directories a dummy HTML
diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py
index a38a9640..9d1122b8 100644
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -153,11 +153,9 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         if _user is not None and _password is not None:
             self.auth = (_user, _password)
 
-    def get_content_type (self):
+    def set_content_type (self):
         """Return content MIME type or empty string."""
-        if not self.content_type:
-            self.content_type = headers.get_content_type(self.headers)
-        return self.content_type
+        self.content_type = headers.get_content_type(self.headers)
 
     def follow_redirections(self, request):
         """Follow all redirections of http response."""
@@ -226,17 +224,14 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         """
         if not self.valid:
             return False
-        ctype = self.get_content_type()
         # some content types must be validated with the page content
-        if ctype in ("application/xml", "text/xml"):
-            data = self.get_content()
-            io = StringIO(data)
-            rtype = fileutil.guess_mimetype_read(io.read)
+        if self.content_type in ("application/xml", "text/xml"):
+            rtype = fileutil.guess_mimetype_read(self.get_content)
             if rtype is not None:
                 # XXX side effect
-                ctype = self.content_type = rtype
-        if ctype not in self.ContentMimetypes:
-            log.debug(LOG_CHECK, "URL with content type %r is not parseable", ctype)
+                self.content_type = rtype
+        if self.content_type not in self.ContentMimetypes:
+            log.debug(LOG_CHECK, "URL with content type %r is not parseable", self.content_type)
             return False
         return True
 
diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py
index fe2a7399..fb8d8b28 100644
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@@ -193,7 +193,7 @@ class UrlBase (object):
         # flag if content should be checked or not
         self.do_check_content = True
         # MIME content type
-        self.content_type = None
+        self.content_type = u""
 
     def set_result (self, msg, valid=True, overwrite=False):
         """
@@ -247,7 +247,7 @@ class UrlBase (object):
         """Return True iff content is valid and of the given type."""
         if not self.valid:
             return False
-        mime = self.get_content_type()
+        mime = self.content_type
         return self.ContentMimetypes.get(mime) == ctype
 
     def is_http (self):
@@ -429,6 +429,7 @@ class UrlBase (object):
         log.debug(LOG_CHECK, "checking connection")
         try:
             self.check_connection()
+            self.set_content_type()
             self.add_size_info()
             self.aggregate.plugin_manager.run_connection_plugins(self)
         except tuple(ExcList) as exc:
@@ -600,12 +601,10 @@ class UrlBase (object):
             if not self.has_result:
                 self.set_result(_("filtered"))
 
-    def get_content_type (self):
-        """Return content MIME type or empty string.
+    def set_content_type (self):
+        """Set content MIME type.
         Should be overridden in subclasses."""
-        if self.content_type is None:
-            self.content_type = u""
-        return self.content_type
+        pass
 
     def can_get_content (self):
         """Indicate wether url get_content() can be called."""
@@ -794,7 +793,7 @@ class UrlBase (object):
           line=self.line,
           column=self.column,
           cache_key=self.cache_key,
-          content_type=self.get_content_type(),
+          content_type=self.content_type,
           level=self.recursion_level,
           modified=self.modified,
         )
diff --git a/linkcheck/parser/__init__.py b/linkcheck/parser/__init__.py
index 39575c34..fff2a9c2 100644
--- a/linkcheck/parser/__init__.py
+++ b/linkcheck/parser/__init__.py
@@ -31,7 +31,7 @@ def parse_url(url_data):
     if url_data.is_file() and firefox.has_sqlite and firefox.extension.search(url_data.url):
         return parse_firefox(url_data)
     # determine parse routine according to content types
-    mime = url_data.get_content_type()
+    mime = url_data.content_type
     key = url_data.ContentMimetypes[mime]
     return globals()["parse_"+key](url_data)
 
diff --git a/tests/checker/test_base.py b/tests/checker/test_base.py
index 0020c7ef..7e9f75ad 100644
--- a/tests/checker/test_base.py
+++ b/tests/checker/test_base.py
@@ -1,5 +1,5 @@
 # -*- coding: iso-8859-1 -*-
-# Copyright (C) 2004-2010 Bastian Kleineidam
+# Copyright (C) 2004-2014 Bastian Kleineidam
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by