Improve content type setting.

2026-04-15 11:51:02 +00:00 · 2014-03-05 20:12:19 +01:00 · 2014-03-05 20:12:19 +01:00 · fab2c2da98
commit fab2c2da98
parent b96e3d08c1
6 changed files with 28 additions and 40 deletions
--- a/linkcheck/checker/fileurl.py
+++ b/linkcheck/checker/fileurl.py
@@ -242,21 +242,18 @@ class FileUrl (urlbase.UrlBase):
            return True
        if firefox.has_sqlite and firefox.extension.search(self.url):
            return True
-        ctype = self.get_content_type()
-        if ctype in self.ContentMimetypes:
+        if self.content_type in self.ContentMimetypes:
            return True
-        log.debug(LOG_CHECK, "File with content type %r is not parseable.", ctype)
+        log.debug(LOG_CHECK, "File with content type %r is not parseable.", self.content_type)
        return False

-    def get_content_type (self):
+    def set_content_type (self):
        """Return URL content type, or an empty string if content
        type could not be found."""
-        if self.content_type is None:
-            if self.url:
-                self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content)
-            else:
-                self.content_type = u""
-        return self.content_type
+        if self.url:
+            self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content)
+        else:
+            self.content_type = u""

    def get_intern_pattern (self, url=None):
        """Get pattern for intern URL matching.
--- a/linkcheck/checker/ftpurl.py
+++ b/linkcheck/checker/ftpurl.py
@@ -165,10 +165,9 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
        """See if URL target is parseable for recursion."""
        if self.is_directory():
            return True
-        ctype = self.get_content_type()
-        if ctype in self.ContentMimetypes:
+        if self.content_type in self.ContentMimetypes:
            return True
-        log.debug(LOG_CHECK, "URL with content type %r is not parseable.", ctype)
+        log.debug(LOG_CHECK, "URL with content type %r is not parseable.", self.content_type)
        return False

    def is_directory (self):
@@ -177,12 +176,10 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
        path = self.urlparts[2]
        return (not path) or path.endswith('/')

-    def get_content_type (self):
-        """Return URL content type, or an empty string if content
+    def set_content_type (self):
+        """Set URL content type, or an empty string if content
        type could not be found."""
-        if self.content_type is None:
-            self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content)
-        return self.content_type
+        self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content)

    def read_content (self):
        """Return URL target content, or in case of directories a dummy HTML
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -153,11 +153,9 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
        if _user is not None and _password is not None:
            self.auth = (_user, _password)

-    def get_content_type (self):
+    def set_content_type (self):
        """Return content MIME type or empty string."""
-        if not self.content_type:
-            self.content_type = headers.get_content_type(self.headers)
-        return self.content_type
+        self.content_type = headers.get_content_type(self.headers)

    def follow_redirections(self, request):
        """Follow all redirections of http response."""
@@ -226,17 +224,14 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
        """
        if not self.valid:
            return False
-        ctype = self.get_content_type()
        # some content types must be validated with the page content
-        if ctype in ("application/xml", "text/xml"):
-            data = self.get_content()
-            io = StringIO(data)
-            rtype = fileutil.guess_mimetype_read(io.read)
+        if self.content_type in ("application/xml", "text/xml"):
+            rtype = fileutil.guess_mimetype_read(self.get_content)
            if rtype is not None:
                # XXX side effect
-                ctype = self.content_type = rtype
-        if ctype not in self.ContentMimetypes:
-            log.debug(LOG_CHECK, "URL with content type %r is not parseable", ctype)
+                self.content_type = rtype
+        if self.content_type not in self.ContentMimetypes:
+            log.debug(LOG_CHECK, "URL with content type %r is not parseable", self.content_type)
            return False
        return True

--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@@ -193,7 +193,7 @@ class UrlBase (object):
        # flag if content should be checked or not
        self.do_check_content = True
        # MIME content type
-        self.content_type = None
+        self.content_type = u""

    def set_result (self, msg, valid=True, overwrite=False):
        """
@@ -247,7 +247,7 @@ class UrlBase (object):
        """Return True iff content is valid and of the given type."""
        if not self.valid:
            return False
-        mime = self.get_content_type()
+        mime = self.content_type
        return self.ContentMimetypes.get(mime) == ctype

    def is_http (self):
@@ -429,6 +429,7 @@ class UrlBase (object):
        log.debug(LOG_CHECK, "checking connection")
        try:
            self.check_connection()
+            self.set_content_type()
            self.add_size_info()
            self.aggregate.plugin_manager.run_connection_plugins(self)
        except tuple(ExcList) as exc:
@@ -600,12 +601,10 @@ class UrlBase (object):
            if not self.has_result:
                self.set_result(_("filtered"))

-    def get_content_type (self):
-        """Return content MIME type or empty string.
+    def set_content_type (self):
+        """Set content MIME type.
        Should be overridden in subclasses."""
-        if self.content_type is None:
-            self.content_type = u""
-        return self.content_type
+        pass

    def can_get_content (self):
        """Indicate wether url get_content() can be called."""
@@ -794,7 +793,7 @@ class UrlBase (object):
          line=self.line,
          column=self.column,
          cache_key=self.cache_key,
-          content_type=self.get_content_type(),
+          content_type=self.content_type,
          level=self.recursion_level,
          modified=self.modified,
        )
--- a/linkcheck/parser/__init__.py
+++ b/linkcheck/parser/__init__.py
@@ -31,7 +31,7 @@ def parse_url(url_data):
    if url_data.is_file() and firefox.has_sqlite and firefox.extension.search(url_data.url):
        return parse_firefox(url_data)
    # determine parse routine according to content types
-    mime = url_data.get_content_type()
+    mime = url_data.content_type
    key = url_data.ContentMimetypes[mime]
    return globals()["parse_"+key](url_data)

--- a/tests/checker/test_base.py
+++ b/tests/checker/test_base.py
@@ -1,5 +1,5 @@
 # -*- coding: iso-8859-1 -*-
-# Copyright (C) 2004-2010 Bastian Kleineidam
+# Copyright (C) 2004-2014 Bastian Kleineidam
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by