From fd3fe8dcaae5f330b090b30fc994f10d406c8a15 Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Thu, 23 Dec 2010 07:37:36 +0100 Subject: [PATCH] Fix missing content types for cached URLs. --- doc/changelog.txt | 1 + linkcheck/checker/fileurl.py | 9 ++++++--- linkcheck/checker/ftpurl.py | 4 +++- linkcheck/checker/httpurl.py | 9 ++++++--- linkcheck/checker/urlbase.py | 8 +++++++- 5 files changed, 23 insertions(+), 8 deletions(-) diff --git a/doc/changelog.txt b/doc/changelog.txt index 91d55731..0b097fc7 100644 --- a/doc/changelog.txt +++ b/doc/changelog.txt @@ -7,6 +7,7 @@ Fixes: internal links when given as start URL. - logging: Allow Unicode strings to be written to stdout without encoding errors on Unix systems. +- logging: Fix missing content type for cached URLs. - gui: Reset statistics before each run. Changes: diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py index c719e951..5a19c651 100644 --- a/linkcheck/checker/fileurl.py +++ b/linkcheck/checker/fileurl.py @@ -255,9 +255,12 @@ class FileUrl (urlbase.UrlBase): self.aggregate.urlqueue.put(url_data) def get_content_type (self): - if self.url: - return fileutil.guess_mimetype(self.url, read=self.get_content) - return u"" + if self.content_type is None: + if self.url: + self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content) + else: + self.content_type = u"" + return self.content_type def get_intern_pattern (self): """ diff --git a/linkcheck/checker/ftpurl.py b/linkcheck/checker/ftpurl.py index b99d8a2b..2034939d 100644 --- a/linkcheck/checker/ftpurl.py +++ b/linkcheck/checker/ftpurl.py @@ -206,7 +206,9 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): getattr(self, "parse_"+key)() def get_content_type (self, read=None): - return fileutil.guess_mimetype(self.url, read=read) + if self.content_type is None: + self.content_type = fileutil.guess_mimetype(self.url, read=read) + return self.content_type def read_content (self): """Return URL target content, or in case of directories a dummy HTML diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index 3fa09182..9e9b078f 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -284,9 +284,12 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): def get_content_type (self): """Return content MIME type or empty string.""" - if self.headers: - return headers.get_content_type(self.headers) - return u"" + if self.content_type is None: + if self.headers: + self.content_type = headers.get_content_type(self.headers) + else: + self.content_type = u"" + return self.content_type def follow_redirections (self, response, set_result=True): """ diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index e97fcdff..c62005f4 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -179,6 +179,8 @@ class UrlBase (object): self.title = None # flag if content should be checked or not self.do_check_content = True + # MIME content type + self.content_type = None def set_result (self, msg, valid=True, overwrite=False): """ @@ -295,6 +297,7 @@ class UrlBase (object): self.dltime = cache_data["dltime"] self.dlsize = cache_data["dlsize"] self.anchors = cache_data["anchors"] + self.content_type = cache_data["content_type"] self.cached = True if anchor_changed and self.valid and self.anchor: # recheck anchor @@ -312,6 +315,7 @@ class UrlBase (object): "dlsize": self.dlsize, "anchors": self.anchors, "anchor": self.anchor, + "content_type": self.get_content_type(), } def get_alias_cache_data (self): @@ -654,7 +658,9 @@ class UrlBase (object): def get_content_type (self): """Return content MIME type or empty string. Should be overridden in subclasses.""" - return u"" + if self.content_type is None: + self.content_type = u"" + return self.content_type def can_get_content (self): """Indicate wether url get_content() can be called."""