Fix missing content types for cached URLs.

This commit is contained in:
Bastian Kleineidam 2010-12-23 07:37:36 +01:00
parent e8d5bbe4be
commit fd3fe8dcaa
5 changed files with 23 additions and 8 deletions

View file

@ -7,6 +7,7 @@ Fixes:
internal links when given as start URL.
- logging: Allow Unicode strings to be written to stdout without
encoding errors on Unix systems.
- logging: Fix missing content type for cached URLs.
- gui: Reset statistics before each run.
Changes:

View file

@ -255,9 +255,12 @@ class FileUrl (urlbase.UrlBase):
self.aggregate.urlqueue.put(url_data)
def get_content_type (self):
if self.url:
return fileutil.guess_mimetype(self.url, read=self.get_content)
return u""
if self.content_type is None:
if self.url:
self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content)
else:
self.content_type = u""
return self.content_type
def get_intern_pattern (self):
"""

View file

@ -206,7 +206,9 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
getattr(self, "parse_"+key)()
def get_content_type (self, read=None):
return fileutil.guess_mimetype(self.url, read=read)
if self.content_type is None:
self.content_type = fileutil.guess_mimetype(self.url, read=read)
return self.content_type
def read_content (self):
"""Return URL target content, or in case of directories a dummy HTML

View file

@ -284,9 +284,12 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
def get_content_type (self):
"""Return content MIME type or empty string."""
if self.headers:
return headers.get_content_type(self.headers)
return u""
if self.content_type is None:
if self.headers:
self.content_type = headers.get_content_type(self.headers)
else:
self.content_type = u""
return self.content_type
def follow_redirections (self, response, set_result=True):
"""

View file

@ -179,6 +179,8 @@ class UrlBase (object):
self.title = None
# flag if content should be checked or not
self.do_check_content = True
# MIME content type
self.content_type = None
def set_result (self, msg, valid=True, overwrite=False):
"""
@ -295,6 +297,7 @@ class UrlBase (object):
self.dltime = cache_data["dltime"]
self.dlsize = cache_data["dlsize"]
self.anchors = cache_data["anchors"]
self.content_type = cache_data["content_type"]
self.cached = True
if anchor_changed and self.valid and self.anchor:
# recheck anchor
@ -312,6 +315,7 @@ class UrlBase (object):
"dlsize": self.dlsize,
"anchors": self.anchors,
"anchor": self.anchor,
"content_type": self.get_content_type(),
}
def get_alias_cache_data (self):
@ -654,7 +658,9 @@ class UrlBase (object):
def get_content_type (self):
"""Return content MIME type or empty string.
Should be overridden in subclasses."""
return u""
if self.content_type is None:
self.content_type = u""
return self.content_type
def can_get_content (self):
"""Indicate wether url get_content() can be called."""