Improve content type setting.

This commit is contained in:
Bastian Kleineidam 2014-03-05 20:12:19 +01:00
parent b96e3d08c1
commit fab2c2da98
6 changed files with 28 additions and 40 deletions

View file

@ -242,21 +242,18 @@ class FileUrl (urlbase.UrlBase):
return True
if firefox.has_sqlite and firefox.extension.search(self.url):
return True
ctype = self.get_content_type()
if ctype in self.ContentMimetypes:
if self.content_type in self.ContentMimetypes:
return True
log.debug(LOG_CHECK, "File with content type %r is not parseable.", ctype)
log.debug(LOG_CHECK, "File with content type %r is not parseable.", self.content_type)
return False
def get_content_type (self):
def set_content_type (self):
"""Set URL content type, or an empty string if content
type could not be found."""
if self.content_type is None:
if self.url:
self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content)
else:
self.content_type = u""
return self.content_type
if self.url:
self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content)
else:
self.content_type = u""
def get_intern_pattern (self, url=None):
"""Get pattern for intern URL matching.

View file

@ -165,10 +165,9 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""See if URL target is parseable for recursion."""
if self.is_directory():
return True
ctype = self.get_content_type()
if ctype in self.ContentMimetypes:
if self.content_type in self.ContentMimetypes:
return True
log.debug(LOG_CHECK, "URL with content type %r is not parseable.", ctype)
log.debug(LOG_CHECK, "URL with content type %r is not parseable.", self.content_type)
return False
def is_directory (self):
@ -177,12 +176,10 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
path = self.urlparts[2]
return (not path) or path.endswith('/')
def get_content_type (self):
"""Return URL content type, or an empty string if content
def set_content_type (self):
"""Set URL content type, or an empty string if content
type could not be found."""
if self.content_type is None:
self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content)
return self.content_type
self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content)
def read_content (self):
"""Return URL target content, or in case of directories a dummy HTML

View file

@ -153,11 +153,9 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
if _user is not None and _password is not None:
self.auth = (_user, _password)
def get_content_type (self):
def set_content_type (self):
"""Set content MIME type or empty string."""
if not self.content_type:
self.content_type = headers.get_content_type(self.headers)
return self.content_type
self.content_type = headers.get_content_type(self.headers)
def follow_redirections(self, request):
"""Follow all redirections of http response."""
@ -226,17 +224,14 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
if not self.valid:
return False
ctype = self.get_content_type()
# some content types must be validated with the page content
if ctype in ("application/xml", "text/xml"):
data = self.get_content()
io = StringIO(data)
rtype = fileutil.guess_mimetype_read(io.read)
if self.content_type in ("application/xml", "text/xml"):
rtype = fileutil.guess_mimetype_read(self.get_content)
if rtype is not None:
# XXX side effect
ctype = self.content_type = rtype
if ctype not in self.ContentMimetypes:
log.debug(LOG_CHECK, "URL with content type %r is not parseable", ctype)
self.content_type = rtype
if self.content_type not in self.ContentMimetypes:
log.debug(LOG_CHECK, "URL with content type %r is not parseable", self.content_type)
return False
return True

View file

@ -193,7 +193,7 @@ class UrlBase (object):
# flag if content should be checked or not
self.do_check_content = True
# MIME content type
self.content_type = None
self.content_type = u""
def set_result (self, msg, valid=True, overwrite=False):
"""
@ -247,7 +247,7 @@ class UrlBase (object):
"""Return True iff content is valid and of the given type."""
if not self.valid:
return False
mime = self.get_content_type()
mime = self.content_type
return self.ContentMimetypes.get(mime) == ctype
def is_http (self):
@ -429,6 +429,7 @@ class UrlBase (object):
log.debug(LOG_CHECK, "checking connection")
try:
self.check_connection()
self.set_content_type()
self.add_size_info()
self.aggregate.plugin_manager.run_connection_plugins(self)
except tuple(ExcList) as exc:
@ -600,12 +601,10 @@ class UrlBase (object):
if not self.has_result:
self.set_result(_("filtered"))
def get_content_type (self):
"""Return content MIME type or empty string.
def set_content_type (self):
"""Set content MIME type.
Should be overridden in subclasses."""
if self.content_type is None:
self.content_type = u""
return self.content_type
pass
def can_get_content (self):
"""Indicate whether url get_content() can be called."""
@ -794,7 +793,7 @@ class UrlBase (object):
line=self.line,
column=self.column,
cache_key=self.cache_key,
content_type=self.get_content_type(),
content_type=self.content_type,
level=self.recursion_level,
modified=self.modified,
)

View file

@ -31,7 +31,7 @@ def parse_url(url_data):
if url_data.is_file() and firefox.has_sqlite and firefox.extension.search(url_data.url):
return parse_firefox(url_data)
# determine parse routine according to content types
mime = url_data.get_content_type()
mime = url_data.content_type
key = url_data.ContentMimetypes[mime]
return globals()["parse_"+key](url_data)

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2010 Bastian Kleineidam
# Copyright (C) 2004-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by