From fab2c2da98aa707c5bfe3c24909e274039ea9a7a Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Wed, 5 Mar 2014 20:12:19 +0100 Subject: [PATCH] Improve content type setting. --- linkcheck/checker/fileurl.py | 17 +++++++---------- linkcheck/checker/ftpurl.py | 13 +++++-------- linkcheck/checker/httpurl.py | 19 +++++++------------ linkcheck/checker/urlbase.py | 15 +++++++-------- linkcheck/parser/__init__.py | 2 +- tests/checker/test_base.py | 2 +- 6 files changed, 28 insertions(+), 40 deletions(-) diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py index da839958..6c5d3187 100644 --- a/linkcheck/checker/fileurl.py +++ b/linkcheck/checker/fileurl.py @@ -242,21 +242,18 @@ class FileUrl (urlbase.UrlBase): return True if firefox.has_sqlite and firefox.extension.search(self.url): return True - ctype = self.get_content_type() - if ctype in self.ContentMimetypes: + if self.content_type in self.ContentMimetypes: return True - log.debug(LOG_CHECK, "File with content type %r is not parseable.", ctype) + log.debug(LOG_CHECK, "File with content type %r is not parseable.", self.content_type) return False - def get_content_type (self): + def set_content_type (self): """Return URL content type, or an empty string if content type could not be found.""" - if self.content_type is None: - if self.url: - self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content) - else: - self.content_type = u"" - return self.content_type + if self.url: + self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content) + else: + self.content_type = u"" def get_intern_pattern (self, url=None): """Get pattern for intern URL matching. diff --git a/linkcheck/checker/ftpurl.py b/linkcheck/checker/ftpurl.py index af21b61b..36455463 100644 --- a/linkcheck/checker/ftpurl.py +++ b/linkcheck/checker/ftpurl.py @@ -165,10 +165,9 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): """See if URL target is parseable for recursion.""" if self.is_directory(): return True - ctype = self.get_content_type() - if ctype in self.ContentMimetypes: + if self.content_type in self.ContentMimetypes: return True - log.debug(LOG_CHECK, "URL with content type %r is not parseable.", ctype) + log.debug(LOG_CHECK, "URL with content type %r is not parseable.", self.content_type) return False def is_directory (self): @@ -177,12 +176,10 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): path = self.urlparts[2] return (not path) or path.endswith('/') - def get_content_type (self): - """Return URL content type, or an empty string if content + def set_content_type (self): + """Set URL content type, or an empty string if content type could not be found.""" - if self.content_type is None: - self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content) - return self.content_type + self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content) def read_content (self): """Return URL target content, or in case of directories a dummy HTML diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index a38a9640..9d1122b8 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -153,11 +153,9 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): if _user is not None and _password is not None: self.auth = (_user, _password) - def get_content_type (self): + def set_content_type (self): """Return content MIME type or empty string.""" - if not self.content_type: - self.content_type = headers.get_content_type(self.headers) - return self.content_type + self.content_type = headers.get_content_type(self.headers) def follow_redirections(self, request): """Follow all redirections of http response.""" @@ -226,17 +224,14 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): """ if not self.valid: return False - ctype = self.get_content_type() # some content types must be validated with the page content - if ctype in ("application/xml", "text/xml"): - data = self.get_content() - io = StringIO(data) - rtype = fileutil.guess_mimetype_read(io.read) + if self.content_type in ("application/xml", "text/xml"): + rtype = fileutil.guess_mimetype_read(self.get_content) if rtype is not None: # XXX side effect - ctype = self.content_type = rtype - if ctype not in self.ContentMimetypes: - log.debug(LOG_CHECK, "URL with content type %r is not parseable", ctype) + self.content_type = rtype + if self.content_type not in self.ContentMimetypes: + log.debug(LOG_CHECK, "URL with content type %r is not parseable", self.content_type) return False return True diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index fe2a7399..fb8d8b28 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -193,7 +193,7 @@ class UrlBase (object): # flag if content should be checked or not self.do_check_content = True # MIME content type - self.content_type = None + self.content_type = u"" def set_result (self, msg, valid=True, overwrite=False): """ @@ -247,7 +247,7 @@ class UrlBase (object): """Return True iff content is valid and of the given type.""" if not self.valid: return False - mime = self.get_content_type() + mime = self.content_type return self.ContentMimetypes.get(mime) == ctype def is_http (self): @@ -429,6 +429,7 @@ class UrlBase (object): log.debug(LOG_CHECK, "checking connection") try: self.check_connection() + self.set_content_type() self.add_size_info() self.aggregate.plugin_manager.run_connection_plugins(self) except tuple(ExcList) as exc: @@ -600,12 +601,10 @@ class UrlBase (object): if not self.has_result: self.set_result(_("filtered")) - def get_content_type (self): - """Return content MIME type or empty string. + def set_content_type (self): + """Set content MIME type. Should be overridden in subclasses.""" - if self.content_type is None: - self.content_type = u"" - return self.content_type + pass def can_get_content (self): """Indicate wether url get_content() can be called.""" @@ -794,7 +793,7 @@ class UrlBase (object): line=self.line, column=self.column, cache_key=self.cache_key, - content_type=self.get_content_type(), + content_type=self.content_type, level=self.recursion_level, modified=self.modified, ) diff --git a/linkcheck/parser/__init__.py b/linkcheck/parser/__init__.py index 39575c34..fff2a9c2 100644 --- a/linkcheck/parser/__init__.py +++ b/linkcheck/parser/__init__.py @@ -31,7 +31,7 @@ def parse_url(url_data): if url_data.is_file() and firefox.has_sqlite and firefox.extension.search(url_data.url): return parse_firefox(url_data) # determine parse routine according to content types - mime = url_data.get_content_type() + mime = url_data.content_type key = url_data.ContentMimetypes[mime] return globals()["parse_"+key](url_data) diff --git a/tests/checker/test_base.py b/tests/checker/test_base.py index 0020c7ef..7e9f75ad 100644 --- a/tests/checker/test_base.py +++ b/tests/checker/test_base.py @@ -1,5 +1,5 @@ # -*- coding: iso-8859-1 -*- -# Copyright (C) 2004-2010 Bastian Kleineidam +# Copyright (C) 2004-2014 Bastian Kleineidam # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by