mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-15 11:51:02 +00:00
Improve content type setting.
This commit is contained in:
parent
b96e3d08c1
commit
fab2c2da98
6 changed files with 28 additions and 40 deletions
|
|
@ -242,21 +242,18 @@ class FileUrl (urlbase.UrlBase):
|
|||
return True
|
||||
if firefox.has_sqlite and firefox.extension.search(self.url):
|
||||
return True
|
||||
ctype = self.get_content_type()
|
||||
if ctype in self.ContentMimetypes:
|
||||
if self.content_type in self.ContentMimetypes:
|
||||
return True
|
||||
log.debug(LOG_CHECK, "File with content type %r is not parseable.", ctype)
|
||||
log.debug(LOG_CHECK, "File with content type %r is not parseable.", self.content_type)
|
||||
return False
|
||||
|
||||
def get_content_type (self):
|
||||
def set_content_type (self):
|
||||
"""Return URL content type, or an empty string if content
|
||||
type could not be found."""
|
||||
if self.content_type is None:
|
||||
if self.url:
|
||||
self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content)
|
||||
else:
|
||||
self.content_type = u""
|
||||
return self.content_type
|
||||
if self.url:
|
||||
self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content)
|
||||
else:
|
||||
self.content_type = u""
|
||||
|
||||
def get_intern_pattern (self, url=None):
|
||||
"""Get pattern for intern URL matching.
|
||||
|
|
|
|||
|
|
@ -165,10 +165,9 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
"""See if URL target is parseable for recursion."""
|
||||
if self.is_directory():
|
||||
return True
|
||||
ctype = self.get_content_type()
|
||||
if ctype in self.ContentMimetypes:
|
||||
if self.content_type in self.ContentMimetypes:
|
||||
return True
|
||||
log.debug(LOG_CHECK, "URL with content type %r is not parseable.", ctype)
|
||||
log.debug(LOG_CHECK, "URL with content type %r is not parseable.", self.content_type)
|
||||
return False
|
||||
|
||||
def is_directory (self):
|
||||
|
|
@ -177,12 +176,10 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
path = self.urlparts[2]
|
||||
return (not path) or path.endswith('/')
|
||||
|
||||
def get_content_type (self):
|
||||
"""Return URL content type, or an empty string if content
|
||||
def set_content_type (self):
|
||||
"""Set URL content type, or an empty string if content
|
||||
type could not be found."""
|
||||
if self.content_type is None:
|
||||
self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content)
|
||||
return self.content_type
|
||||
self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content)
|
||||
|
||||
def read_content (self):
|
||||
"""Return URL target content, or in case of directories a dummy HTML
|
||||
|
|
|
|||
|
|
@ -153,11 +153,9 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
if _user is not None and _password is not None:
|
||||
self.auth = (_user, _password)
|
||||
|
||||
def get_content_type (self):
|
||||
def set_content_type (self):
|
||||
"""Return content MIME type or empty string."""
|
||||
if not self.content_type:
|
||||
self.content_type = headers.get_content_type(self.headers)
|
||||
return self.content_type
|
||||
self.content_type = headers.get_content_type(self.headers)
|
||||
|
||||
def follow_redirections(self, request):
|
||||
"""Follow all redirections of http response."""
|
||||
|
|
@ -226,17 +224,14 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
"""
|
||||
if not self.valid:
|
||||
return False
|
||||
ctype = self.get_content_type()
|
||||
# some content types must be validated with the page content
|
||||
if ctype in ("application/xml", "text/xml"):
|
||||
data = self.get_content()
|
||||
io = StringIO(data)
|
||||
rtype = fileutil.guess_mimetype_read(io.read)
|
||||
if self.content_type in ("application/xml", "text/xml"):
|
||||
rtype = fileutil.guess_mimetype_read(self.get_content)
|
||||
if rtype is not None:
|
||||
# XXX side effect
|
||||
ctype = self.content_type = rtype
|
||||
if ctype not in self.ContentMimetypes:
|
||||
log.debug(LOG_CHECK, "URL with content type %r is not parseable", ctype)
|
||||
self.content_type = rtype
|
||||
if self.content_type not in self.ContentMimetypes:
|
||||
log.debug(LOG_CHECK, "URL with content type %r is not parseable", self.content_type)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
|
|
|||
|
|
@ -193,7 +193,7 @@ class UrlBase (object):
|
|||
# flag if content should be checked or not
|
||||
self.do_check_content = True
|
||||
# MIME content type
|
||||
self.content_type = None
|
||||
self.content_type = u""
|
||||
|
||||
def set_result (self, msg, valid=True, overwrite=False):
|
||||
"""
|
||||
|
|
@ -247,7 +247,7 @@ class UrlBase (object):
|
|||
"""Return True iff content is valid and of the given type."""
|
||||
if not self.valid:
|
||||
return False
|
||||
mime = self.get_content_type()
|
||||
mime = self.content_type
|
||||
return self.ContentMimetypes.get(mime) == ctype
|
||||
|
||||
def is_http (self):
|
||||
|
|
@ -429,6 +429,7 @@ class UrlBase (object):
|
|||
log.debug(LOG_CHECK, "checking connection")
|
||||
try:
|
||||
self.check_connection()
|
||||
self.set_content_type()
|
||||
self.add_size_info()
|
||||
self.aggregate.plugin_manager.run_connection_plugins(self)
|
||||
except tuple(ExcList) as exc:
|
||||
|
|
@ -600,12 +601,10 @@ class UrlBase (object):
|
|||
if not self.has_result:
|
||||
self.set_result(_("filtered"))
|
||||
|
||||
def get_content_type (self):
|
||||
"""Return content MIME type or empty string.
|
||||
def set_content_type (self):
|
||||
"""Set content MIME type.
|
||||
Should be overridden in subclasses."""
|
||||
if self.content_type is None:
|
||||
self.content_type = u""
|
||||
return self.content_type
|
||||
pass
|
||||
|
||||
def can_get_content (self):
|
||||
"""Indicate whether url get_content() can be called."""
|
||||
|
|
@ -794,7 +793,7 @@ class UrlBase (object):
|
|||
line=self.line,
|
||||
column=self.column,
|
||||
cache_key=self.cache_key,
|
||||
content_type=self.get_content_type(),
|
||||
content_type=self.content_type,
|
||||
level=self.recursion_level,
|
||||
modified=self.modified,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ def parse_url(url_data):
|
|||
if url_data.is_file() and firefox.has_sqlite and firefox.extension.search(url_data.url):
|
||||
return parse_firefox(url_data)
|
||||
# determine parse routine according to content types
|
||||
mime = url_data.get_content_type()
|
||||
mime = url_data.content_type
|
||||
key = url_data.ContentMimetypes[mime]
|
||||
return globals()["parse_"+key](url_data)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2004-2010 Bastian Kleineidam
|
||||
# Copyright (C) 2004-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
Loading…
Reference in a new issue