mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-15 03:41:01 +00:00
split off header methods in separate file
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2381 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
ca49e04730
commit
b3c3c3eaaa
2 changed files with 110 additions and 87 deletions
100
linkcheck/checker/httpheaders.py
Normal file
100
linkcheck/checker/httpheaders.py
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2005 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
"""
|
||||
Helper functions dealing with HTTP headers.
|
||||
"""
|
||||
|
||||
DEFAULT_TIMEOUT_SECS = 300
|
||||
|
||||
def has_header_value (headers, name, value):
    """
    Tell whether a header with the given name and value is present.

    Header names and values are both compared case insensitively.

    @param headers: iterable of (name, value) string pairs
    @type headers: iterable of 2-tuples
    @param name: header name to search for
    @type name: string
    @param value: header value that must belong to that name
    @type value: string
    @return: True if header name and value are found
    @rtype: bool
    """
    wanted_name = name.lower()
    wanted_value = value.lower()
    # The value is only lowercased for entries whose name already matched.
    return any(
        header_name.lower() == wanted_name and
        header_value.lower() == wanted_value
        for header_name, header_value in headers
    )
|
||||
|
||||
|
||||
def http_persistent (response):
    """
    See if the HTTP connection can be kept open according to the
    header values found in the response object.

    HTTP/1.1 connections are persistent by default and are only closed
    when the server sends "Connection: close". Older protocol versions
    are only persistent when the server explicitly sends
    "Connection: Keep-Alive".

    @param response: response instance
    @type response: httplib.HTTPResponse
    @return: True if connection is persistent
    @rtype: bool
    """
    headers = response.getheaders()
    if response.version == 11:
        # Persistent unless the server explicitly asks to close.
        return not has_header_value(headers, 'Connection', 'Close')
    # HTTP/1.0 and older: persistence must be requested explicitly.
    return has_header_value(headers, "Connection", "Keep-Alive")
|
||||
|
||||
|
||||
def http_timeout (response):
    """
    Get HTTP timeout value, either from the Keep-Alive header or a
    default value.

    The Keep-Alive header is a comma separated list of parameters, for
    example "timeout=5, max=100". The value of the "timeout" parameter
    is returned; the parameter name is matched case insensitively and
    may appear at any position in the list. If the header is missing,
    has no timeout parameter, or the timeout value is not an integer,
    DEFAULT_TIMEOUT_SECS is returned.

    @param response: response instance
    @type response: httplib.HTTPResponse
    @return: timeout
    @rtype: int
    """
    header = response.getheader("Keep-Alive")
    if header is None:
        return DEFAULT_TIMEOUT_SECS
    for param in header.split(","):
        parts = param.split("=", 1)
        if len(parts) != 2:
            # parameter without a value
            continue
        if parts[0].strip().lower() != "timeout":
            continue
        try:
            return int(parts[1].strip())
        except ValueError:
            # unparseable timeout value, use the default
            break
    return DEFAULT_TIMEOUT_SECS
|
||||
|
||||
|
||||
def get_content_type (headers):
    """
    Extract the bare MIME type from the Content-Type header. When the
    header is missing, 'application/octet-stream' is used instead.

    @param headers: mapping-like header object providing get()
    @return: MIME type
    @rtype: string
    """
    raw_value = headers.get('Content-Type', 'application/octet-stream')
    # Everything after the first semicolon is extension info (for
    # example a charset parameter) that is not part of the MIME type.
    return raw_value.split(';', 1)[0].strip()
|
||||
|
||||
|
||||
def get_content_encoding (headers):
    """
    Read the Content-Encoding header value with surrounding whitespace
    removed. A missing header yields an empty string.

    @param headers: mapping-like header object providing get()
    @return: encoding string
    @rtype: string
    """
    raw_encoding = headers.get("Content-Encoding", "")
    return raw_encoding.strip()
|
||||
|
||||
|
|
@ -32,6 +32,7 @@ import linkcheck.url
|
|||
import linkcheck.strformat
|
||||
import linkcheck.robotparser2
|
||||
import linkcheck.httplib2
|
||||
import linkcheck.checker.httpheaders as headers
|
||||
import urlbase
|
||||
import proxysupport
|
||||
|
||||
|
|
@ -43,61 +44,6 @@ _supported_encodings = ('gzip', 'x-gzip', 'deflate')
|
|||
# Amazon blocks all HEAD requests
|
||||
_is_amazon = re.compile(r'^www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').search
|
||||
|
||||
DEFAULT_TIMEOUT_SECS = 300
|
||||
|
||||
|
||||
def has_header_value (headers, name, value):
|
||||
"""
|
||||
Look in headers for a specific header name and value.
|
||||
Both name and value are case insensitive.
|
||||
|
||||
@return: True if header name and value are found
|
||||
@rtype: bool
|
||||
"""
|
||||
name = name.lower()
|
||||
value = value.lower()
|
||||
for hname, hvalue in headers:
|
||||
if hname.lower()==name and hvalue.lower()==value:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def http_persistent (response):
|
||||
"""
|
||||
See if the HTTP connection can be kept open according the the
|
||||
header values found in the response object.
|
||||
|
||||
@param response: response instance
|
||||
@type response: httplib.HTTPResponse
|
||||
@return: True if connection is persistent
|
||||
@rtype: bool
|
||||
"""
|
||||
headers = response.getheaders()
|
||||
if response.version == 11:
|
||||
return has_header_value(headers, 'Connection', 'Close')
|
||||
return has_header_value(headers, "Connection", "Keep-Alive")
|
||||
|
||||
|
||||
def http_timeout (response):
|
||||
"""
|
||||
Get HTTP timeout value, either from the Keep-Alive header or a
|
||||
default value.
|
||||
|
||||
@param response: response instance
|
||||
@type response: httplib.HTTPResponse
|
||||
@return: timeout
|
||||
@rtype: int
|
||||
"""
|
||||
timeout = response.getheader("Keep-Alive")
|
||||
if timeout is not None:
|
||||
try:
|
||||
timeout = int(timeout[8:].strip())
|
||||
except ValueError, msg:
|
||||
timeout = DEFAULT_TIMEOUT_SECS
|
||||
else:
|
||||
timeout = DEFAULT_TIMEOUT_SECS
|
||||
return timeout
|
||||
|
||||
|
||||
class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
||||
"""
|
||||
|
|
@ -486,8 +432,8 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
self.url_connection.putheader("Cookie", c)
|
||||
self.url_connection.endheaders()
|
||||
response = self.url_connection.getresponse()
|
||||
self.persistent = http_persistent(response)
|
||||
self.timeout = http_timeout(response)
|
||||
self.persistent = headers.http_persistent(response)
|
||||
self.timeout = headers.http_timeout(response)
|
||||
return response
|
||||
|
||||
def get_http_object (self, host, scheme):
|
||||
|
|
@ -535,7 +481,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
response = self._get_http_response()
|
||||
self.headers = response.msg
|
||||
self.data = response.read()
|
||||
encoding = self.get_content_encoding()
|
||||
encoding = headers.get_content_encoding(self.headers)
|
||||
if encoding in _supported_encodings:
|
||||
try:
|
||||
if encoding == 'deflate':
|
||||
|
|
@ -561,9 +507,9 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
"""
|
||||
if not (self.valid and self.headers):
|
||||
return False
|
||||
if self.headers.gettype()[:9] != "text/html":
|
||||
if headers.get_content_type(self.headers) != "text/html":
|
||||
return False
|
||||
encoding = self.get_content_encoding()
|
||||
encoding = headers.get_content_encoding(self.headers)
|
||||
if encoding and encoding not in _supported_encodings and \
|
||||
encoding != 'identity':
|
||||
self.add_warning(_('Unsupported content encoding %r.') % encoding)
|
||||
|
|
@ -579,30 +525,6 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
"""
|
||||
return True
|
||||
|
||||
def get_content_type (self):
|
||||
"""
|
||||
Get the MIME type from the Content-Type header value, or
|
||||
'application/octet-stream' if not found.
|
||||
|
||||
@return: MIME type
|
||||
@rtype: string
|
||||
"""
|
||||
ptype = self.headers.get('Content-Type', 'application/octet-stream')
|
||||
if ";" in ptype:
|
||||
# split off not needed extension info
|
||||
ptype = ptype.split(';')[0]
|
||||
return ptype.strip()
|
||||
|
||||
def get_content_encoding (self):
|
||||
"""
|
||||
Get the content encoding from the Content-Encoding header value, or
|
||||
an empty string if not found.
|
||||
|
||||
@return: encoding string
|
||||
@rtype: string
|
||||
"""
|
||||
return self.headers.get("Content-Encoding", "").strip()
|
||||
|
||||
def is_parseable (self):
|
||||
"""
|
||||
Check if content is parseable for recursion.
|
||||
|
|
@ -612,9 +534,10 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
"""
|
||||
if not (self.valid and self.headers):
|
||||
return False
|
||||
if self.get_content_type() not in ("text/html", "text/css"):
|
||||
if headers.get_content_type(self.headers) not in \
|
||||
("text/html", "text/css"):
|
||||
return False
|
||||
encoding = self.get_content_encoding()
|
||||
encoding = headers.get_content_encoding(self.headers)
|
||||
if encoding and encoding not in _supported_encodings and \
|
||||
encoding != 'identity':
|
||||
self.add_warning(_('Unsupported content encoding %r.') % encoding)
|
||||
|
|
@ -625,7 +548,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
"""
|
||||
Parse file contents for new links to check.
|
||||
"""
|
||||
ptype = self.get_content_type()
|
||||
ptype = headers.get_content_type(self.headers)
|
||||
if ptype == "text/html":
|
||||
self.parse_html()
|
||||
elif ptype == "text/css":
|
||||
|
|
|
|||
Loading…
Reference in a new issue