Don't use encoding detection since it's very slow.

This commit is contained in:
Bastian Kleineidam 2014-03-27 12:27:11 +01:00
parent 8cf84be2e2
commit fa26876f67
3 changed files with 101 additions and 120 deletions

View file

@ -1,113 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2005-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Helper functions dealing with HTTP headers.
"""
# Keepalive in seconds that http_keepalive() falls back to when the
# Keep-Alive header is missing or cannot be parsed.
DEFAULT_KEEPALIVE = 300
# 8 KiB. Judging by the name this is a size limit for HTTP headers; it is
# not referenced in the code shown here -- TODO confirm against callers.
MAX_HEADER_BYTES = 8 * 1024
def has_header_value (headers, name, value):
    """
    Check whether a header with the given name and value exists.
    Names and values are compared after lowercasing both sides.

    @param headers: iterable of (header name, header value) pairs
    @param name: header name to search for
    @param value: header value to search for
    @return: True if a pair matching both name and value is found
    @rtype: bool
    """
    wanted_name = name.lower()
    wanted_value = value.lower()
    # The value is only inspected when the name already matched.
    return any(
        key.lower() == wanted_name and val.lower() == wanted_value
        for key, val in headers)
def http_persistent (response):
    """
    Decide from the response headers whether the HTTP connection can
    be kept open.

    @param response: response instance
    @type response: httplib.HTTPResponse
    @return: True if connection is persistent
    @rtype: bool
    """
    header_list = response.getheaders()
    if response.version != 11:
        # Any other protocol version: persistent only if the response
        # explicitly carries "Connection: Keep-Alive".
        return has_header_value(header_list, "Connection", "Keep-Alive")
    # Version 11: persistent unless the response carries
    # "Connection: Close".
    return not has_header_value(header_list, 'Connection', 'Close')
def http_keepalive (headers):
    """
    Get HTTP keepalive value, either from the Keep-Alive header or a
    default value.

    The header value is treated as a comma separated list of
    name=value parameters, for example "timeout=5, max=100". The value
    of the first parameter named "timeout" (case insensitive) is used.
    If the header is missing, has no timeout parameter, or the timeout
    is not an integer, DEFAULT_KEEPALIVE is returned.

    @param headers: HTTP headers
    @type headers: dict
    @return: keepalive in seconds
    @rtype: int
    """
    header_value = headers.get("Keep-Alive")
    if header_value is None:
        return DEFAULT_KEEPALIVE
    for param in header_value.split(","):
        key, sep, val = param.partition("=")
        if not sep or key.strip().lower() != "timeout":
            # not a timeout parameter (e.g. "max=100")
            continue
        try:
            return int(val.strip())
        except (ValueError, OverflowError):
            # malformed timeout value, use the default
            break
    return DEFAULT_KEEPALIVE
def get_content_type (headers):
    """
    Return the lowercased MIME type given in the Content-Type header.
    Parameters following a semicolon (for example a charset) are
    dropped. If the header is missing, 'application/octet-stream' is
    used.

    @param headers: HTTP headers, an object with a dict-like get()
    @return: MIME type
    @rtype: string
    """
    raw_value = headers.get('Content-Type', 'application/octet-stream')
    # Keep only the part before the first semicolon; without a
    # semicolon this is the complete value.
    mime_type = raw_value.partition(';')[0]
    return mime_type.strip().lower()
def get_charset(headers):
    """
    Determine the charset encoding named in the Content-Type header.
    The header value (an empty string if the header is missing) is
    handed to linkcheck.HtmlParser.get_ctype_charset, whose result is
    returned unchanged.

    @param headers: HTTP headers, an object with a dict-like get()
    @return: the content charset encoding
    @rtype: string or None
    """
    # Function-level import, kept local as in the original code.
    from linkcheck.HtmlParser import get_ctype_charset
    content_type = headers.get('Content-Type', '')
    return get_ctype_charset(content_type)
def get_content_encoding (headers):
    """
    Return the Content-Encoding header value with surrounding
    whitespace removed, or an empty string if the header is missing.

    @param headers: HTTP headers, an object with a dict-like get()
    @return: encoding string
    @rtype: string
    """
    encoding = headers.get("Content-Encoding", "")
    return encoding.strip()

View file

@ -22,11 +22,10 @@ import requests
from cStringIO import StringIO
from .. import (log, LOG_CHECK, strformat, fileutil,
url as urlutil, LinkCheckerError)
from . import (internpaturl, proxysupport, httpheaders as headers)
url as urlutil, LinkCheckerError, httputil)
from . import (internpaturl, proxysupport)
from ..HtmlParser import htmlsax
from ..htmlutil import linkparse
from ..httputil import x509_to_dict
# import warnings
from .const import WARN_HTTP_EMPTY_CONTENT
from requests.sessions import REDIRECT_STATI
@ -182,7 +181,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
def _add_response_info(self):
"""Set info from established HTTP(S) connection."""
self.charset = self.url_connection.apparent_encoding
self.charset = httputil.get_charset(self.headers)
self.set_content_type()
self.add_size_info()
@ -205,7 +204,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
else:
# using pyopenssl
cert = sock.connection.get_peer_certificate()
self.ssl_cert = x509_to_dict(cert)
self.ssl_cert = httputil.x509_to_dict(cert)
log.debug(LOG_CHECK, "Got SSL certificate %s", self.ssl_cert)
else:
self.ssl_cert = None
@ -222,7 +221,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
def set_content_type (self):
"""Return content MIME type or empty string."""
self.content_type = headers.get_content_type(self.headers)
self.content_type = httputil.get_content_type(self.headers)
def is_redirect(self):
"""Check if current response is a redirect."""

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2014 Bastian Kleineidam
# Copyright (C) 2005-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -18,6 +18,11 @@ import base64
from datetime import datetime
# Keepalive in seconds that http_keepalive() falls back to when the
# Keep-Alive header is missing or cannot be parsed.
DEFAULT_KEEPALIVE = 300
# 8 KiB. Judging by the name this is a size limit for HTTP headers; it is
# not referenced in the code shown here -- TODO confirm against callers.
MAX_HEADER_BYTES = 8 * 1024
def encode_base64 (s):
    """Return the base64 encoding of the given string.

    The result is produced by base64.b64encode and therefore carries
    no trailing newline."""
    encoded = base64.b64encode(s)
    return encoded
@ -67,3 +72,93 @@ def asn1_generaltime_to_seconds(timestr):
except ValueError:
pass
return res
def has_header_value (headers, name, value):
    """
    Check whether a header with the given name and value exists.
    Names and values are compared after lowercasing both sides.

    @param headers: iterable of (header name, header value) pairs
    @param name: header name to search for
    @param value: header value to search for
    @return: True if a pair matching both name and value is found
    @rtype: bool
    """
    wanted_name = name.lower()
    wanted_value = value.lower()
    # The value is only inspected when the name already matched.
    return any(
        key.lower() == wanted_name and val.lower() == wanted_value
        for key, val in headers)
def http_persistent (response):
    """
    Decide from the response headers whether the HTTP connection can
    be kept open.

    @param response: response instance
    @type response: httplib.HTTPResponse
    @return: True if connection is persistent
    @rtype: bool
    """
    header_list = response.getheaders()
    if response.version != 11:
        # Any other protocol version: persistent only if the response
        # explicitly carries "Connection: Keep-Alive".
        return has_header_value(header_list, "Connection", "Keep-Alive")
    # Version 11: persistent unless the response carries
    # "Connection: Close".
    return not has_header_value(header_list, 'Connection', 'Close')
def http_keepalive (headers):
    """
    Get HTTP keepalive value, either from the Keep-Alive header or a
    default value.

    The header value is treated as a comma separated list of
    name=value parameters, for example "timeout=5, max=100". The value
    of the first parameter named "timeout" (case insensitive) is used.
    If the header is missing, has no timeout parameter, or the timeout
    is not an integer, DEFAULT_KEEPALIVE is returned.

    @param headers: HTTP headers
    @type headers: dict
    @return: keepalive in seconds
    @rtype: int
    """
    header_value = headers.get("Keep-Alive")
    if header_value is None:
        return DEFAULT_KEEPALIVE
    for param in header_value.split(","):
        key, sep, val = param.partition("=")
        if not sep or key.strip().lower() != "timeout":
            # not a timeout parameter (e.g. "max=100")
            continue
        try:
            return int(val.strip())
        except (ValueError, OverflowError):
            # malformed timeout value, use the default
            break
    return DEFAULT_KEEPALIVE
def get_content_type (headers):
    """
    Return the lowercased MIME type given in the Content-Type header.
    Parameters following a semicolon (for example a charset) are
    dropped. If the header is missing, 'application/octet-stream' is
    used.

    @param headers: HTTP headers, an object with a dict-like get()
    @return: MIME type
    @rtype: string
    """
    raw_value = headers.get('Content-Type', 'application/octet-stream')
    # Keep only the part before the first semicolon; without a
    # semicolon this is the complete value.
    mime_type = raw_value.partition(';')[0]
    return mime_type.strip().lower()
def get_charset(headers):
    """
    Determine the charset encoding named in the Content-Type header.
    The header value (an empty string if the header is missing) is
    handed to linkcheck.HtmlParser.get_ctype_charset, whose result is
    returned unchanged.

    @param headers: HTTP headers, an object with a dict-like get()
    @return: the content charset encoding
    @rtype: string or None
    """
    # Function-level import, kept local as in the original code.
    from linkcheck.HtmlParser import get_ctype_charset
    content_type = headers.get('Content-Type', '')
    return get_ctype_charset(content_type)
def get_content_encoding (headers):
    """
    Return the Content-Encoding header value with surrounding
    whitespace removed, or an empty string if the header is missing.

    @param headers: HTTP headers, an object with a dict-like get()
    @return: encoding string
    @rtype: string
    """
    encoding = headers.get("Content-Encoding", "")
    return encoding.strip()