mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-22 07:04:44 +00:00
Don't use encoding detection since it's very slow.
This commit is contained in:
parent
8cf84be2e2
commit
fa26876f67
3 changed files with 101 additions and 120 deletions
|
|
@ -1,113 +0,0 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2005-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
Helper functions dealing with HTTP headers.
|
||||
"""
|
||||
|
||||
# Fallback value returned by http_keepalive() (documented there as seconds)
# when the Keep-Alive header is missing or cannot be parsed.
DEFAULT_KEEPALIVE = 300
|
||||
|
||||
# 8 KiB. NOTE(review): presumably an upper bound for HTTP header data; it is
# not referenced by any code visible here -- confirm against callers.
MAX_HEADER_BYTES = 8*1024
|
||||
|
||||
def has_header_value (headers, name, value):
    """
    Tell whether a header with the given name and value is present.
    Header names and values are compared case insensitively.

    @param headers: iterable of (name, value) pairs
    @return: True if a header with matching name and value is found
    @rtype: bool
    """
    wanted_name = name.lower()
    wanted_value = value.lower()
    # the value is only inspected for headers whose name already matched
    return any(
        key.lower() == wanted_name and val.lower() == wanted_value
        for key, val in headers
    )
|
||||
|
||||
|
||||
def _connection_tokens (headers):
    """
    Collect the lowercased option tokens of all Connection headers.

    @param headers: iterable of (name, value) pairs
    @return: set of lowercased, whitespace-stripped tokens
    @rtype: set
    """
    tokens = set()
    for hname, hvalue in headers:
        if hname.lower() == 'connection':
            # the Connection header holds a comma-separated token list,
            # e.g. "keep-alive, Upgrade"
            tokens.update(tok.strip().lower() for tok in hvalue.split(','))
    return tokens


def http_persistent (response):
    """
    See if the HTTP connection can be kept open according to the
    header values found in the response object.

    For version 11 responses the connection is persistent unless a
    Connection header contains the "close" token; for all other versions
    it is persistent only if a Connection header contains "keep-alive".
    Tokens are matched case insensitively and may be part of a
    comma-separated list.

    @param response: response instance
    @type response: httplib.HTTPResponse
    @return: True if connection is persistent
    @rtype: bool
    """
    tokens = _connection_tokens(response.getheaders())
    if response.version == 11:
        return 'close' not in tokens
    return 'keep-alive' in tokens
|
||||
|
||||
|
||||
def http_keepalive (headers):
    """
    Get HTTP keepalive value, either from the "timeout" parameter of the
    Keep-Alive header or a default value.

    The header value is parsed as a comma-separated list of key=value
    parameters (for example "timeout=5, max=100"); parameter names are
    matched case insensitively and surrounding whitespace is ignored.
    DEFAULT_KEEPALIVE is returned when the header is missing, has no
    timeout parameter, or the timeout is not an integer.

    @param headers: HTTP headers
    @type headers: dict
    @return: keepalive in seconds
    @rtype: int
    """
    keepalive = headers.get("Keep-Alive")
    if keepalive is None:
        return DEFAULT_KEEPALIVE
    for param in keepalive.split(','):
        key, sep, val = param.partition('=')
        if sep and key.strip().lower() == 'timeout':
            try:
                return int(val.strip())
            except (ValueError, OverflowError):
                # unparseable timeout: fall back to the default
                break
    return DEFAULT_KEEPALIVE
|
||||
|
||||
|
||||
def get_content_type (headers):
    """
    Extract the MIME type from the Content-Type header. If the header
    is missing, 'application/octet-stream' is used.

    @return: MIME type, lowercased and without surrounding whitespace
    @rtype: string
    """
    raw = headers.get('Content-Type', 'application/octet-stream')
    # everything from the first ';' on is extension info and not needed
    mime = raw.partition(';')[0]
    return mime.strip().lower()
|
||||
|
||||
|
||||
def get_charset(headers):
    """
    Determine the charset encoding from the Content-Type header value.
    A missing header is treated like an empty value. The actual parsing
    is delegated to linkcheck.HtmlParser.get_ctype_charset.

    @return: the content charset encoding, or None if not found
    @rtype: string or None
    """
    # imported at call time rather than at module level
    from linkcheck.HtmlParser import get_ctype_charset
    content_type = headers.get('Content-Type', '')
    return get_ctype_charset(content_type)
|
||||
|
||||
|
||||
def get_content_encoding (headers):
    """
    Read the Content-Encoding header value. A missing header yields
    an empty string.

    @return: encoding string without surrounding whitespace
    @rtype: string
    """
    encoding = headers.get("Content-Encoding", "")
    return encoding.strip()
|
||||
|
|
@ -22,11 +22,10 @@ import requests
|
|||
from cStringIO import StringIO
|
||||
|
||||
from .. import (log, LOG_CHECK, strformat, fileutil,
|
||||
url as urlutil, LinkCheckerError)
|
||||
from . import (internpaturl, proxysupport, httpheaders as headers)
|
||||
url as urlutil, LinkCheckerError, httputil)
|
||||
from . import (internpaturl, proxysupport)
|
||||
from ..HtmlParser import htmlsax
|
||||
from ..htmlutil import linkparse
|
||||
from ..httputil import x509_to_dict
|
||||
# import warnings
|
||||
from .const import WARN_HTTP_EMPTY_CONTENT
|
||||
from requests.sessions import REDIRECT_STATI
|
||||
|
|
@ -182,7 +181,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
|
||||
def _add_response_info(self):
|
||||
"""Set info from established HTTP(S) connection."""
|
||||
self.charset = self.url_connection.apparent_encoding
|
||||
self.charset = httputil.get_charset(self.headers)
|
||||
self.set_content_type()
|
||||
self.add_size_info()
|
||||
|
||||
|
|
@ -205,7 +204,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
else:
|
||||
# using pyopenssl
|
||||
cert = sock.connection.get_peer_certificate()
|
||||
self.ssl_cert = x509_to_dict(cert)
|
||||
self.ssl_cert = httputil.x509_to_dict(cert)
|
||||
log.debug(LOG_CHECK, "Got SSL certificate %s", self.ssl_cert)
|
||||
else:
|
||||
self.ssl_cert = None
|
||||
|
|
@ -222,7 +221,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
|
||||
def set_content_type (self):
|
||||
"""Return content MIME type or empty string."""
|
||||
self.content_type = headers.get_content_type(self.headers)
|
||||
self.content_type = httputil.get_content_type(self.headers)
|
||||
|
||||
def is_redirect(self):
|
||||
"""Check if current response is a redirect."""
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2014 Bastian Kleineidam
|
||||
# Copyright (C) 2005-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
@ -18,6 +18,11 @@ import base64
|
|||
from datetime import datetime
|
||||
|
||||
|
||||
# Fallback value returned by http_keepalive() (documented there as seconds)
# when the Keep-Alive header is missing or cannot be parsed.
DEFAULT_KEEPALIVE = 300
|
||||
|
||||
# 8 KiB. NOTE(review): presumably an upper bound for HTTP header data; it is
# not referenced by any code visible here -- confirm against callers.
MAX_HEADER_BYTES = 8*1024
|
||||
|
||||
|
||||
def encode_base64 (s):
    """Return the base64 encoding of the given string; the result of
    base64.b64encode() carries no trailing newline."""
    encoded = base64.b64encode(s)
    return encoded
|
||||
|
|
@ -67,3 +72,93 @@ def asn1_generaltime_to_seconds(timestr):
|
|||
except ValueError:
|
||||
pass
|
||||
return res
|
||||
|
||||
def has_header_value (headers, name, value):
    """
    Tell whether a header with the given name and value is present.
    Header names and values are compared case insensitively.

    @param headers: iterable of (name, value) pairs
    @return: True if a header with matching name and value is found
    @rtype: bool
    """
    wanted_name = name.lower()
    wanted_value = value.lower()
    # the value is only inspected for headers whose name already matched
    return any(
        key.lower() == wanted_name and val.lower() == wanted_value
        for key, val in headers
    )
|
||||
|
||||
|
||||
def _connection_tokens (headers):
    """
    Collect the lowercased option tokens of all Connection headers.

    @param headers: iterable of (name, value) pairs
    @return: set of lowercased, whitespace-stripped tokens
    @rtype: set
    """
    tokens = set()
    for hname, hvalue in headers:
        if hname.lower() == 'connection':
            # the Connection header holds a comma-separated token list,
            # e.g. "keep-alive, Upgrade"
            tokens.update(tok.strip().lower() for tok in hvalue.split(','))
    return tokens


def http_persistent (response):
    """
    See if the HTTP connection can be kept open according to the
    header values found in the response object.

    For version 11 responses the connection is persistent unless a
    Connection header contains the "close" token; for all other versions
    it is persistent only if a Connection header contains "keep-alive".
    Tokens are matched case insensitively and may be part of a
    comma-separated list.

    @param response: response instance
    @type response: httplib.HTTPResponse
    @return: True if connection is persistent
    @rtype: bool
    """
    tokens = _connection_tokens(response.getheaders())
    if response.version == 11:
        return 'close' not in tokens
    return 'keep-alive' in tokens
|
||||
|
||||
|
||||
def http_keepalive (headers):
    """
    Get HTTP keepalive value, either from the "timeout" parameter of the
    Keep-Alive header or a default value.

    The header value is parsed as a comma-separated list of key=value
    parameters (for example "timeout=5, max=100"); parameter names are
    matched case insensitively and surrounding whitespace is ignored.
    DEFAULT_KEEPALIVE is returned when the header is missing, has no
    timeout parameter, or the timeout is not an integer.

    @param headers: HTTP headers
    @type headers: dict
    @return: keepalive in seconds
    @rtype: int
    """
    keepalive = headers.get("Keep-Alive")
    if keepalive is None:
        return DEFAULT_KEEPALIVE
    for param in keepalive.split(','):
        key, sep, val = param.partition('=')
        if sep and key.strip().lower() == 'timeout':
            try:
                return int(val.strip())
            except (ValueError, OverflowError):
                # unparseable timeout: fall back to the default
                break
    return DEFAULT_KEEPALIVE
|
||||
|
||||
|
||||
def get_content_type (headers):
    """
    Extract the MIME type from the Content-Type header. If the header
    is missing, 'application/octet-stream' is used.

    @return: MIME type, lowercased and without surrounding whitespace
    @rtype: string
    """
    raw = headers.get('Content-Type', 'application/octet-stream')
    # everything from the first ';' on is extension info and not needed
    mime = raw.partition(';')[0]
    return mime.strip().lower()
|
||||
|
||||
|
||||
def get_charset(headers):
    """
    Determine the charset encoding from the Content-Type header value.
    A missing header is treated like an empty value. The actual parsing
    is delegated to linkcheck.HtmlParser.get_ctype_charset.

    @return: the content charset encoding, or None if not found
    @rtype: string or None
    """
    # imported at call time rather than at module level
    from linkcheck.HtmlParser import get_ctype_charset
    content_type = headers.get('Content-Type', '')
    return get_ctype_charset(content_type)
|
||||
|
||||
|
||||
def get_content_encoding (headers):
    """
    Read the Content-Encoding header value. A missing header yields
    an empty string.

    @return: encoding string without surrounding whitespace
    @rtype: string
    """
    encoding = headers.get("Content-Encoding", "")
    return encoding.strip()
|
||||
|
|
|
|||
Loading…
Reference in a new issue