Ignore an encoding of ISO-8859-1 returned by Requests

Requests falls back to ISO-8859-1 for any text content type that does not
declare a charset, so trusting that value causes us to mangle UTF-8 content.

Requests' utils.py:

def get_encoding_from_headers(headers):
    """Return the encoding implied by the Content-Type of an HTTP header dict.

    An explicit ``charset`` parameter always wins. Without one, a content
    type containing ``text`` yields ``'ISO-8859-1'`` and one containing
    ``application/json`` yields ``'utf-8'``.

    :param headers: dictionary to extract encoding from.
    :return: the encoding name, or ``None`` when the ``content-type`` header
        is missing or empty, or when none of the rules above match.
    :rtype: str or None
    """

    content_type = headers.get('content-type')

    # No (or empty) Content-Type header: nothing to derive an encoding from.
    if not content_type:
        return None

    # Split the header value into the media type and its parameters.
    content_type, params = _parse_content_type_header(content_type)

    # A declared charset is returned with any surrounding quotes removed.
    if 'charset' in params:
        return params['charset'].strip("'\"")

    # Fallback, not something the server declared: no charset was given, so
    # a caller cannot distinguish this from a real ISO-8859-1 declaration.
    if 'text' in content_type:
        return 'ISO-8859-1'

    if 'application/json' in content_type:
        # The charset was unset, so assume UTF-8 based on RFC 4627:
        # https://www.ietf.org/rfc/rfc4627.txt
        return 'utf-8'
This commit is contained in:
Chris Mayo 2021-11-29 19:52:37 +00:00
parent a78c78a803
commit c89c617a58

View file

@ -177,7 +177,14 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
log.debug(LOG_CHECK, "Request headers %s", request.headers)
self.url_connection = self.session.send(request, **kwargs)
self.headers = self.url_connection.headers
self.encoding = self.url_connection.encoding
log.debug(LOG_CHECK, "Response headers %s", self.headers)
if self.url_connection.encoding == "ISO-8859-1":
# Can't trust ISO-8859-1 because it is Requests' fallback for text
# content-types. We fall back to it in UrlBase.get_content() if
# Beautiful Soup doesn't return an encoding.
self.encoding = None
else:
self.encoding = self.url_connection.encoding
log.debug(LOG_CHECK, "Response encoding %s", self.encoding)
self._add_ssl_info()