mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-16 18:41:07 +00:00
Ignore an encoding of ISO-8859-1 returned by Requests
ISO-8859-1 is Requests' fallback encoding for text content types that declare
no charset, so trusting it causes us to mangle UTF-8 content.
Requests' utils.py:
def get_encoding_from_headers(headers):
"""Returns encodings from given HTTP Header Dict.
:param headers: dictionary to extract encoding from.
:rtype: str
"""
content_type = headers.get('content-type')
if not content_type:
return None
content_type, params = _parse_content_type_header(content_type)
if 'charset' in params:
return params['charset'].strip("'\"")
if 'text' in content_type:
return 'ISO-8859-1'
if 'application/json' in content_type:
# Assume UTF-8 based on RFC 4627: https://www.ietf.org/rfc/rfc4627.txt since the charset was unset
return 'utf-8'
This commit is contained in:
parent
a78c78a803
commit
c89c617a58
1 changed file with 8 additions and 1 deletion
|
|
@@ -177,7 +177,14 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
log.debug(LOG_CHECK, "Request headers %s", request.headers)
|
||||
self.url_connection = self.session.send(request, **kwargs)
|
||||
self.headers = self.url_connection.headers
|
||||
self.encoding = self.url_connection.encoding
|
||||
log.debug(LOG_CHECK, "Response headers %s", self.headers)
|
||||
if self.url_connection.encoding == "ISO-8859-1":
|
||||
# Can't trust ISO-8859-1 because it is Requests' fallback for text
|
||||
# content-types. We fall back to it in UrlBase.get_content() if
|
||||
# Beautiful Soup doesn't return an encoding.
|
||||
self.encoding = None
|
||||
else:
|
||||
self.encoding = self.url_connection.encoding
|
||||
log.debug(LOG_CHECK, "Response encoding %s", self.encoding)
|
||||
self._add_ssl_info()
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue