From c89c617a58b610f3aadfefd9b2d7efdc26681f52 Mon Sep 17 00:00:00 2001
From: Chris Mayo
Date: Mon, 29 Nov 2021 19:52:37 +0000
Subject: [PATCH] Ignore an encoding of ISO-8859-1 returned by Requests

ISO-8859-1 is a fallback for Requests and causes us to mangle UTF-8
content.

Requests' utils.py:

def get_encoding_from_headers(headers):
    """Returns encodings from given HTTP Header Dict.

    :param headers: dictionary to extract encoding from.
    :rtype: str
    """

    content_type = headers.get('content-type')

    if not content_type:
        return None

    content_type, params = _parse_content_type_header(content_type)

    if 'charset' in params:
        return params['charset'].strip("'\"")

    if 'text' in content_type:
        return 'ISO-8859-1'

    if 'application/json' in content_type:
        # Assume UTF-8 based on RFC 4627: https://www.ietf.org/rfc/rfc4627.txt since the charset was unset
        return 'utf-8'
---
 linkcheck/checker/httpurl.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py
index 90b81b04..7b72f3f4 100644
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -177,7 +177,14 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         log.debug(LOG_CHECK, "Request headers %s", request.headers)
         self.url_connection = self.session.send(request, **kwargs)
         self.headers = self.url_connection.headers
-        self.encoding = self.url_connection.encoding
+        log.debug(LOG_CHECK, "Response headers %s", self.headers)
+        if self.url_connection.encoding == "ISO-8859-1":
+            # Can't trust ISO-8859-1 because it is Requests' fallback for text
+            # content-types. We fall back to it in UrlBase.get_content() if
+            # Beautiful Soup doesn't return an encoding.
+            self.encoding = None
+        else:
+            self.encoding = self.url_connection.encoding
         log.debug(LOG_CHECK, "Response encoding %s", self.encoding)
         self._add_ssl_info()
 