From a04214465a9952334ddce9eec0e1732ff40ecfac Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Mon, 6 Dec 2021 19:34:31 +0000 Subject: [PATCH] Update HttpUrl.encoding after following redirects --- linkcheck/checker/httpurl.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index 0b0c6c09..19ecf8f1 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -176,13 +176,7 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): self.url_connection = self.session.send(request, **kwargs) self.headers = self.url_connection.headers log.debug(LOG_CHECK, "Response headers %s", self.headers) - if self.url_connection.encoding == "ISO-8859-1": - # Can't trust ISO-8859-1 because it is Requests' fallback for text - # content-types. We fall back to it in UrlBase.get_content() if - # Beautiful Soup doesn't return an encoding. - self.encoding = None - else: - self.encoding = self.url_connection.encoding + self.set_encoding(self.url_connection.encoding) log.debug(LOG_CHECK, "Response encoding %s", self.encoding) self._add_ssl_info() @@ -236,6 +230,19 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): """Return content MIME type or empty string.""" self.content_type = httputil.get_content_type(self.headers) + def set_encoding(self, encoding): + """Set content encoding""" + if encoding == "ISO-8859-1": + # Although RFC 2616 (HTTP/1.1) says that text data in a non-ISO-8859-1 + # (or subset) character set must be labelled with a charset, + # that is not always the case and then the default ISO-8859-1 is + # set by Requests. + # We fall back to it in UrlBase.get_content() if Beautiful Soup + # doesn't return an encoding. + self.encoding = None + else: + self.encoding = encoding + def is_redirect(self): """Check if current response is a redirect.""" return ( @@ -286,6 +293,10 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): if self.is_redirect(): # run connection plugins for old connection self.aggregate.plugin_manager.run_connection_plugins(self) + if response: + log.debug(LOG_CHECK, "Redirected response headers %s", response.headers) + self.set_encoding(response.encoding) + log.debug(LOG_CHECK, "Redirected response encoding %s", self.encoding) def check_response(self): """Check final result and log it."""