diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py
index 54469ec0..4260672a 100644
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -175,7 +175,7 @@ class HttpUrl(internpaturl.InternPatternUrl):
         self.headers = self.url_connection.headers
         log.debug(LOG_CHECK, "Response headers %s", self.headers)
         self.set_encoding(self.url_connection.encoding)
-        log.debug(LOG_CHECK, "Response encoding %s", self.encoding)
+        log.debug(LOG_CHECK, "Response encoding %s", self.content_encoding)
         self._add_ssl_info()
 
     def _add_response_info(self):
@@ -237,9 +237,9 @@ class HttpUrl(internpaturl.InternPatternUrl):
             # set by Requests.
             # We fall back to it in UrlBase.get_content() if Beautiful Soup
             # doesn't return an encoding.
-            self.encoding = None
+            self.content_encoding = None
         else:
-            self.encoding = encoding
+            self.content_encoding = encoding
 
     def is_redirect(self):
         """Check if current response is a redirect."""
@@ -292,7 +292,8 @@ class HttpUrl(internpaturl.InternPatternUrl):
         if response:
             log.debug(LOG_CHECK, "Redirected response headers %s", response.headers)
             self.set_encoding(response.encoding)
-            log.debug(LOG_CHECK, "Redirected response encoding %s", self.encoding)
+            log.debug(
+                LOG_CHECK, "Redirected response encoding %s", self.content_encoding)
 
     def check_response(self):
         """Check final result and log it."""
@@ -327,7 +328,7 @@ class HttpUrl(internpaturl.InternPatternUrl):
         self.set_result(_("OK"))
 
     def get_content(self):
-        return super().get_content(self.encoding)
+        return super().get_content(self.content_encoding)
 
     def read_content(self):
         """Return data and data size for this URL.
diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py
index e7264dcb..9e9e16b9 100644
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@@ -253,6 +253,8 @@ class UrlBase:
         self.url_connection = None
         # data of url content, (data == None) means no data is available
         self.data = None
+        # url content data encoding
+        self.content_encoding = None
         # url content as a Unicode string
         self.text = None
         # url content as a Beautiful Soup object
@@ -759,9 +761,9 @@ class UrlBase:
             log.debug(
                 LOG_CHECK, "Beautiful Soup detected %s", self.soup.original_encoding
             )
-            self.encoding = self.soup.original_encoding or 'ISO-8859-1'
-            log.debug(LOG_CHECK, "Content encoding %s", self.encoding)
-            self.text = self.data.decode(self.encoding)
+            self.content_encoding = self.soup.original_encoding or 'ISO-8859-1'
+            log.debug(LOG_CHECK, "Content encoding %s", self.content_encoding)
+            self.text = self.data.decode(self.content_encoding)
         return self.text
 
     def read_content(self):
@@ -794,7 +796,7 @@ class UrlBase:
     def add_url(self, url, line=0, column=0, page=0, name="", base=None):
         """Add new URL to queue."""
         if base:
-            base_ref = urlutil.url_norm(base, encoding=self.encoding)[0]
+            base_ref = urlutil.url_norm(base, encoding=self.content_encoding)[0]
         else:
             base_ref = None
         url_data = get_url_from(
@@ -808,7 +810,7 @@ class UrlBase:
             page=page,
             name=name,
             parent_content_type=self.content_type,
-            url_encoding=self.encoding,
+            url_encoding=self.content_encoding,
         )
         self.aggregate.urlqueue.put(url_data)
 