Separate URL encoding and content encoding

Ensure users of url_data.encoding are using the URL encoding.

The two encodings have been combined in a single attribute since:
5fc01455 ("Decode content when retrieved, use bs4 to detect encoding if non-Unicode", 2019-09-30)
This commit is contained in:
Chris Mayo 2022-09-29 19:21:11 +01:00
parent 61071fc5dc
commit 52b9881820
2 changed files with 13 additions and 10 deletions

View file

@@ -175,7 +175,7 @@ class HttpUrl(internpaturl.InternPatternUrl):
self.headers = self.url_connection.headers
log.debug(LOG_CHECK, "Response headers %s", self.headers)
self.set_encoding(self.url_connection.encoding)
-log.debug(LOG_CHECK, "Response encoding %s", self.encoding)
+log.debug(LOG_CHECK, "Response encoding %s", self.content_encoding)
self._add_ssl_info()
def _add_response_info(self):
@@ -237,9 +237,9 @@ class HttpUrl(internpaturl.InternPatternUrl):
# set by Requests.
# We fall back to it in UrlBase.get_content() if Beautiful Soup
# doesn't return an encoding.
-self.encoding = None
+self.content_encoding = None
else:
-self.encoding = encoding
+self.content_encoding = encoding
def is_redirect(self):
"""Check if current response is a redirect."""
@@ -292,7 +292,8 @@ class HttpUrl(internpaturl.InternPatternUrl):
if response:
log.debug(LOG_CHECK, "Redirected response headers %s", response.headers)
self.set_encoding(response.encoding)
-log.debug(LOG_CHECK, "Redirected response encoding %s", self.encoding)
+log.debug(
+LOG_CHECK, "Redirected response encoding %s", self.content_encoding)
def check_response(self):
"""Check final result and log it."""
@@ -327,7 +328,7 @@ class HttpUrl(internpaturl.InternPatternUrl):
self.set_result(_("OK"))
def get_content(self):
-return super().get_content(self.encoding)
+return super().get_content(self.content_encoding)
def read_content(self):
"""Return data and data size for this URL.

View file

@@ -253,6 +253,8 @@ class UrlBase:
self.url_connection = None
# data of url content, (data == None) means no data is available
self.data = None
+# url content data encoding
+self.content_encoding = None
# url content as a Unicode string
self.text = None
# url content as a Beautiful Soup object
@@ -759,9 +761,9 @@ class UrlBase:
log.debug(
LOG_CHECK, "Beautiful Soup detected %s", self.soup.original_encoding
)
-self.encoding = self.soup.original_encoding or 'ISO-8859-1'
-log.debug(LOG_CHECK, "Content encoding %s", self.encoding)
-self.text = self.data.decode(self.encoding)
+self.content_encoding = self.soup.original_encoding or 'ISO-8859-1'
+log.debug(LOG_CHECK, "Content encoding %s", self.content_encoding)
+self.text = self.data.decode(self.content_encoding)
return self.text
def read_content(self):
@@ -794,7 +796,7 @@ class UrlBase:
def add_url(self, url, line=0, column=0, page=0, name="", base=None):
"""Add new URL to queue."""
if base:
-base_ref = urlutil.url_norm(base, encoding=self.encoding)[0]
+base_ref = urlutil.url_norm(base, encoding=self.content_encoding)[0]
else:
base_ref = None
url_data = get_url_from(
@@ -808,7 +810,7 @@ class UrlBase:
page=page,
name=name,
parent_content_type=self.content_type,
-url_encoding=self.encoding,
+url_encoding=self.content_encoding,
)
self.aggregate.urlqueue.put(url_data)