mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-16 22:10:26 +00:00
Separate URL encoding and content encoding
Ensure users of url_data.encoding are using the URL encoding.
Combined since:
5fc01455 ("Decode content when retrieved, use bs4 to detect encoding if non-Unicode", 2019-09-30)
This commit is contained in:
parent
61071fc5dc
commit
52b9881820
2 changed files with 13 additions and 10 deletions
|
|
@ -175,7 +175,7 @@ class HttpUrl(internpaturl.InternPatternUrl):
|
|||
self.headers = self.url_connection.headers
|
||||
log.debug(LOG_CHECK, "Response headers %s", self.headers)
|
||||
self.set_encoding(self.url_connection.encoding)
|
||||
log.debug(LOG_CHECK, "Response encoding %s", self.encoding)
|
||||
log.debug(LOG_CHECK, "Response encoding %s", self.content_encoding)
|
||||
self._add_ssl_info()
|
||||
|
||||
def _add_response_info(self):
|
||||
|
|
@ -237,9 +237,9 @@ class HttpUrl(internpaturl.InternPatternUrl):
|
|||
# set by Requests.
|
||||
# We fall back to it in UrlBase.get_content() if Beautiful Soup
|
||||
# doesn't return an encoding.
|
||||
self.encoding = None
|
||||
self.content_encoding = None
|
||||
else:
|
||||
self.encoding = encoding
|
||||
self.content_encoding = encoding
|
||||
|
||||
def is_redirect(self):
|
||||
"""Check if current response is a redirect."""
|
||||
|
|
@ -292,7 +292,8 @@ class HttpUrl(internpaturl.InternPatternUrl):
|
|||
if response:
|
||||
log.debug(LOG_CHECK, "Redirected response headers %s", response.headers)
|
||||
self.set_encoding(response.encoding)
|
||||
log.debug(LOG_CHECK, "Redirected response encoding %s", self.encoding)
|
||||
log.debug(
|
||||
LOG_CHECK, "Redirected response encoding %s", self.content_encoding)
|
||||
|
||||
def check_response(self):
|
||||
"""Check final result and log it."""
|
||||
|
|
@ -327,7 +328,7 @@ class HttpUrl(internpaturl.InternPatternUrl):
|
|||
self.set_result(_("OK"))
|
||||
|
||||
def get_content(self):
|
||||
return super().get_content(self.encoding)
|
||||
return super().get_content(self.content_encoding)
|
||||
|
||||
def read_content(self):
|
||||
"""Return data and data size for this URL.
|
||||
|
|
|
|||
|
|
@ -253,6 +253,8 @@ class UrlBase:
|
|||
self.url_connection = None
|
||||
# data of url content, (data == None) means no data is available
|
||||
self.data = None
|
||||
# url content data encoding
|
||||
self.content_encoding = None
|
||||
# url content as a Unicode string
|
||||
self.text = None
|
||||
# url content as a Beautiful Soup object
|
||||
|
|
@ -759,9 +761,9 @@ class UrlBase:
|
|||
log.debug(
|
||||
LOG_CHECK, "Beautiful Soup detected %s", self.soup.original_encoding
|
||||
)
|
||||
self.encoding = self.soup.original_encoding or 'ISO-8859-1'
|
||||
log.debug(LOG_CHECK, "Content encoding %s", self.encoding)
|
||||
self.text = self.data.decode(self.encoding)
|
||||
self.content_encoding = self.soup.original_encoding or 'ISO-8859-1'
|
||||
log.debug(LOG_CHECK, "Content encoding %s", self.content_encoding)
|
||||
self.text = self.data.decode(self.content_encoding)
|
||||
return self.text
|
||||
|
||||
def read_content(self):
|
||||
|
|
@ -794,7 +796,7 @@ class UrlBase:
|
|||
def add_url(self, url, line=0, column=0, page=0, name="", base=None):
|
||||
"""Add new URL to queue."""
|
||||
if base:
|
||||
base_ref = urlutil.url_norm(base, encoding=self.encoding)[0]
|
||||
base_ref = urlutil.url_norm(base, encoding=self.content_encoding)[0]
|
||||
else:
|
||||
base_ref = None
|
||||
url_data = get_url_from(
|
||||
|
|
@ -808,7 +810,7 @@ class UrlBase:
|
|||
page=page,
|
||||
name=name,
|
||||
parent_content_type=self.content_type,
|
||||
url_encoding=self.encoding,
|
||||
url_encoding=self.content_encoding,
|
||||
)
|
||||
self.aggregate.urlqueue.put(url_data)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue