Decode content when retrieved, use bs4 to detect encoding if non-Unicode

UrlBase has been modified as follows:
- the "data" variable now holds bytes;
- decoded content is stored in a new variable, "text";
- functionality from get_content() has been split out into get_raw_content(), which returns "data", and download_content(), which calls read_content() and sets the download-related variables.

This allows subclasses to do their own decoding and parsers to use bytes.
2026-04-26 00:54:43 +00:00 · 2019-09-30 19:46:24 +01:00 · 2019-09-30 19:46:24 +01:00 · 5fc01455b7
commit 5fc01455b7
parent 0c90c718bf
3 changed files with 39 additions and 19 deletions
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@ -38,13 +38,16 @@ import time
 import errno
 import socket
 import select
-try:
+from io import BytesIO
    from cStringIO import StringIO
 except ImportError:
    # Python 3
    from io import StringIO
 from builtins import str as str_text
 from future.utils import python_2_unicode_compatible
 from warnings import filterwarnings
 filterwarnings("ignore",
    message="The soupsieve package is not installed. CSS selectors cannot be used.",
    category=UserWarning, module="bs4")
 from bs4 import BeautifulSoup
 from . import absolute_url, get_url_from
 from .. import (log, LOG_CHECK,
@ -216,6 +219,8 @@ class UrlBase (object):
        self.url_connection = None
        # data of url content,  (data == None) means no data is available
        self.data = None
        # url content as a Unicode string
        self.text = None
        # cache url is set by build_url() calling set_cache_url()
        self.cache_url = None
        # extern flags (is_extern, is_strict)
@ -625,24 +630,35 @@ class UrlBase (object):
        """Indicate wether url get_content() can be called."""
        return self.size <= self.aggregate.config["maxfilesizedownload"]
-    def get_content (self):
+    def download_content(self):
-        """Precondition: url_connection is an opened URL."""
+        log.debug(LOG_CHECK, "Get content of %r", self.url)
-        if self.data is None:
+        t = time.time()
-            log.debug(LOG_CHECK, "Get content of %r", self.url)
+        content = self.read_content()
-            t = time.time()
+        self.size = len(content)
-            self.data = self.read_content()
+        self.dltime = time.time() - t
-            self.size = len(self.data)
+        if self.size == 0:
-            self.dltime = time.time() - t
+            self.add_warning(_("Content size is zero."),
            if self.size == 0:
                self.add_warning(_("Content size is zero."),
                             tag=WARN_URL_CONTENT_SIZE_ZERO)
-            else:
+        else:
-                self.aggregate.add_downloaded_bytes(self.size)
+            self.aggregate.add_downloaded_bytes(self.size)
        return content
    def get_raw_content(self):
        """Return the undecoded content of this URL, downloading it on first use.

        The value returned by download_content() is cached in self.data, so
        later calls hand back the stored value instead of downloading again.
        """
        raw = self.data
        if raw is None:
            # Nothing cached yet: fetch the content and remember it.
            raw = self.data = self.download_content()
        return raw
    def get_content (self):
        """Return the content of this URL decoded to a Unicode string.

        On first use the raw bytes are obtained through get_raw_content() and
        Beautiful Soup is used to detect their encoding. The decoded text is
        cached in self.text and the encoding used for decoding is stored in
        self.encoding; later calls return the cached text.
        """
        if self.text is None:
            self.get_raw_content()
            soup = BeautifulSoup(self.data, "html.parser")
            # original_encoding can be None when Beautiful Soup could not
            # determine an encoding; decoding with None would raise TypeError.
            # ISO-8859-1 maps every byte value, so it never fails to decode:
            # possibly mangled text is preferable to an internal crash.
            self.encoding = soup.original_encoding or "ISO-8859-1"
            self.text = self.data.decode(self.encoding)
        return self.text
    def read_content(self):
        """Return data for this URL. Can be overridden in subclasses."""
-        buf = StringIO()
+        buf = BytesIO()
        data = self.read_content_chunk()
        while data:
            if buf.tell() + len(data) > self.aggregate.config["maxfilesizedownload"]:
@ -652,7 +668,9 @@ class UrlBase (object):
        return buf.getvalue()
    def read_content_chunk(self):
-        """Read one chunk of content from this URL."""
+        """Read one chunk of content from this URL.
        Precondition: url_connection is an opened URL.
        """
        return self.url_connection.read(self.ReadChunkBytes)
    def get_user_password (self):
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,5 @@
 # required:
 bs4
 requests >= 2.4
 pyxdg
 dnspython
--- a/setup.py
+++ b/setup.py
@ -503,6 +503,7 @@ args = dict(
    install_requires = [
        'requests >= 2.4',
        'dnspython',
        'bs4',
        'pyxdg',
        'future',
    ],