mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-26 00:54:43 +00:00
Decode content when retrieved, use bs4 to detect encoding if non-Unicode
UrlBase has been modified as follows:
- the "data" variable now holds bytes;
- decoded content is stored in a new variable, "text";
- the functionality of get_content() has been split out into get_raw_content(), which returns "data", and download_content(), which calls read_content() and sets the download-related variables.
This allows subclasses to do their own decoding and parsers to use bytes.
This commit is contained in:
parent
0c90c718bf
commit
5fc01455b7
3 changed files with 39 additions and 19 deletions
|
|
@ -38,13 +38,16 @@ import time
|
||||||
import errno
|
import errno
|
||||||
import socket
|
import socket
|
||||||
import select
|
import select
|
||||||
try:
|
from io import BytesIO
|
||||||
from cStringIO import StringIO
|
|
||||||
except ImportError:
|
|
||||||
# Python 3
|
|
||||||
from io import StringIO
|
|
||||||
from builtins import str as str_text
|
from builtins import str as str_text
|
||||||
from future.utils import python_2_unicode_compatible
|
from future.utils import python_2_unicode_compatible
|
||||||
|
from warnings import filterwarnings
|
||||||
|
|
||||||
|
filterwarnings("ignore",
|
||||||
|
message="The soupsieve package is not installed. CSS selectors cannot be used.",
|
||||||
|
category=UserWarning, module="bs4")
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from . import absolute_url, get_url_from
|
from . import absolute_url, get_url_from
|
||||||
from .. import (log, LOG_CHECK,
|
from .. import (log, LOG_CHECK,
|
||||||
|
|
@ -216,6 +219,8 @@ class UrlBase (object):
|
||||||
self.url_connection = None
|
self.url_connection = None
|
||||||
# data of url content, (data == None) means no data is available
|
# data of url content, (data == None) means no data is available
|
||||||
self.data = None
|
self.data = None
|
||||||
|
# url content as a Unicode string
|
||||||
|
self.text = None
|
||||||
# cache url is set by build_url() calling set_cache_url()
|
# cache url is set by build_url() calling set_cache_url()
|
||||||
self.cache_url = None
|
self.cache_url = None
|
||||||
# extern flags (is_extern, is_strict)
|
# extern flags (is_extern, is_strict)
|
||||||
|
|
@ -625,24 +630,35 @@ class UrlBase (object):
|
||||||
"""Indicate wether url get_content() can be called."""
|
"""Indicate wether url get_content() can be called."""
|
||||||
return self.size <= self.aggregate.config["maxfilesizedownload"]
|
return self.size <= self.aggregate.config["maxfilesizedownload"]
|
||||||
|
|
||||||
def get_content (self):
|
def download_content(self):
|
||||||
"""Precondition: url_connection is an opened URL."""
|
log.debug(LOG_CHECK, "Get content of %r", self.url)
|
||||||
if self.data is None:
|
t = time.time()
|
||||||
log.debug(LOG_CHECK, "Get content of %r", self.url)
|
content = self.read_content()
|
||||||
t = time.time()
|
self.size = len(content)
|
||||||
self.data = self.read_content()
|
self.dltime = time.time() - t
|
||||||
self.size = len(self.data)
|
if self.size == 0:
|
||||||
self.dltime = time.time() - t
|
self.add_warning(_("Content size is zero."),
|
||||||
if self.size == 0:
|
|
||||||
self.add_warning(_("Content size is zero."),
|
|
||||||
tag=WARN_URL_CONTENT_SIZE_ZERO)
|
tag=WARN_URL_CONTENT_SIZE_ZERO)
|
||||||
else:
|
else:
|
||||||
self.aggregate.add_downloaded_bytes(self.size)
|
self.aggregate.add_downloaded_bytes(self.size)
|
||||||
|
return content
|
||||||
|
|
||||||
|
def get_raw_content(self):
|
||||||
|
if self.data is None:
|
||||||
|
self.data = self.download_content()
|
||||||
return self.data
|
return self.data
|
||||||
|
|
||||||
|
def get_content (self):
|
||||||
|
if self.text is None:
|
||||||
|
self.get_raw_content()
|
||||||
|
soup = BeautifulSoup(self.data, "html.parser")
|
||||||
|
self.text = self.data.decode(soup.original_encoding)
|
||||||
|
self.encoding = soup.original_encoding
|
||||||
|
return self.text
|
||||||
|
|
||||||
def read_content(self):
|
def read_content(self):
|
||||||
"""Return data for this URL. Can be overridden in subclasses."""
|
"""Return data for this URL. Can be overridden in subclasses."""
|
||||||
buf = StringIO()
|
buf = BytesIO()
|
||||||
data = self.read_content_chunk()
|
data = self.read_content_chunk()
|
||||||
while data:
|
while data:
|
||||||
if buf.tell() + len(data) > self.aggregate.config["maxfilesizedownload"]:
|
if buf.tell() + len(data) > self.aggregate.config["maxfilesizedownload"]:
|
||||||
|
|
@ -652,7 +668,9 @@ class UrlBase (object):
|
||||||
return buf.getvalue()
|
return buf.getvalue()
|
||||||
|
|
||||||
def read_content_chunk(self):
|
def read_content_chunk(self):
|
||||||
"""Read one chunk of content from this URL."""
|
"""Read one chunk of content from this URL.
|
||||||
|
Precondition: url_connection is an opened URL.
|
||||||
|
"""
|
||||||
return self.url_connection.read(self.ReadChunkBytes)
|
return self.url_connection.read(self.ReadChunkBytes)
|
||||||
|
|
||||||
def get_user_password (self):
|
def get_user_password (self):
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
# required:
|
# required:
|
||||||
|
bs4
|
||||||
requests >= 2.4
|
requests >= 2.4
|
||||||
pyxdg
|
pyxdg
|
||||||
dnspython
|
dnspython
|
||||||
|
|
|
||||||
1
setup.py
1
setup.py
|
|
@ -503,6 +503,7 @@ args = dict(
|
||||||
install_requires = [
|
install_requires = [
|
||||||
'requests >= 2.4',
|
'requests >= 2.4',
|
||||||
'dnspython',
|
'dnspython',
|
||||||
|
'bs4',
|
||||||
'pyxdg',
|
'pyxdg',
|
||||||
'future',
|
'future',
|
||||||
],
|
],
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue