Create one function to make soup objects

This commit is contained in:
Chris Mayo 2020-04-08 20:03:35 +01:00
parent 9d8d251d06
commit 40f43ae41c
3 changed files with 7 additions and 13 deletions

View file

@ -27,6 +27,10 @@ filterwarnings("ignore",
from bs4 import BeautifulSoup, Tag
def make_soup(markup, from_encoding=None):
    """Build a BeautifulSoup object for *markup* with the shared settings.

    The stdlib "html.parser" builder is always used, and
    ``multi_valued_attributes=None`` is passed so that attributes such as
    ``class`` are kept as plain strings instead of being split into lists.

    :param markup: the document to parse.
    :param from_encoding: optional encoding hint handed to BeautifulSoup
        unchanged; ``None`` leaves encoding detection to BeautifulSoup.
    :return: the parsed BeautifulSoup object.
    """
    soup = BeautifulSoup(
        markup,
        "html.parser",
        multi_valued_attributes=None,
        from_encoding=from_encoding,
    )
    return soup
class Parser(object):
handler = None
encoding = None

View file

@ -17,7 +17,6 @@
"""
Handle http links.
"""
from bs4 import BeautifulSoup
import requests
# The validity of SSL certs is ignored to be able
# to check the URL and recurse into it.
@ -305,9 +304,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
def get_content(self):
    """Return the decoded text content, parsing the raw data on first use.

    When ``self.text`` is still ``None`` the raw content is fetched via
    ``self.get_raw_content()``, parsed once with ``htmlsax.make_soup``
    (using ``self.encoding`` as the encoding hint) and decoded with the
    encoding BeautifulSoup settled on. Later calls return the cached
    ``self.text``.

    :return: the value of ``self.text``.
    """
    if self.text is None:
        # presumably fills self.data with the raw bytes -- verify against
        # get_raw_content()
        self.get_raw_content()
        # Parse exactly once through the shared factory; building a second
        # soup directly with BeautifulSoup only to overwrite it is wasted work.
        self.soup = htmlsax.make_soup(self.data, self.encoding)
        self.text = self.data.decode(self.soup.original_encoding)
    return self.text

View file

@ -41,17 +41,11 @@ import select
from io import BytesIO
from builtins import str as str_text
from future.utils import python_2_unicode_compatible
from warnings import filterwarnings
filterwarnings("ignore",
message="The soupsieve package is not installed. CSS selectors cannot be used.",
category=UserWarning, module="bs4")
from bs4 import BeautifulSoup
from . import absolute_url, get_url_from
from .. import (log, LOG_CHECK,
strformat, LinkCheckerError, url as urlutil, trace, get_link_pat)
from ..HtmlParser import htmlsax
from ..network import iputil
from .const import (WARN_URL_EFFECTIVE_URL,
WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
@ -657,8 +651,7 @@ class UrlBase (object):
def get_content (self):
    """Return the decoded text content, parsing the raw data on first use.

    When ``self.text`` is still ``None`` the raw content is fetched via
    ``self.get_raw_content()``, parsed once with ``htmlsax.make_soup`` and
    decoded with the encoding BeautifulSoup detected; that encoding is
    also stored in ``self.encoding``. Later calls return the cached
    ``self.text``.

    :return: the value of ``self.text``.
    """
    if self.text is None:
        # presumably fills self.data with the raw bytes -- verify against
        # get_raw_content()
        self.get_raw_content()
        # Parse exactly once through the shared factory; building a second
        # soup directly with BeautifulSoup only to overwrite it is wasted work.
        self.soup = htmlsax.make_soup(self.data)
        self.text = self.data.decode(self.soup.original_encoding)
        self.encoding = self.soup.original_encoding
    return self.text