mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-16 20:31:01 +00:00
Create one function to make soup objects
This commit is contained in:
parent
9d8d251d06
commit
40f43ae41c
3 changed files with 7 additions and 13 deletions
|
|
@ -27,6 +27,10 @@ filterwarnings("ignore",
|
|||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
|
||||
def make_soup(markup, from_encoding=None):
    """Build a BeautifulSoup object for *markup* with the project's settings.

    Single place where soup objects are created, so every caller uses the
    same builder ("html.parser") and the same options.

    :param markup: the document to parse; passed straight to BeautifulSoup.
    :param from_encoding: optional encoding forwarded to BeautifulSoup's
        ``from_encoding`` argument; ``None`` leaves detection to the library.
    :return: the BeautifulSoup object created from *markup*.
    """
    soup_options = {
        "from_encoding": from_encoding,
        # None is passed through unchanged to BeautifulSoup's
        # multi_valued_attributes option.
        "multi_valued_attributes": None,
    }
    return BeautifulSoup(markup, "html.parser", **soup_options)
|
||||
|
||||
class Parser(object):
|
||||
handler = None
|
||||
encoding = None
|
||||
|
|
|
|||
|
|
@ -17,7 +17,6 @@
|
|||
"""
|
||||
Handle http links.
|
||||
"""
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
# The validity of SSL certs is ignored to be able
|
||||
# to check the URL and recurse into it.
|
||||
|
|
@ -305,9 +304,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
def get_content(self):
    """Return the decoded text of this URL's content, parsing it on first use.

    When ``self.text`` is still ``None`` the raw content is fetched with
    ``self.get_raw_content()``, parsed once via ``htmlsax.make_soup`` (using
    ``self.encoding`` as the encoding hint) and stored in ``self.soup``;
    ``self.data`` is then decoded and cached in ``self.text``.
    Later calls return the cached text without parsing again.

    :return: the content of ``self.data`` decoded to text.
    """
    if self.text is None:
        # presumably get_raw_content() fills self.data -- confirm in caller
        self.get_raw_content()
        # Parse exactly once; building a second soup from the same data
        # only to overwrite it would double the parsing cost.
        self.soup = htmlsax.make_soup(self.data, self.encoding)
        # original_encoding may be None when no encoding could be
        # determined; decode(None) would raise TypeError.  ISO-8859-1
        # maps every byte, so the fallback cannot fail.
        detected = self.soup.original_encoding or "ISO-8859-1"
        self.text = self.data.decode(detected)
    return self.text
|
||||
|
||||
|
|
|
|||
|
|
@ -41,17 +41,11 @@ import select
|
|||
from io import BytesIO
|
||||
from builtins import str as str_text
|
||||
from future.utils import python_2_unicode_compatible
|
||||
from warnings import filterwarnings
|
||||
|
||||
filterwarnings("ignore",
|
||||
message="The soupsieve package is not installed. CSS selectors cannot be used.",
|
||||
category=UserWarning, module="bs4")
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from . import absolute_url, get_url_from
|
||||
from .. import (log, LOG_CHECK,
|
||||
strformat, LinkCheckerError, url as urlutil, trace, get_link_pat)
|
||||
from ..HtmlParser import htmlsax
|
||||
from ..network import iputil
|
||||
from .const import (WARN_URL_EFFECTIVE_URL,
|
||||
WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
|
||||
|
|
@ -657,8 +651,7 @@ class UrlBase (object):
|
|||
def get_content(self):
    """Return the decoded text of this URL's content, parsing it on first use.

    When ``self.text`` is still ``None`` the raw content is fetched with
    ``self.get_raw_content()``, parsed once via ``htmlsax.make_soup`` and
    stored in ``self.soup``; ``self.data`` is then decoded and cached in
    ``self.text``, and the encoding reported by the parser is recorded in
    ``self.encoding``.  Later calls return the cached text.

    :return: the content of ``self.data`` decoded to text.
    """
    if self.text is None:
        # presumably get_raw_content() fills self.data -- confirm in caller
        self.get_raw_content()
        # Parse exactly once; building a second soup from the same data
        # only to overwrite it would double the parsing cost.
        self.soup = htmlsax.make_soup(self.data)
        detected = self.soup.original_encoding
        # original_encoding may be None when no encoding could be
        # determined; decode(None) would raise TypeError.  ISO-8859-1
        # maps every byte, so the fallback cannot fail.
        self.text = self.data.decode(detected or "ISO-8859-1")
        # Keep recording the parser's own result (possibly None), exactly
        # as before, so readers of self.encoding see no change.
        self.encoding = detected
    return self.text
|
||||
|
|
|
|||
Loading…
Reference in a new issue