Reuse soup object used for detecting encoding in the HTML parser

This commit is contained in:
Chris Mayo 2019-10-05 19:38:57 +01:00
parent 978042a54e
commit 153e53ba03
4 changed files with 22 additions and 10 deletions

View file

@@ -46,7 +46,11 @@ class Parser(object):
self.html_doc = StringIO()
self.html_doc.write(feed_text)
def feed_soup(self, soup):
    """Record an externally built BeautifulSoup object so flush() can reuse it."""
    self.soup = soup
def reset(self):
    """Clear all per-document parser state back to its pristine form."""
    for attr in ("soup", "html_doc", "tag_lineno", "tag_column"):
        setattr(self, attr, None)
@@ -104,10 +108,11 @@ class Parser(object):
self.handler.characters(content)
def flush(self):
# Finish parsing: build/refresh the soup, walk its element tree via
# parse_contents(), and record the detected document encoding.
# NOTE(review): this span is a rendered diff with no +/- markers — the
# next four lines are the pre-change body (throwaway local soup) and the
# five after them are the post-change body that reuses a soup handed in
# via feed_soup(); only one variant exists in the real file.
# pre-change variant (removed by this commit):
soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser')
if hasattr(soup, 'contents'):
self.parse_contents(soup.contents)
self.encoding = soup.original_encoding
# post-change variant (added by this commit): parse only when no soup
# was supplied up front, so encoding detection work is not repeated.
if self.soup is None:
self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser')
if hasattr(self.soup, 'contents'):
self.parse_contents(self.soup.contents)
self.encoding = self.soup.original_encoding
def debug(self, text):
    """Abstract debug hook; concrete parser subclasses must override this."""
    raise NotImplementedError("debug is not implemented")

View file

@@ -86,7 +86,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
handler.parser = parser
# parse
try:
parser.feed(self.get_raw_content())
parser.feed_soup(self.get_soup())
parser.flush()
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)

View file

@@ -221,6 +221,8 @@ class UrlBase (object):
self.data = None
# url content as a Unicode string
self.text = None
# url content as a Beautiful Soup object
self.soup = None
# cache url is set by build_url() calling set_cache_url()
self.cache_url = None
# extern flags (is_extern, is_strict)
@@ -643,6 +645,11 @@ class UrlBase (object):
self.aggregate.add_downloaded_bytes(self.size)
return content
def get_soup(self):
    """Return the cached soup object, building it first via get_content()
    when it has not been created yet."""
    if self.soup is not None:
        return self.soup
    self.get_content()
    return self.soup
def get_raw_content(self):
if self.data is None:
self.data = self.download_content()
@@ -651,9 +658,9 @@
def get_content (self):
# Lazily decode the downloaded bytes (self.data) into text, letting
# BeautifulSoup sniff the encoding; the decoded text is cached on
# self.text and the detected encoding on self.encoding.
# NOTE(review): rendered diff without +/- markers — the next three
# statements are the pre-change version (local soup, discarded) and the
# following three are the post-change version that also caches the soup
# on self for reuse by get_soup(); only one variant exists on disk.
if self.text is None:
self.get_raw_content()
# pre-change variant (removed by this commit):
soup = BeautifulSoup(self.data, "html.parser")
self.text = self.data.decode(soup.original_encoding)
self.encoding = soup.original_encoding
# post-change variant (added by this commit):
self.soup = BeautifulSoup(self.data, "html.parser")
self.text = self.data.decode(self.soup.original_encoding)
self.encoding = self.soup.original_encoding
return self.text
def read_content(self):

View file

@@ -128,9 +128,9 @@ def find_links (url_data, callback, tags):
handler.parser = parser
# parse
try:
content = url_data.get_raw_content()
soup = url_data.get_soup()
with parse_mutex:
parser.feed(content)
parser.feed_soup(soup)
parser.flush()
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)