mirror of https://github.com/Hopiu/linkchecker.git
Reuse soup object used for detecting encoding in the HTML parser
parent 978042a54e
commit 153e53ba03
4 changed files with 22 additions and 10 deletions
@@ -46,7 +46,11 @@ class Parser(object):
         self.html_doc = StringIO()
         self.html_doc.write(feed_text)
 
+    def feed_soup(self, soup):
+        self.soup = soup
+
     def reset(self):
+        self.soup = None
         self.html_doc = None
         self.tag_lineno = None
         self.tag_column = None
@@ -104,10 +108,11 @@ class Parser(object):
         self.handler.characters(content)
 
     def flush(self):
-        soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser')
-        if hasattr(soup, 'contents'):
-            self.parse_contents(soup.contents)
-        self.encoding = soup.original_encoding
+        if self.soup is None:
+            self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser')
+        if hasattr(self.soup, 'contents'):
+            self.parse_contents(self.soup.contents)
+        self.encoding = self.soup.original_encoding
 
     def debug(self, text):
         raise NotImplementedError("debug is not implemented")
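Why this matters: BeautifulSoup already parses the whole document when it detects the byte encoding, so building a second soup in flush() repeated that work for every page. With feed_soup(), a caller that already has a tree can hand it over, and flush() only falls back to parsing the buffered bytes itself. A standalone sketch of the parse-once idea (plain bs4, not linkchecker code; the sample markup is made up):

from bs4 import BeautifulSoup

raw = b"<html><head><meta charset='ISO-8859-1'></head><body>caf\xe9</body></html>"

# One parse does double duty: it yields the tree and the detected encoding.
soup = BeautifulSoup(raw, "html.parser")
print(soup.original_encoding)               # e.g. 'iso-8859-1'
print(raw.decode(soup.original_encoding))   # decoding reuses the detection

# Any later consumer can work from the same tree instead of reparsing:
print(soup.body.get_text())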
@@ -86,7 +86,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         handler.parser = parser
         # parse
         try:
-            parser.feed(self.get_raw_content())
+            parser.feed_soup(self.get_soup())
             parser.flush()
         except linkparse.StopParse as msg:
             log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
@@ -221,6 +221,8 @@ class UrlBase (object):
         self.data = None
         # url content as a Unicode string
         self.text = None
+        # url content as a Beautiful Soup object
+        self.soup = None
         # cache url is set by build_url() calling set_cache_url()
         self.cache_url = None
         # extern flags (is_extern, is_strict)
@@ -643,6 +645,11 @@ class UrlBase (object):
             self.aggregate.add_downloaded_bytes(self.size)
         return content
 
+    def get_soup(self):
+        if self.soup is None:
+            self.get_content()
+        return self.soup
+
     def get_raw_content(self):
         if self.data is None:
             self.data = self.download_content()
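get_soup() mirrors the lazy shape of the neighbouring get_raw_content(): each accessor fills its cache on first use and returns the cached object afterwards, with get_soup() leaning on get_content() to populate self.soup as a side effect. A compact standalone sketch of that accessor chain (hypothetical class, not the real UrlBase):

from bs4 import BeautifulSoup

class LazyDoc:
    def __init__(self, fetch):
        self._fetch = fetch          # callable returning the raw bytes
        self.data = self.text = self.soup = None

    def get_raw_content(self):
        if self.data is None:        # download once
            self.data = self._fetch()
        return self.data

    def get_content(self):
        if self.text is None:        # parse + decode once, caching the soup
            self.get_raw_content()
            self.soup = BeautifulSoup(self.data, "html.parser")
            self.text = self.data.decode(self.soup.original_encoding)
        return self.text

    def get_soup(self):
        if self.soup is None:        # get_content() fills self.soup
            self.get_content()
        return self.soup

doc = LazyDoc(lambda: b"<p>hello</p>")
assert doc.get_soup() is doc.get_soup()   # same cached object both times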
@@ -651,9 +658,9 @@
     def get_content (self):
         if self.text is None:
             self.get_raw_content()
-            soup = BeautifulSoup(self.data, "html.parser")
-            self.text = self.data.decode(soup.original_encoding)
-            self.encoding = soup.original_encoding
+            self.soup = BeautifulSoup(self.data, "html.parser")
+            self.text = self.data.decode(self.soup.original_encoding)
+            self.encoding = self.soup.original_encoding
         return self.text
 
     def read_content(self):
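A bs4 detail the decode above depends on (not shown in the diff): original_encoding is only set when the soup was built from bytes; parsing a str leaves it None, which would make the decode call fail. It is therefore essential that self.data holds the raw byte payload:

from bs4 import BeautifulSoup

assert BeautifulSoup(b"<p>ok</p>", "html.parser").original_encoding is not None
assert BeautifulSoup("<p>ok</p>", "html.parser").original_encoding is None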
@@ -128,9 +128,9 @@ def find_links (url_data, callback, tags):
    handler.parser = parser
    # parse
    try:
-        content = url_data.get_raw_content()
+        soup = url_data.get_soup()
        with parse_mutex:
-            parser.feed(content)
+            parser.feed_soup(soup)
            parser.flush()
    except linkparse.StopParse as msg:
        log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
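The reordering in find_links keeps the expensive steps outside the lock: downloading and building the soup happen before parse_mutex is taken, and only feeding the shared parser is serialized. A sketch of that locking order (hypothetical wrapper function; parse_mutex, get_soup, feed_soup, and flush are the names from the diff):

import threading

parse_mutex = threading.Lock()

def parse_one(url_data, parser):
    soup = url_data.get_soup()    # may download and parse; no lock held
    with parse_mutex:             # guard only the shared parser state
        parser.feed_soup(soup)
        parser.flush()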