diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py
index 7975b6e7..df35d722 100644
--- a/linkcheck/HtmlParser/htmlsax.py
+++ b/linkcheck/HtmlParser/htmlsax.py
@@ -17,7 +17,6 @@
HTML parser implemented using Beautiful Soup and html.parser.
"""
-from io import BytesIO, StringIO
from warnings import filterwarnings
filterwarnings("ignore",
@@ -27,64 +26,39 @@ filterwarnings("ignore",
from bs4 import BeautifulSoup, Tag
+def make_soup(markup, from_encoding=None):
+ return BeautifulSoup(markup, "html.parser", from_encoding=from_encoding,
+ multi_valued_attributes=None)
+
class Parser(object):
handler = None
- encoding = None
def __init__(self, handler):
self.handler = handler
- self.reset()
-
- def feed(self, feed_text):
- if not self.html_doc:
- if isinstance(feed_text, bytes):
- self.html_doc = BytesIO()
- else:
- self.html_doc = StringIO()
- self.html_doc.write(feed_text)
def feed_soup(self, soup):
- self.soup = soup
-
- def reset(self):
- self.soup = None
- self.html_doc = None
- self.tag_lineno = None
- self.tag_column = None
+ self.parse_contents(soup.contents)
def parse_contents(self, contents):
for content in contents:
if isinstance(content, Tag):
- self.tag_lineno = content.sourceline
- self.tag_column = None if content.sourcepos is None \
+ tag_column = None if content.sourcepos is None \
else content.sourcepos + 1
if content.is_empty_element:
self.handler.start_end_element(
content.name, content.attrs, content.text.strip(),
+ content.sourceline, tag_column
)
else:
self.handler.start_element(
content.name, content.attrs, content.text.strip(),
+ content.sourceline, tag_column
)
if hasattr(content, 'contents'): # recursion
self.parse_contents(content.contents)
if hasattr(self.handler, 'end_element'):
self.handler.end_element(content.name)
- def flush(self):
- if self.soup is None:
- self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser',
- multi_valued_attributes=None)
- if hasattr(self.soup, 'contents'):
- self.parse_contents(self.soup.contents)
- self.encoding = self.soup.original_encoding
-
- def lineno(self):
- return self.tag_lineno
-
- def column(self):
- return self.tag_column
-
def parser(handler=None):
return Parser(handler)
diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py
index 9e6459ef..46dab657 100644
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -17,7 +17,6 @@
"""
Handle http links.
"""
-from bs4 import BeautifulSoup
import requests
# The validity of SSL certs is ignored to be able
# the check the URL and recurse into it.
@@ -83,17 +82,12 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
# construct parser object
handler = linkparse.MetaRobotsFinder()
parser = htmlsax.parser(handler)
- handler.parser = parser
# parse
try:
parser.feed_soup(self.get_soup())
- parser.flush()
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
pass
- # break cyclic dependencies
- handler.parser = None
- parser.handler = None
return handler.follow
def add_size_info (self):
@@ -309,9 +303,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
def get_content(self):
if self.text is None:
self.get_raw_content()
- self.soup = BeautifulSoup(self.data, "html.parser",
- multi_valued_attributes=None,
- from_encoding=self.encoding)
+ self.soup = htmlsax.make_soup(self.data, self.encoding)
self.text = self.data.decode(self.soup.original_encoding)
return self.text
diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py
index ca924ad3..bb7debef 100644
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@@ -41,17 +41,11 @@ import select
from io import BytesIO
from builtins import str as str_text
from future.utils import python_2_unicode_compatible
-from warnings import filterwarnings
-
-filterwarnings("ignore",
- message="The soupsieve package is not installed. CSS selectors cannot be used.",
- category=UserWarning, module="bs4")
-
-from bs4 import BeautifulSoup
from . import absolute_url, get_url_from
from .. import (log, LOG_CHECK,
strformat, LinkCheckerError, url as urlutil, trace, get_link_pat)
+from ..HtmlParser import htmlsax
from ..network import iputil
from .const import (WARN_URL_EFFECTIVE_URL,
WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
@@ -657,8 +651,7 @@ class UrlBase (object):
def get_content (self):
if self.text is None:
self.get_raw_content()
- self.soup = BeautifulSoup(self.data, "html.parser",
- multi_valued_attributes=None)
+ self.soup = htmlsax.make_soup(self.data)
self.text = self.data.decode(self.soup.original_encoding)
self.encoding = self.soup.original_encoding
return self.text
diff --git a/linkcheck/htmlutil/formsearch.py b/linkcheck/htmlutil/formsearch.py
index 9419a6c4..eca99ed6 100644
--- a/linkcheck/htmlutil/formsearch.py
+++ b/linkcheck/htmlutil/formsearch.py
@@ -44,13 +44,10 @@ class FormFinder(object):
def __init__(self):
"""Initialize local variables."""
super(FormFinder, self).__init__()
- # parser object will be initialized when it is used as
- # a handler object
- self.parser = None
self.forms = []
self.form = None
- def start_element(self, tag, attrs, element_text=None):
+ def start_element(self, tag, attrs, element_text, lineno, column):
"""Does nothing, override in a subclass."""
if tag == u'form':
if u'action' in attrs:
@@ -69,10 +66,10 @@ class FormFinder(object):
log.warn(LOG_CHECK, "formless input %s" % attrs)
pass
- def start_end_element(self, tag, attrs, element_text=None):
+ def start_end_element(self, tag, attrs, element_text, lineno, column):
"""Delegate a combined start/end element (eg. ) to
the start_element method. Ignore the end element part."""
- self.start_element(tag, attrs, element_text)
+ self.start_element(tag, attrs, element_text, lineno, column)
def end_element(self, tag):
"""search for ending form values."""
@@ -87,13 +84,8 @@ def search_form(content, cgiuser, cgipassword):
"""
handler = FormFinder()
parser = htmlsax.parser(handler)
- handler.parser = parser
# parse
- parser.feed(content)
- parser.flush()
- # break cyclic dependencies
- handler.parser = None
- parser.handler = None
+ parser.feed_soup(htmlsax.make_soup(content))
log.debug(LOG_CHECK, "Found forms %s", handler.forms)
cginames = (cgiuser.lower(), cgipassword.lower())
for form in handler.forms:
diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py
index e5295817..b2ed61e6 100644
--- a/linkcheck/htmlutil/linkparse.py
+++ b/linkcheck/htmlutil/linkparse.py
@@ -104,18 +104,15 @@ class TagFinder (object):
def __init__ (self):
"""Initialize local variables."""
super(TagFinder, self).__init__()
- # parser object will be initialized when it is used as
- # a handler object
- self.parser = None
- def start_element (self, tag, attrs):
+ def start_element (self, tag, attrs, element_text, lineno, column):
"""Does nothing, override in a subclass."""
pass
- def start_end_element (self, tag, attrs, element_text=None):
+ def start_end_element (self, tag, attrs, element_text, lineno, column):
"""Delegate a combined start/end element (eg.
) to
the start_element method. Ignore the end element part."""
- self.start_element(tag, attrs, element_text)
+ self.start_element(tag, attrs, element_text, lineno, column)
class MetaRobotsFinder (TagFinder):
@@ -127,7 +124,7 @@ class MetaRobotsFinder (TagFinder):
log.debug(LOG_CHECK, "meta robots finder")
self.follow = self.index = True
- def start_element (self, tag, attrs, element_text=None):
+ def start_element (self, tag, attrs, element_text, lineno, column):
"""Search for meta robots.txt "nofollow" and "noindex" flags."""
if tag == 'meta' and attrs.get('name') == 'robots':
val = attrs.get('content', u'').lower().split(u',')
@@ -177,10 +174,10 @@ class LinkFinder (TagFinder):
self.tags[tag].update(self.universal_attrs)
self.base_ref = u''
- def start_element (self, tag, attrs, element_text=None):
+ def start_element (self, tag, attrs, element_text, lineno, column):
"""Search for links and store found URLs in a list."""
log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
- log.debug(LOG_CHECK, "line %d col %d", self.parser.lineno(), self.parser.column())
        log.debug(LOG_CHECK, "line %s col %s", lineno, column)
if tag == "base" and not self.base_ref:
self.base_ref = attrs.get("href", u'')
tagattrs = self.tags.get(tag, self.universal_attrs)
@@ -205,7 +202,7 @@ class LinkFinder (TagFinder):
value = value.split(':', 1)[1]
value = 'dns:' + value.rstrip('/')
# parse tag for URLs
- self.parse_tag(tag, attr, value, name, base)
+ self.parse_tag(tag, attr, value, name, base, lineno, column)
log.debug(LOG_CHECK, "LinkFinder finished tag %s", tag)
def get_link_name (self, tag, attrs, attr, name=None):
@@ -221,7 +218,7 @@ class LinkFinder (TagFinder):
name = u""
return name
- def parse_tag (self, tag, attr, value, name, base):
+ def parse_tag (self, tag, attr, value, name, base, lineno, column):
"""Add given url data to url list."""
assert isinstance(tag, str_text), repr(tag)
assert isinstance(attr, str_text), repr(attr)
@@ -232,25 +229,24 @@ class LinkFinder (TagFinder):
if tag == u'meta' and value:
mo = refresh_re.match(value)
if mo:
- self.found_url(mo.group("url"), name, base)
+ self.found_url(mo.group("url"), name, base, lineno, column)
elif attr != 'content':
- self.found_url(value, name, base)
+ self.found_url(value, name, base, lineno, column)
elif attr == u'style' and value:
for mo in css_url_re.finditer(value):
url = unquote(mo.group("url"), matching=True)
- self.found_url(url, name, base)
+ self.found_url(url, name, base, lineno, column)
elif attr == u'archive':
for url in value.split(u','):
- self.found_url(url, name, base)
+ self.found_url(url, name, base, lineno, column)
elif attr == u'srcset':
for img_candidate in value.split(u','):
url = img_candidate.split()[0]
- self.found_url(url, name, base)
+ self.found_url(url, name, base, lineno, column)
else:
- self.found_url(value, name, base)
+ self.found_url(value, name, base, lineno, column)
- def found_url(self, url, name, base):
+ def found_url(self, url, name, base, lineno, column):
"""Add newly found URL to queue."""
assert isinstance(url, str_text) or url is None, repr(url)
- self.callback(url, line=self.parser.lineno(),
- column=self.parser.column(), name=name, base=base)
+ self.callback(url, line=lineno, column=column, name=name, base=base)
diff --git a/linkcheck/parser/__init__.py b/linkcheck/parser/__init__.py
index dc3494fb..b35892a8 100644
--- a/linkcheck/parser/__init__.py
+++ b/linkcheck/parser/__init__.py
@@ -120,18 +120,13 @@ def find_links (url_data, callback, tags):
# construct parser object
handler = linkparse.LinkFinder(callback, tags)
parser = htmlsax.parser(handler)
- handler.parser = parser
# parse
try:
soup = url_data.get_soup()
parser.feed_soup(soup)
- parser.flush()
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
pass
- # break cyclic dependencies
- handler.parser = None
- parser.handler = None
def parse_firefox (url_data):
diff --git a/tests/htmllib.py b/tests/htmllib.py
index 6f1c5b19..ab16ac46 100644
--- a/tests/htmllib.py
+++ b/tests/htmllib.py
@@ -15,50 +15,12 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
-Default HTML parser handler classes.
+HTML parser handler test class.
"""
import sys
-class HtmlPrinter:
- """
- Handles all functions by printing the function name and attributes.
- """
-
- def __init__ (self, fd=sys.stdout):
- """
- Write to given file descriptor.
-
- @param fd: file like object (default=sys.stdout)
- @type fd: file
- """
- self.fd = fd
-
- def _print (self, *attrs):
- """
- Print function attributes to stored file descriptor.
-
- @param attrs: list of values to print
- @type attrs: tuple
- @return: None
- """
- self.fd.write(self.mem)
- self.fd.write(str(attrs))
-
- def __getattr__ (self, name):
- """
- Remember the called method name in self.mem.
-
- @param name: attribute name
- @type name: string
- @return: method which just prints out its arguments
- @rtype: a bound function object
- """
- self.mem = name
- return self._print
-
-
class HtmlPrettyPrinter:
"""
Print out all parsed HTML data in encoded form.
@@ -77,7 +39,7 @@ class HtmlPrettyPrinter:
self.fd = fd
self.encoding = encoding
- def start_element (self, tag, attrs, element_text=None):
+ def start_element (self, tag, attrs, element_text, lineno, column):
"""
Print HTML start element.
@@ -89,7 +51,7 @@ class HtmlPrettyPrinter:
"""
self._start_element(tag, attrs, ">", element_text)
- def start_end_element (self, tag, attrs, element_text=None):
+ def start_end_element (self, tag, attrs, element_text, lineno, column):
"""
Print HTML start-end element.
@@ -101,7 +63,7 @@ class HtmlPrettyPrinter:
"""
self._start_element(tag, attrs, "/>", element_text)
- def _start_element (self, tag, attrs, end, element_text=None):
+ def _start_element (self, tag, attrs, end, element_text):
"""
Print HTML element with end string.
diff --git a/tests/test_linkparser.py b/tests/test_linkparser.py
index 0b965af2..e0962f7b 100644
--- a/tests/test_linkparser.py
+++ b/tests/test_linkparser.py
@@ -20,7 +20,7 @@ Test linkparser routines.
import unittest
from linkcheck.htmlutil import linkparse
-import linkcheck.HtmlParser.htmlsax
+from linkcheck.HtmlParser import htmlsax
class TestLinkparser (unittest.TestCase):
@@ -31,15 +31,11 @@ class TestLinkparser (unittest.TestCase):
def _test_one_link (self, content, url):
self.count_url = 0
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
- p = linkcheck.HtmlParser.htmlsax.parser(h)
- h.parser = p
+ p = htmlsax.parser(h)
try:
- p.feed(content)
- p.flush()
+ p.feed_soup(htmlsax.make_soup(content))
except linkparse.StopParse:
pass
- h.parser = None
- p.handler = None
self.assertEqual(self.count_url, 1)
def _test_one_url (self, origurl):
@@ -53,15 +49,11 @@ class TestLinkparser (unittest.TestCase):
def callback (url, line, column, name, base):
self.assertTrue(False, 'URL %r found' % url)
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
- p = linkcheck.HtmlParser.htmlsax.parser(h)
- h.parser = p
+ p = htmlsax.parser(h)
try:
- p.feed(content)
- p.flush()
+ p.feed_soup(htmlsax.make_soup(content))
except linkparse.StopParse:
pass
- h.parser = None
- p.handler = None
def test_href_parsing (self):
# Test parsing.
diff --git a/tests/test_parser.py b/tests/test_parser.py
index fc831361..7e087082 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -18,14 +18,14 @@
Test html parsing.
"""
-import linkcheck.HtmlParser.htmlsax
+from linkcheck.HtmlParser import htmlsax
from io import StringIO
import unittest
from parameterized import parameterized
-from .htmllib import HtmlPrinter, HtmlPrettyPrinter
+from .htmllib import HtmlPrettyPrinter
# list of tuples
# (, )
@@ -137,70 +137,23 @@ class TestParser (unittest.TestCase):
Test html parser.
"""
- def setUp (self):
- """
- Initialize two internal html parsers to be used for testing.
- """
- self.htmlparser = linkcheck.HtmlParser.htmlsax.parser()
- self.htmlparser2 = linkcheck.HtmlParser.htmlsax.parser()
-
@parameterized.expand(parsetests)
def test_parse (self, _in, _out):
# Parse all test patterns in one go.
out = StringIO()
handler = HtmlPrettyPrinter(out)
- self.htmlparser.handler = handler
- self.htmlparser.feed(_in)
- self.check_results(self.htmlparser, _in, _out, out)
+ parser = htmlsax.parser(handler)
+ parser.feed_soup(htmlsax.make_soup(_in))
+ self.check_results(_in, _out, out)
- def check_results (self, htmlparser, _in, _out, out):
+ def check_results (self, _in, _out, out):
"""
Check parse results.
"""
- htmlparser.flush()
res = out.getvalue()
msg = "Test error; in: %r, out: %r, expect: %r" % \
(_in, res, _out)
self.assertEqual(res, _out, msg=msg)
- htmlparser.reset()
-
- @parameterized.expand(parsetests)
- def test_feed (self, _in, _out):
- # Parse all test patterns sequentially.
- out = StringIO()
- handler = HtmlPrettyPrinter(out)
- self.htmlparser.handler = handler
- for c in _in:
- self.htmlparser.feed(c)
- self.check_results(self.htmlparser, _in, _out, out)
-
- @parameterized.expand(parsetests)
- def test_interwoven (self, _in, _out):
- # Parse all test patterns on two parsers interwoven.
- out = StringIO()
- out2 = StringIO()
- handler = HtmlPrettyPrinter(out)
- self.htmlparser.handler = handler
- handler2 = HtmlPrettyPrinter(out2)
- self.htmlparser2.handler = handler2
- for c in _in:
- self.htmlparser.feed(c)
- self.htmlparser2.feed(c)
- self.check_results(self.htmlparser, _in, _out, out)
- self.check_results(self.htmlparser2, _in, _out, out2)
-
- @parameterized.expand(parsetests)
- def test_handler (self, _in, _out):
- out = StringIO()
- out2 = StringIO()
- handler = HtmlPrinter(out)
- self.htmlparser.handler = handler
- handler2 = HtmlPrinter(out2)
- self.htmlparser2.handler = handler2
- for c in _in:
- self.htmlparser.feed(c)
- self.htmlparser2.feed(c)
- self.assertEqual(out.getvalue(), out2.getvalue())
def test_encoding_detection_utf_content (self):
html = b''
@@ -227,11 +180,9 @@ class TestParser (unittest.TestCase):
self.encoding_test(html, "ascii")
def encoding_test (self, html, expected):
- parser = linkcheck.HtmlParser.htmlsax.parser()
- self.assertEqual(parser.encoding, None)
out = StringIO()
handler = HtmlPrettyPrinter(out)
- parser.handler = handler
- parser.feed(html)
- parser.flush()
- self.assertEqual(parser.encoding, expected)
+ parser = htmlsax.parser(handler)
+ soup = htmlsax.make_soup(html)
+ parser.feed_soup(soup)
+ self.assertEqual(soup.original_encoding, expected)