Merge pull request #351 from cjmayo/tagsonly

Remove support for non-Tag elements from Parser
2026-05-08 06:34:50 +00:00 · 2020-04-01 12:17:18 -04:00 · 2020-04-01 12:17:18 -04:00 · cf4e6bb235
commit cf4e6bb235
parent 7c14bf1ad6 9fc651e82b
4 changed files with 24 additions and 205 deletions
--- a/linkcheck/HtmlParser/__init__.py
+++ b/linkcheck/HtmlParser/__init__.py
@@ -26,11 +26,6 @@ in the ListDict (ie. "<a href>" with lead to a {href: None} dict entry).

 Used callbacks of a handler are:

- Comments: <!--data-->
-  def comment (data)
-  @param data:
-  @type data: Unicode string
-
 - Start tag: <tag {attr1:value1, attr2:value2, ..}>
  def start_element (tag, attrs)
  @param tag: tag name
@@ -50,28 +45,6 @@ Used callbacks of a handler are:
  @param tag: tag name
  @type tag: Unicode string

- Document type: <!DOCTYPE data>
-  def doctype (data)
-  @param data: doctype string data
-  @type data: Unicode string
-
- Processing instruction (PI): <?name data?>
-  def pi (name, data=None)
-  @param name: instruction name
-  @type name: Unicode string
-  @param data: instruction data
-  @type data: Unicode string
-
- Character data: <![CDATA[data]]>
-  def cdata (data)
-  @param data: character data
-  @type data: Unicode string
-
- Characters: data
-  def characters(data): data
-  @param data: data
-  @type data: Unicode string
-
 Additionally, there are error and warning callbacks:

 - Parser warning.
--- a/linkcheck/HtmlParser/htmlsax.py
+++ b/linkcheck/HtmlParser/htmlsax.py
@@ -24,8 +24,7 @@ filterwarnings("ignore",
    message="The soupsieve package is not installed. CSS selectors cannot be used.",
    category=UserWarning, module="bs4")

-from bs4 import (BeautifulSoup, CData, Comment, Doctype, ProcessingInstruction,
-                 Tag)
+from bs4 import BeautifulSoup, Tag

 from ..containers import ListDict

@@ -87,28 +86,6 @@ class Parser(object):
                        self.parse_contents(content.contents)
                    if hasattr(self.handler, 'end_element'):
                        self.handler.end_element(content.name)
-                if content.comments:
-                    for comment in content.comments:
-                        if hasattr(self.handler, 'comment'):
-                            self.handler.comment(comment)
-            elif isinstance(content, Doctype):
-                if hasattr(self.handler, 'doctype'):
-                    self.handler.doctype(
-                        content[len('DOCTYPE '):]
-                        if content.upper().startswith('DOCTYPE ')
-                        else content)
-            elif isinstance(content, Comment):
-                if hasattr(self.handler, 'comment'):
-                    self.handler.comment(content.strip())
-            elif isinstance(content, CData):
-                if hasattr(self.handler, 'cdata'):
-                    self.handler.cdata(content)
-            elif isinstance(content, ProcessingInstruction):
-                if hasattr(self.handler, 'pi'):
-                    self.handler.pi(content.strip("? "))
-            else:
-                if hasattr(self.handler, 'characters'):
-                    self.handler.characters(content)

    def flush(self):
        if self.soup is None:
--- a/tests/htmllib.py
+++ b/tests/htmllib.py
@@ -19,11 +19,9 @@ Default HTML parser handler classes.
 """

 import sys
-from builtins import bytes, str as str_text
-from builtins import chr


-class HtmlPrinter (object):
+class HtmlPrinter:
    """
    Handles all functions by printing the function name and attributes.
    """
@@ -46,7 +44,7 @@ class HtmlPrinter (object):
        @return: None
        """
        self.fd.write(self.mem)
-        self.fd.write(str_text(attrs))
+        self.fd.write(str(attrs))

    def __getattr__ (self, name):
        """
@@ -61,7 +59,7 @@ class HtmlPrinter (object):
        return self._print


-class HtmlPrettyPrinter (object):
+class HtmlPrettyPrinter:
    """
    Print out all parsed HTML data in encoded form.
    Also stores error and warnings messages.
@@ -79,16 +77,6 @@ class HtmlPrettyPrinter (object):
        self.fd = fd
        self.encoding = encoding

-    def comment (self, data):
-        """
-        Print HTML comment.
-
-        @param data: the comment
-        @type data: string
-        @return: None
-        """
-        self.fd.write("<!-- %s -->" % data)
-
    def start_element (self, tag, attrs, element_text=None):
        """
        Print HTML start element.
@@ -99,7 +87,7 @@ class HtmlPrettyPrinter (object):
        @type attrs: dict
        @return: None
        """
-        self._start_element(tag, attrs, u">")
+        self._start_element(tag, attrs, ">", element_text)

    def start_end_element (self, tag, attrs, element_text=None):
        """
@@ -111,9 +99,9 @@ class HtmlPrettyPrinter (object):
        @type attrs: dict
        @return: None
        """
-        self._start_element(tag, attrs, u"/>")
+        self._start_element(tag, attrs, "/>", element_text)

-    def _start_element (self, tag, attrs, end):
+    def _start_element (self, tag, attrs, end, element_text=None):
        """
        Print HTML element with end string.

@@ -125,13 +113,15 @@ class HtmlPrettyPrinter (object):
        @type end: string
        @return: None
        """
-        self.fd.write(u"<%s" % tag.replace("/", ""))
+        self.fd.write("<%s" % tag.replace("/", ""))
        for key, val in attrs.items():
            if val is None:
-                self.fd.write(u" %s" % key)
+                self.fd.write(" %s" % key)
            else:
-                self.fd.write(u' %s="%s"' % (key, quote_attrval(val)))
+                self.fd.write(' %s="%s"' % (key, quote_attrval(val)))
        self.fd.write(end)
+        if element_text:
+            self.fd.write(element_text)

    def end_element (self, tag):
        """
@@ -143,46 +133,6 @@ class HtmlPrettyPrinter (object):
        """
        self.fd.write("</%s>" % tag)

-    def doctype (self, data):
-        """
-        Print HTML document type.
-
-        @param data: the document type
-        @type data: string
-        @return: None
-        """
-        self.fd.write("<!DOCTYPE %s>" % data)
-
-    def pi (self, data):
-        """
-        Print HTML pi.
-
-        @param data: the tag data
-        @type data: string
-        @return: None
-        """
-        self.fd.write("<?%s?>" % data)
-
-    def cdata (self, data):
-        """
-        Print HTML cdata.
-
-        @param data: the character data
-        @type data: string
-        @return: None
-        """
-        self.fd.write("<![CDATA[%s]]>" % data)
-
-    def characters (self, data):
-        """
-        Print characters.
-
-        @param data: the character data
-        @type data: string
-        @return: None
-        """
-        self.fd.write(data)
-

 def quote_attrval (s):
    """
@@ -195,18 +145,14 @@ def quote_attrval (s):
    """
    res = []
    for c in s:
-        try:  # Python 2
-            ord_c = ord(c)
-        except TypeError:
-            ord_c = c
-        if ord_c <= 127:
+        if ord(c) <= 127:
            # ASCII
-            if c == u'&':
-                res.append(u"&amp;")
-            elif c == u'"':
-                res.append(u"&quot;")
+            if c == '&':
+                res.append("&amp;")
+            elif c == '"':
+                res.append("&quot;")
            else:
-                res.append(chr(ord_c))
+                res.append(c)
        else:
-            res.append(u"&#%d;" % ord_c)
-    return u"".join(res)
+            res.append("&#%d;" % ord(c))
+    return "".join(res)
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -20,10 +20,7 @@ Test html parsing.

 import linkcheck.HtmlParser.htmlsax

-try:
-    from cStringIO import StringIO
-except ImportError:
-    from io import StringIO
+from io import StringIO
 import unittest

 from parameterized import parameterized
@@ -38,7 +35,6 @@ parsetests = [
    ("""<a  b='c' >""", """<a b="c"></a>"""),
    ("""<a  b=c" >""", """<a b="c&quot;"></a>"""),
    ("""<a  b=c' >""", """<a b="c'"></a>"""),
-    ("""<a  b="c >""", """<a  b="c >"""),
    ("""<a  b="" >""", """<a b=""></a>"""),
    ("""<a  b='' >""", """<a b=""></a>"""),
    ("""<a  b=>""", """<a b=""></a>"""),
@@ -51,10 +47,8 @@ parsetests = [
    ("""<a  b ="c" >""", """<a b="c"></a>"""),
    ("""<a  b = "c" >""", """<a b="c"></a>"""),
    ("""<a >""", """<a></a>"""),
-    ("""< a>""", """< a>"""),
-    ("""< a >""", """< a >"""),
-    ("""<>""", """<>"""),
-    ("""< >""", """< >"""),
+    ("""<>""", """"""),
+    ("""< >""", """"""),
    ("""<aä>""", u"""<aä></aä>"""),
    ("""<a aä="b">""", u"""<a aä="b"></a>"""),
    ("""<a a="bä">""", u"""<a a="b&#228;"></a>"""),
@@ -64,7 +58,6 @@ parsetests = [
    ("""<a b="c" b="d" >""", """<a b="d"></a>"""),
    # reduce test
    ("""<a  b="c"><""", """<a b="c"><</a>"""),
-    ("""d>""", """d>"""),
    # numbers in tag
    ("""<h1>bla</h1>""", """<h1>bla</h1>"""),
    # more start tags
@@ -72,49 +65,10 @@ parsetests = [
    ("""<a  b=/c/></a><br>""", """<a b="/c/"></a><br/>"""),
    ("""<br/>""", """<br/>"""),
    ("""<a  b="50%"><br>""", """<a b="50%"><br/></a>"""),
-    # comments
-    ("""<!---->< 1>""", """<!--  -->< 1>"""),
-    ("""<!-- a - b -->< 2>""", """<!-- a - b -->< 2>"""),
-    ("""<!----->< 3>""", """<!-- - -->< 3>"""),
-    ("""<!------>< 4>""", """<!-- -- -->< 4>"""),
-    ("""<!------->< 5>""", """<!-- --- -->< 5>"""),
-    ("""<!-- -->< 7>""", """<!--  -->< 7>"""),
-    ("""<!---- />-->""", """<!-- -- /> -->"""),
-    ("""<!-- a-2 -->< 9>""", """<!-- a-2 -->< 9>"""),
-    ("""<!-- --- -->< 10>""", """<!-- --- -->< 10>"""),
-    ("""<!>""", """<!--  -->"""), # empty comment
-    # invalid comments
-    ("""<!-- -- >< 8>""", """<!--  -->< 8>"""),
-    ("""<!---- >< 6>""", """<!--  -->< 6>"""),
-    ("""<!- blubb ->""", """<!-- - blubb - -->"""),
-    ("""<! -- blubb -->""", """<!-- -- blubb -- -->"""),
-    ("""<!-- blubb -- >""", """<!-- blubb -->"""),
-    ("""<! blubb !>< a>""", """<!-- blubb ! -->< a>"""),
-    ("""<! blubb >< a>""", """<!-- blubb -->< a>"""),
-    # end tags
-    ("""</a>""", """"""),
-    ("""</ a>""", """"""),
-    ("""</ a >""", """"""),
-    ("""</a >""", """"""),
-    ("""< / a>""", """< / a>"""),
-    ("""< /a>""", """< /a>"""),
-    ("""</aä>""", """"""),
    # start and end tag (HTML doctype assumed)
    ("""<a/>""", """<a></a>"""),
    ("""<meta/>""", """<meta/>"""),
    ("""<MetA/>""", """<meta/>"""),
-    # declaration tags
-    ("""<!DOCtype adrbook SYSTEM "adrbook.dtd">""",
-     """<!DOCTYPE adrbook SYSTEM "adrbook.dtd">"""),
-    # misc
-    ("""<?xmL version="1.0" encoding="latin1"?>""",
-     """<?xmL version="1.0" encoding="latin1"?>"""),
-    # javascript
-    ("""<script >\n</script>""", """<script>\n</script>"""),
-    ("""<sCrIpt lang="a">bla </a> fasel</scripT>""",
-     """<script lang="a">bla </a> fasel</script>"""),
-    ("""<script ><!--bla//-->// </script >""",
-     """<script><!--bla//-->// </script>"""),
    # line continuation (Dr. Fun webpage)
    ("""<img bo\\\nrder=0 >""", """<img bo\\="" rder="0"/>"""),
    ("""<img align="mid\\\ndle">""", """<img align="mid\\\ndle"/>"""),
@@ -144,15 +98,9 @@ parsetests = [
    ("""<a  href="&#x6D;ailto:" >""", """<a href="mailto:"></a>"""),
    # note that \u8156 is not valid encoding and therefore gets removed
    ("""<a  href="&#8156;ailto:" >""", """<a href="&#8156;ailto:"></a>"""),
-    # non-ascii characters
-    ("""<Üzgür> fahr </langsamer> żżżżżż{""",
-     u"""<Üzgür> fahr  żżżżżż{"""),
    # mailto link
    ("""<a  href=mailto:calvin@LocalHost?subject=Hallo&to=michi>1</a>""",
     """<a href="mailto:calvin@LocalHost?subject=Hallo&amp;to=michi">1</a>"""),
-    # doctype XHTML
-    ("""<!DOCTYPe html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><MeTa a="b"/>""",
-     """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><meta a="b"/>"""),
    # meta tag with charset encoding
    ("""<meta http-equiv="content-type" content>""",
     """<meta content="" http-equiv="content-type"/>"""),
@@ -164,22 +112,13 @@ parsetests = [
     """<meta content="text/html; charset=iso8859-1" http-equiv="content-type"/>"""),
    ("""<meta http-equiv="content-type" content="text/html; charset=hulla">""",
     """<meta content="text/html; charset=hulla" http-equiv="content-type"/>"""),
-    # CDATA
-    ("""<![CDATA[<a>hallo</a>]]>""", """<![CDATA[<a>hallo</a>]]>"""),
    # missing > in end tag
    ("""</td <td  a="b" >""", """"""),
    ("""</td<td  a="b" >""", """"""),
    # missing beginning quote
    ("""<td a=b">""", """<td a="b&quot;"></td>"""),
    # stray < before start tag
-    ("""<0.<td  a="b" >""", """<0.<td a="b"></td>"""),
-    # stray < before end tag
-    ("""<0.</td >""", """<0."""),
-    # missing end quote (XXX TODO)
-    #("""<td a="b>\n""", """<td a="b">\n"""),
-    #("""<td a="b></td>\na""", """<td a="b"></td>\na"""),
-    #("""<a  b="c><a b="c>\n""", """<a b="c"><a b="c">\n"""),
-    #("""<td a="b c="d"></td>\n""", """<td a="b" c="d"></td>\n"""),
+    ("""<0.<td  a="b" >""", """<td a="b"></td>"""),
    # HTML5 tags
    ("""<audio  src=bla>""", """<audio src="bla"></audio>"""),
    ("""<button  formaction=bla>""", """<button formaction="bla"></button>"""),
@@ -192,13 +131,6 @@ parsetests = [
    ("""<a></a><b></b>""", """<a></a><b></b>"""),
 ]

-flushtests = [
-    ("<", "<"),
-    ("<a", "<a"),
-    ("<!a", "<!a"),
-    ("<?a", "<?a"),
-]
-

 class TestParser (unittest.TestCase):
    """
@@ -270,15 +202,6 @@ class TestParser (unittest.TestCase):
            self.htmlparser2.feed(c)
        self.assertEqual(out.getvalue(), out2.getvalue())

-    @parameterized.expand(flushtests)
-    def test_flush (self, _in, _out):
-        # Test parser flushing.
-        out = StringIO()
-        handler = HtmlPrettyPrinter(out)
-        self.htmlparser.handler = handler
-        self.htmlparser.feed(_in)
-        self.check_results(self.htmlparser, _in, _out, out)
-
    def test_encoding_detection_utf_content (self):
        html = b'<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
        self.encoding_test(html, "utf-8")