html parser: use name instead of peeking

2026-05-07 22:24:45 +00:00 · 2019-07-22 19:59:37 +01:00 · 2019-07-22 19:59:37 +01:00 · d6d48b4814
commit d6d48b4814
parent 51a06d8a1e
3 changed files with 5 additions and 26 deletions
--- a/linkcheck/HtmlParser/htmllib.py
+++ b/linkcheck/HtmlParser/htmllib.py
@@ -89,7 +89,7 @@ class HtmlPrettyPrinter (object):
        """
        self.fd.write("<!--%s-->" % data)

-    def start_element (self, tag, attrs):
+    def start_element (self, tag, attrs, element_text=None):
        """
        Print HTML start element.

--- a/linkcheck/htmlutil/linkparse.py
+++ b/linkcheck/htmlutil/linkparse.py
@@ -23,8 +23,6 @@ from .. import strformat, log, LOG_CHECK, url as urlutil
 from . import linkname
 from builtins import str as str_text

-MAX_NAMELEN = 256
-
 unquote = strformat.unquote

 # HTML4/5 link tags
@@ -130,7 +128,7 @@ class MetaRobotsFinder (TagFinder):
        log.debug(LOG_CHECK, "meta robots finder")
        self.follow = self.index = True

-    def start_element (self, tag, attrs):
+    def start_element (self, tag, attrs, element_text=None):
        """Search for meta robots.txt "nofollow" and "noindex" flags."""
        if tag == 'meta' and attrs.get('name') == 'robots':
            val = attrs.get_true('content', u'').lower().split(u',')
@@ -180,7 +178,7 @@ class LinkFinder (TagFinder):
            self.tags[tag].update(self.universal_attrs)
        self.base_ref = u''

-    def start_element (self, tag, attrs):
+    def start_element (self, tag, attrs, element_text=None):
        """Search for links and store found URLs in a list."""
        log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
        log.debug(LOG_CHECK, "line %d col %d old line %d old col %d", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column())
@@ -194,7 +192,7 @@ class LinkFinder (TagFinder):
            if tag == "form" and not is_form_get(attr, attrs):
                continue
            # name of this link
-            name = self.get_link_name(tag, attrs, attr)
+            name = self.get_link_name(tag, attrs, attr, element_text)
            # possible codebase
            base = u''
            if tag  == 'applet':
@@ -211,13 +209,9 @@ class LinkFinder (TagFinder):
            self.parse_tag(tag, attr, value, name, base)
        log.debug(LOG_CHECK, "LinkFinder finished tag %s", tag)

-    def get_link_name (self, tag, attrs, attr):
+    def get_link_name (self, tag, attrs, attr, name=None):
        """Parse attrs for link name. Return name of link."""
        if tag == 'a' and attr == 'href':
-            # Look for name only up to MAX_NAMELEN characters
-            data = self.parser.peek(MAX_NAMELEN)
-            data = data.decode(self.parser.encoding, "ignore")
-            name = linkname.href_name(data)
            if not name:
                name = attrs.get_true('title', u'')
        elif tag == 'img':
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -282,21 +282,6 @@ class TestParser (unittest.TestCase):
            self.assertEqual(resolve("&#%d;" % ord(c)), c)
        self.assertEqual(resolve("&#1114112;"), u"")

-    def test_peek (self):
-        # Test peek() parser function
-        data = '<a href="test.html">name</a>'
-
-        class NamePeeker (object):
-
-            def start_element (self_handler, tag, attrs):
-                # use self reference of TestParser instance
-                self.assertRaises(TypeError, self.htmlparser.peek, -1)
-                self.assertEqual(self.htmlparser.peek(0), "")
-                self.assertEqual(self.htmlparser.peek(4), "name")
-
-        self.htmlparser.handler = NamePeeker()
-        self.htmlparser.feed(data)
-
    def test_encoding_detection (self):
        html = '<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
        self.encoding_test(html, "utf-8")