html parser: use name instead of peeking

This commit is contained in:
Petr Dlouhý 2019-07-22 19:59:37 +01:00 committed by Chris Mayo
parent 51a06d8a1e
commit d6d48b4814
3 changed files with 5 additions and 26 deletions

View file

@@ -89,7 +89,7 @@ class HtmlPrettyPrinter (object):
"""
self.fd.write("<!--%s-->" % data)
def start_element (self, tag, attrs):
def start_element (self, tag, attrs, element_text=None):
"""
Print HTML start element.

View file

@@ -23,8 +23,6 @@ from .. import strformat, log, LOG_CHECK, url as urlutil
from . import linkname
from builtins import str as str_text
MAX_NAMELEN = 256
unquote = strformat.unquote
# HTML4/5 link tags
@@ -130,7 +128,7 @@ class MetaRobotsFinder (TagFinder):
log.debug(LOG_CHECK, "meta robots finder")
self.follow = self.index = True
def start_element (self, tag, attrs):
def start_element (self, tag, attrs, element_text=None):
"""Search for meta robots.txt "nofollow" and "noindex" flags."""
if tag == 'meta' and attrs.get('name') == 'robots':
val = attrs.get_true('content', u'').lower().split(u',')
@@ -180,7 +178,7 @@ class LinkFinder (TagFinder):
self.tags[tag].update(self.universal_attrs)
self.base_ref = u''
def start_element (self, tag, attrs):
def start_element (self, tag, attrs, element_text=None):
"""Search for links and store found URLs in a list."""
log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
log.debug(LOG_CHECK, "line %d col %d old line %d old col %d", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column())
@@ -194,7 +192,7 @@ class LinkFinder (TagFinder):
if tag == "form" and not is_form_get(attr, attrs):
continue
# name of this link
name = self.get_link_name(tag, attrs, attr)
name = self.get_link_name(tag, attrs, attr, element_text)
# possible codebase
base = u''
if tag == 'applet':
@@ -211,13 +209,9 @@ class LinkFinder (TagFinder):
self.parse_tag(tag, attr, value, name, base)
log.debug(LOG_CHECK, "LinkFinder finished tag %s", tag)
def get_link_name (self, tag, attrs, attr):
def get_link_name (self, tag, attrs, attr, name=None):
"""Parse attrs for link name. Return name of link."""
if tag == 'a' and attr == 'href':
# Look for name only up to MAX_NAMELEN characters
data = self.parser.peek(MAX_NAMELEN)
data = data.decode(self.parser.encoding, "ignore")
name = linkname.href_name(data)
if not name:
name = attrs.get_true('title', u'')
elif tag == 'img':

View file

@@ -282,21 +282,6 @@ class TestParser (unittest.TestCase):
self.assertEqual(resolve("&#%d;" % ord(c)), c)
self.assertEqual(resolve("&#1114112;"), u"")
def test_peek (self):
# Test peek() parser function
data = '<a href="test.html">name</a>'
class NamePeeker (object):
def start_element (self_handler, tag, attrs):
# use self reference of TestParser instance
self.assertRaises(TypeError, self.htmlparser.peek, -1)
self.assertEqual(self.htmlparser.peek(0), "")
self.assertEqual(self.htmlparser.peek(4), "name")
self.htmlparser.handler = NamePeeker()
self.htmlparser.feed(data)
def test_encoding_detection (self):
html = '<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
self.encoding_test(html, "utf-8")