mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-17 06:20:27 +00:00
html parser: use name instead of peeking
This commit is contained in:
parent
51a06d8a1e
commit
d6d48b4814
3 changed files with 5 additions and 26 deletions
|
|
@ -89,7 +89,7 @@ class HtmlPrettyPrinter (object):
|
|||
"""
|
||||
self.fd.write("<!--%s-->" % data)
|
||||
|
||||
def start_element (self, tag, attrs):
|
||||
def start_element (self, tag, attrs, element_text=None):
|
||||
"""
|
||||
Print HTML start element.
|
||||
|
||||
|
|
|
|||
|
|
@ -23,8 +23,6 @@ from .. import strformat, log, LOG_CHECK, url as urlutil
|
|||
from . import linkname
|
||||
from builtins import str as str_text
|
||||
|
||||
MAX_NAMELEN = 256
|
||||
|
||||
unquote = strformat.unquote
|
||||
|
||||
# HTML4/5 link tags
|
||||
|
|
@ -130,7 +128,7 @@ class MetaRobotsFinder (TagFinder):
|
|||
log.debug(LOG_CHECK, "meta robots finder")
|
||||
self.follow = self.index = True
|
||||
|
||||
def start_element (self, tag, attrs):
|
||||
def start_element (self, tag, attrs, element_text=None):
|
||||
"""Search for meta robots.txt "nofollow" and "noindex" flags."""
|
||||
if tag == 'meta' and attrs.get('name') == 'robots':
|
||||
val = attrs.get_true('content', u'').lower().split(u',')
|
||||
|
|
@ -180,7 +178,7 @@ class LinkFinder (TagFinder):
|
|||
self.tags[tag].update(self.universal_attrs)
|
||||
self.base_ref = u''
|
||||
|
||||
def start_element (self, tag, attrs):
|
||||
def start_element (self, tag, attrs, element_text=None):
|
||||
"""Search for links and store found URLs in a list."""
|
||||
log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
|
||||
log.debug(LOG_CHECK, "line %d col %d old line %d old col %d", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column())
|
||||
|
|
@ -194,7 +192,7 @@ class LinkFinder (TagFinder):
|
|||
if tag == "form" and not is_form_get(attr, attrs):
|
||||
continue
|
||||
# name of this link
|
||||
name = self.get_link_name(tag, attrs, attr)
|
||||
name = self.get_link_name(tag, attrs, attr, element_text)
|
||||
# possible codebase
|
||||
base = u''
|
||||
if tag == 'applet':
|
||||
|
|
@ -211,13 +209,9 @@ class LinkFinder (TagFinder):
|
|||
self.parse_tag(tag, attr, value, name, base)
|
||||
log.debug(LOG_CHECK, "LinkFinder finished tag %s", tag)
|
||||
|
||||
def get_link_name (self, tag, attrs, attr):
|
||||
def get_link_name (self, tag, attrs, attr, name=None):
|
||||
"""Parse attrs for link name. Return name of link."""
|
||||
if tag == 'a' and attr == 'href':
|
||||
# Look for name only up to MAX_NAMELEN characters
|
||||
data = self.parser.peek(MAX_NAMELEN)
|
||||
data = data.decode(self.parser.encoding, "ignore")
|
||||
name = linkname.href_name(data)
|
||||
if not name:
|
||||
name = attrs.get_true('title', u'')
|
||||
elif tag == 'img':
|
||||
|
|
|
|||
|
|
@ -282,21 +282,6 @@ class TestParser (unittest.TestCase):
|
|||
self.assertEqual(resolve("&#%d;" % ord(c)), c)
|
||||
self.assertEqual(resolve("&#1114112;"), u"")
|
||||
|
||||
def test_peek (self):
|
||||
# Test peek() parser function
|
||||
data = '<a href="test.html">name</a>'
|
||||
|
||||
class NamePeeker (object):
|
||||
|
||||
def start_element (self_handler, tag, attrs):
|
||||
# use self reference of TestParser instance
|
||||
self.assertRaises(TypeError, self.htmlparser.peek, -1)
|
||||
self.assertEqual(self.htmlparser.peek(0), "")
|
||||
self.assertEqual(self.htmlparser.peek(4), "name")
|
||||
|
||||
self.htmlparser.handler = NamePeeker()
|
||||
self.htmlparser.feed(data)
|
||||
|
||||
def test_encoding_detection (self):
|
||||
html = '<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
|
||||
self.encoding_test(html, "utf-8")
|
||||
|
|
|
|||
Loading…
Reference in a new issue