diff --git a/linkcheck/HtmlParser/htmllib.py b/linkcheck/HtmlParser/htmllib.py index 75c6a4ec..0a8980e3 100644 --- a/linkcheck/HtmlParser/htmllib.py +++ b/linkcheck/HtmlParser/htmllib.py @@ -89,7 +89,7 @@ class HtmlPrettyPrinter (object): """ self.fd.write("" % data) - def start_element (self, tag, attrs): + def start_element (self, tag, attrs, element_text=None): """ Print HTML start element. diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py index f2c2909d..20d45a38 100644 --- a/linkcheck/htmlutil/linkparse.py +++ b/linkcheck/htmlutil/linkparse.py @@ -23,8 +23,6 @@ from .. import strformat, log, LOG_CHECK, url as urlutil from . import linkname from builtins import str as str_text -MAX_NAMELEN = 256 - unquote = strformat.unquote # HTML4/5 link tags @@ -130,7 +128,7 @@ class MetaRobotsFinder (TagFinder): log.debug(LOG_CHECK, "meta robots finder") self.follow = self.index = True - def start_element (self, tag, attrs): + def start_element (self, tag, attrs, element_text=None): """Search for meta robots.txt "nofollow" and "noindex" flags.""" if tag == 'meta' and attrs.get('name') == 'robots': val = attrs.get_true('content', u'').lower().split(u',') @@ -180,7 +178,7 @@ class LinkFinder (TagFinder): self.tags[tag].update(self.universal_attrs) self.base_ref = u'' - def start_element (self, tag, attrs): + def start_element (self, tag, attrs, element_text=None): """Search for links and store found URLs in a list.""" log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs) log.debug(LOG_CHECK, "line %d col %d old line %d old col %d", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column()) @@ -194,7 +192,7 @@ class LinkFinder (TagFinder): if tag == "form" and not is_form_get(attr, attrs): continue # name of this link - name = self.get_link_name(tag, attrs, attr) + name = self.get_link_name(tag, attrs, attr, element_text) # possible codebase base = u'' if tag == 'applet': @@ -211,13 +209,9 @@ class LinkFinder (TagFinder): self.parse_tag(tag, attr, value, name, base) log.debug(LOG_CHECK, "LinkFinder finished tag %s", tag) - def get_link_name (self, tag, attrs, attr): + def get_link_name (self, tag, attrs, attr, name=None): """Parse attrs for link name. Return name of link.""" if tag == 'a' and attr == 'href': - # Look for name only up to MAX_NAMELEN characters - data = self.parser.peek(MAX_NAMELEN) - data = data.decode(self.parser.encoding, "ignore") - name = linkname.href_name(data) if not name: name = attrs.get_true('title', u'') elif tag == 'img': diff --git a/tests/test_parser.py b/tests/test_parser.py index 49b40f0d..0851b3c6 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -282,21 +282,6 @@ class TestParser (unittest.TestCase): self.assertEqual(resolve("&#%d;" % ord(c)), c) self.assertEqual(resolve("�"), u"") - def test_peek (self): - # Test peek() parser function - data = 'name' - - class NamePeeker (object): - - def start_element (self_handler, tag, attrs): - # use self reference of TestParser instance - self.assertRaises(TypeError, self.htmlparser.peek, -1) - self.assertEqual(self.htmlparser.peek(0), "") - self.assertEqual(self.htmlparser.peek(4), "name") - - self.htmlparser.handler = NamePeeker() - self.htmlparser.feed(data) - def test_encoding_detection (self): html = '' self.encoding_test(html, "utf-8")