Merge pull request #364 from cjmayo/parser5

Stop using HTML handlers and improve login form error handling
This commit is contained in:
anarcat 2020-04-30 09:28:48 -04:00 committed by GitHub
commit ab476fa4bf
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 90 additions and 153 deletions

View file

@ -26,11 +26,12 @@ import warnings
warnings.simplefilter('ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning)
from io import BytesIO
import re
from .. import (log, LOG_CHECK, strformat, mimeutil,
url as urlutil, LinkCheckerError, httputil)
from . import (internpaturl, proxysupport)
from ..htmlutil import htmlsoup, linkparse
from ..htmlutil import htmlsoup
# import warnings
from .const import WARN_HTTP_EMPTY_CONTENT
from requests.sessions import REDIRECT_STATI
@ -42,6 +43,9 @@ HTTP_SCHEMAS = ('http://', 'https://')
# helper alias
unicode_safe = strformat.unicode_safe
# match for robots meta element content attribute
nofollow_re = re.compile(r"\bnofollow\b", re.IGNORECASE)
class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
Url link with http scheme.
@ -78,15 +82,9 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
if not self.is_html():
return True
# construct handler object
handler = linkparse.MetaRobotsFinder()
# parse
try:
htmlsoup.process_soup(handler, self.get_soup())
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
pass
return handler.follow
soup = self.get_soup()
return not soup.find("meta", attrs={"name": "robots", "content": nofollow_re})
def add_size_info (self):
"""Get size of URL content from HTTP header."""

View file

@ -15,35 +15,6 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
HTML parser implemented using Beautiful Soup and html.parser.
USAGE
Two functions are provided, one to make a BeautifulSoup object from markup and
another to call a handler's callbacks for each element in a BeautifulSoup
object it can process.
The used callback of a handler is:
- Start tag: <tag {attr1:value1, attr2:value2, ..}>
def start_element (tag, attrs, text, line, column)
@param tag: tag name
@type tag: string
@param attrs: tag attributes
@type attrs: dict
@param text: element text
@type text: string
@param line: tag line number
@type line: integer
@param column: tag column number
@type column: integer
EXAMPLE
# Create a new BeautifulSoup object.
soup = htmlutil.htmlsoup.make_soup("<html><body>Blubb</body></html>")
# Process the soup with the chosen handler as a parameter.
htmlutil.htmlsoup.process_soup(handler, soup)
"""
from warnings import filterwarnings
@ -58,10 +29,3 @@ from bs4 import BeautifulSoup
def make_soup(markup, from_encoding=None):
    """Build a BeautifulSoup object from *markup* using the html.parser backend.

    @param markup: HTML markup to parse
    @param from_encoding: optional encoding of the markup, passed through
        to BeautifulSoup
    @return: parsed BeautifulSoup document

    Note: multi_valued_attributes=None keeps attributes such as ``class``
    as plain strings instead of lists.
    """
    options = {
        "from_encoding": from_encoding,
        "multi_valued_attributes": None,
    }
    return BeautifulSoup(markup, "html.parser", **options)
def process_soup(handler, soup):
for element in soup.find_all(True):
handler.start_element(
element.name, element.attrs, element.text.strip(),
element.sourceline,
None if element.sourcepos is None else element.sourcepos + 1)

View file

@ -30,17 +30,17 @@ LinkTags = {
'a': [u'href'],
'applet': [u'archive', u'src'],
'area': [u'href'],
'audio': [u'src'], # HTML5
'audio': [u'src'], # HTML5
'bgsound': [u'src'],
'blockquote': [u'cite'],
'body': [u'background'],
'button': [u'formaction'], # HTML5
'button': [u'formaction'], # HTML5
'del': [u'cite'],
'embed': [u'pluginspage', u'src'],
'form': [u'action'],
'frame': [u'src', u'longdesc'],
'head': [u'profile'],
'html': [u'manifest'], # HTML5
'html': [u'manifest'], # HTML5
'iframe': [u'src', u'longdesc'],
'ilayer': [u'background'],
'img': [u'src', u'lowsrc', u'longdesc', u'usemap', u'srcset'],
@ -53,13 +53,13 @@ LinkTags = {
'object': [u'classid', u'data', u'archive', u'usemap', u'codebase'],
'q': [u'cite'],
'script': [u'src'],
'source': [u'src'], # HTML5
'source': [u'src'], # HTML5
'table': [u'background'],
'td': [u'background'],
'th': [u'background'],
'tr': [u'background'],
'track': [u'src'], # HTML5
'video': [u'src'], # HTML5
'track': [u'src'], # HTML5
'video': [u'src'], # HTML5
'xmp': [u'href'],
None: [u'style', u'itemtype'],
}
@ -98,44 +98,6 @@ def strip_c_comments (text):
return c_comment_re.sub('', text)
class StopParse(Exception):
    """Raised when parsing should stop early (e.g. once the sought
    element has been found or can no longer occur)."""
    # NOTE: the redundant `pass` after the docstring was removed; the
    # docstring alone is a complete class body.
class TagFinder(object):
    """Base class handling HTML start elements.

    TagFinder instances are used as HTML parser handlers: subclasses
    override start_element() to react to each parsed tag."""

    def __init__(self):
        """Initialize local variables."""
        # Use the zero-argument py3 super() form instead of the legacy
        # super(TagFinder, self) spelling.
        super().__init__()

    def start_element(self, tag, attrs, element_text, lineno, column):
        """Does nothing, override in a subclass.

        @param tag: tag name
        @param attrs: tag attributes (dict)
        @param element_text: element text
        @param lineno: tag line number
        @param column: tag column number (may be None)
        """
        pass
class MetaRobotsFinder(TagFinder):
    """Class for finding robots meta directives in HTML.

    After parsing, self.follow / self.index reflect the absence of the
    "nofollow" / "noindex" directives in <meta name="robots">."""

    def __init__(self):
        """Initialize follow and index flags."""
        super(MetaRobotsFinder, self).__init__()
        log.debug(LOG_CHECK, "meta robots finder")
        self.follow = self.index = True

    def start_element(self, tag, attrs, element_text, lineno, column):
        """Search for meta robots "nofollow" and "noindex" flags."""
        if tag == 'meta' and attrs.get('name') == 'robots':
            # The content attribute is a comma-separated list of directives
            # whose surrounding whitespace is insignificant, so each token
            # must be stripped.  Without the strip, a common value like
            # "noindex, nofollow" split to [' nofollow'] and the nofollow
            # directive was silently missed.
            val = [v.strip() for v in
                   attrs.get('content', u'').lower().split(u',')]
            self.follow = u'nofollow' not in val
            self.index = u'noindex' not in val
            raise StopParse("found <meta name=robots> tag")
        elif tag == 'body':
            # Meta elements only occur in <head>; stop once <body> starts.
            raise StopParse("found <body> tag")
def is_meta_url (attr, attrs):
"""Check if the meta attributes contain a URL."""
res = False
@ -158,24 +120,23 @@ def is_form_get(attr, attrs):
return res
class LinkFinder (TagFinder):
class LinkFinder:
"""Find HTML links, and apply them to the callback function with the
format (url, lineno, column, name, codebase)."""
def __init__ (self, callback, tags):
"""Store content in buffer and initialize URL list."""
super(LinkFinder, self).__init__()
self.callback = callback
# set universal tag attributes using tagname None
self.universal_attrs = set(tags.get(None, []))
self.tags = dict()
for tag, attrs in tags.items():
for tag, attrs in tags.items():
self.tags[tag] = set(attrs)
# add universal tag attributes
self.tags[tag].update(self.universal_attrs)
self.base_ref = u''
def start_element (self, tag, attrs, element_text, lineno, column):
def html_element (self, tag, attrs, element_text, lineno, column):
"""Search for links and store found URLs in a list."""
log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
log.debug(LOG_CHECK, "line %d col %d", lineno, column)
@ -192,7 +153,7 @@ class LinkFinder (TagFinder):
name = self.get_link_name(tag, attrs, attr, element_text)
# possible codebase
base = u''
if tag == 'applet':
if tag == 'applet':
base = attrs.get('codebase', u'')
if not base:
base = self.base_ref
@ -251,3 +212,15 @@ class LinkFinder (TagFinder):
"""Add newly found URL to queue."""
assert isinstance(url, str_text) or url is None, repr(url)
self.callback(url, line=lineno, column=column, name=name, base=base)
def find_links(soup, callback, tags):
    """Parse content and search for URLs to check.

    When a URL is found it is passed to the supplied callback.

    @param soup: BeautifulSoup document to scan
    @param callback: function called as
        callback(url, line=..., column=..., name=..., base=...)
    @param tags: mapping of tag name -> link attribute names
    """
    finder = LinkFinder(callback, tags)
    for tag in soup.find_all(True):
        # sourcepos is zero-based; report one-based columns when known.
        pos = tag.sourcepos
        column = pos + 1 if pos is not None else None
        finder.html_element(
            tag.name, tag.attrs, tag.text.strip(), tag.sourceline, column)

View file

@ -18,7 +18,7 @@
Main functions for link parsing
"""
from .. import log, LOG_CHECK, strformat, url as urlutil
from ..htmlutil import htmlsoup, linkparse
from ..htmlutil import linkparse
from ..bookmarks import firefox
@ -46,7 +46,7 @@ def parse_html (url_data):
"""Parse into HTML content and search for URLs to check.
Found URLs are added to the URL queue.
"""
find_links(url_data, url_data.add_url, linkparse.LinkTags)
linkparse.find_links(url_data.get_soup(), url_data.add_url, linkparse.LinkTags)
def parse_opera (url_data):
@ -112,15 +112,7 @@ def parse_wml (url_data):
"""Parse into WML content and search for URLs to check.
Found URLs are added to the URL queue.
"""
find_links(url_data, url_data.add_url, linkparse.WmlTags)
def find_links (url_data, callback, tags):
"""Parse into content and search for URLs to check.
Found URLs are added to the URL queue.
"""
handler = linkparse.LinkFinder(callback, tags)
htmlsoup.process_soup(handler, url_data.get_soup())
linkparse.find_links(url_data.get_soup(), url_data.add_url, linkparse.WmlTags)
def parse_firefox (url_data):

View file

@ -22,7 +22,6 @@ from urllib import parse
from . import _ContentPlugin
from .. import log, LOG_PLUGIN
from ..htmlutil import linkparse
from ..parser import find_links
class AnchorCheck(_ContentPlugin):
@ -37,7 +36,8 @@ class AnchorCheck(_ContentPlugin):
log.debug(LOG_PLUGIN, "checking content for invalid anchors")
# list of parsed anchors
self.anchors = []
find_links(url_data, self.add_anchor, linkparse.AnchorTags)
linkparse.find_links(url_data.get_soup(), self.add_anchor,
linkparse.AnchorTags)
self.check_anchor(url_data)
def add_anchor (self, url, line, column, name, base):

View file

@ -1,2 +1,2 @@
<meta name="robots" content="nofollow">
<meta name="robots" content="noindex, Nofollow">
<a href="do_not_check.html">bla</a>

View file

@ -16,10 +16,19 @@
"""
Test that <meta name="robots" content="nofollow"> is respected when using http
and ignored when checking a local file.
Also test different values of the content attribute are correctly matched.
"""
import unittest
import linkcheck.configuration
import linkcheck.director
from linkcheck.htmlutil.htmlsoup import make_soup
from . import get_url_from
from . import LinkCheckTest
from .httpserver import HttpServerTest
class TestHttpMetaRobots(HttpServerTest):
"""Test <meta name="robots" content="nofollow"> using http."""
@ -33,6 +42,7 @@ class TestHttpMetaRobots(HttpServerTest):
]
self.direct(url, resultlines, recursionlevel=1)
class TestFileMetaRobots(LinkCheckTest):
"""Test <meta name="robots" content="nofollow"> from a file."""
@ -52,3 +62,23 @@ class TestFileMetaRobots(LinkCheckTest):
"error"
]
self.direct(url, resultlines, recursionlevel=1)
class TestMetaRobotsVariants(unittest.TestCase):
    """Test different values of the robots meta directive content attribute."""

    def test_nofollow_variants(self):
        # Build a minimal url_data object and feed it hand-made soups so
        # content_allows_robots() can be exercised without any network I/O.
        config = linkcheck.configuration.Configuration()
        aggregate = linkcheck.director.get_aggregate(config)
        url_data = get_url_from("http://example.org", 0, aggregate)
        url_data.content_type = "text/html"
        # Both plain and comma-separated, mixed-case directives must match.
        denied = (
            '<meta name="robots" content="nofollow">',
            '<meta name="robots" content="nocache, Nofollow, noimageindex">',
        )
        for markup in denied:
            url_data.soup = make_soup(markup)
            self.assertFalse(url_data.content_allows_robots())
        # "noindex" alone must not block following.
        url_data.soup = make_soup('<meta name="robots" content="noindex, follow">')
        self.assertTrue(url_data.content_allows_robots())

View file

@ -15,50 +15,34 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
HTML parser handler test class.
HTML parser test function.
"""
import sys
class HtmlPrettyPrinter:
def pretty_print_html(fd, soup):
"""
Print out all parsed HTML data in encoded form.
Also stores error and warnings messages.
Print out all parsed HTML data,
writing to the given file descriptor.
@param fd: file like object
@type fd: file
@param soup: BeautifulSoup object
@type soup: BeautifulSoup
"""
for element in soup.find_all(True):
tag = element.name
element_text = element.text.strip()
def __init__ (self, fd=sys.stdout, encoding="iso8859-1"):
"""
Write to given file descriptor in given encoding.
@param fd: file like object (default=sys.stdout)
@type fd: file
@param encoding: encoding (default=iso8859-1)
@type encoding: string
"""
self.fd = fd
self.encoding = encoding
def start_element (self, tag, attrs, element_text, lineno, column):
"""
Print HTML start element.
@param tag: tag name
@type tag: string
@param attrs: tag attributes
@type attrs: dict
@return: None
"""
self.fd.write("<%s" % tag.replace("/", ""))
for key, val in sorted(attrs.items()):
fd.write("<%s" % tag.replace("/", ""))
for key, val in sorted(element.attrs.items()):
if val is None:
self.fd.write(" %s" % key)
fd.write(" %s" % key)
else:
self.fd.write(' %s="%s"' % (key, quote_attrval(val)))
fd.write(' %s="%s"' % (key, quote_attrval(val)))
if element_text:
self.fd.write(">%s</%s>" % (element_text, tag))
fd.write(">%s</%s>" % (element_text, tag))
else:
self.fd.write("/>")
fd.write("/>")
def quote_attrval (s):

View file

@ -29,8 +29,8 @@ class TestLinkparser (unittest.TestCase):
def _test_one_link (self, content, url):
self.count_url = 0
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
htmlsoup.process_soup(h, htmlsoup.make_soup(content))
linkparse.find_links(htmlsoup.make_soup(content),
self._test_one_url(url), linkparse.LinkTags)
self.assertEqual(self.count_url, 1)
def _test_one_url (self, origurl):
@ -43,8 +43,8 @@ class TestLinkparser (unittest.TestCase):
def _test_no_link (self, content):
def callback (url, line, column, name, base):
self.assertTrue(False, 'URL %r found' % url)
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
htmlsoup.process_soup(h, htmlsoup.make_soup(content))
linkparse.find_links(htmlsoup.make_soup(content), callback,
linkparse.LinkTags)
def test_href_parsing (self):
# Test <a href> parsing.

View file

@ -25,7 +25,7 @@ import unittest
from parameterized import parameterized
from .htmllib import HtmlPrettyPrinter
from .htmllib import pretty_print_html
# list of tuples
# (<test pattern>, <expected parse output>)
@ -142,8 +142,7 @@ class TestParser (unittest.TestCase):
def test_parse (self, _in, _out):
# Parse all test patterns in one go.
out = StringIO()
handler = HtmlPrettyPrinter(out)
htmlsoup.process_soup(handler, htmlsoup.make_soup(_in))
pretty_print_html(out, htmlsoup.make_soup(_in))
self.check_results(_in, _out, out)
def check_results (self, _in, _out, out):
@ -180,8 +179,5 @@ class TestParser (unittest.TestCase):
self.encoding_test(html, "ascii")
def encoding_test (self, html, expected):
out = StringIO()
handler = HtmlPrettyPrinter(out)
soup = htmlsoup.make_soup(html)
htmlsoup.process_soup(handler, soup)
self.assertEqual(soup.original_encoding, expected)