Replace MetaRobotsFinder using BeautifulSoup.find()

This commit is contained in:
Chris Mayo 2020-04-29 20:07:00 +01:00
parent 350f8bfef9
commit 4ffdbf2406
3 changed files with 9 additions and 36 deletions

View file

@@ -26,11 +26,12 @@ import warnings
warnings.simplefilter('ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning)
from io import BytesIO
import re
from .. import (log, LOG_CHECK, strformat, mimeutil,
url as urlutil, LinkCheckerError, httputil)
from . import (internpaturl, proxysupport)
from ..htmlutil import htmlsoup, linkparse
from ..htmlutil import htmlsoup
# import warnings
from .const import WARN_HTTP_EMPTY_CONTENT
from requests.sessions import REDIRECT_STATI
@@ -42,6 +43,9 @@ HTTP_SCHEMAS = ('http://', 'https://')
# helper alias
unicode_safe = strformat.unicode_safe
# match for robots meta element content attribute
nofollow_re = re.compile(r"\bnofollow\b", re.IGNORECASE)
class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
Url link with http scheme.
@@ -78,15 +82,9 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
if not self.is_html():
return True
# construct handler object
handler = linkparse.MetaRobotsFinder()
# parse
try:
htmlsoup.process_soup(handler, self.get_soup())
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
pass
return handler.follow
soup = self.get_soup()
return not soup.find("meta", attrs={"name": "robots", "content": nofollow_re})
def add_size_info (self):
"""Get size of URL content from HTTP header."""

View file

@@ -98,11 +98,6 @@ def strip_c_comments (text):
return c_comment_re.sub('', text)
class StopParse(Exception):
"""Raised when parsing should stop."""
pass
class TagFinder (object):
"""Base class handling HTML start elements.
TagFinder instances are used as HTML parser handlers."""
@@ -116,26 +111,6 @@ class TagFinder (object):
pass
class MetaRobotsFinder (TagFinder):
"""Class for finding robots.txt meta values in HTML."""
def __init__ (self):
"""Initialize follow and index flags."""
super(MetaRobotsFinder, self).__init__()
log.debug(LOG_CHECK, "meta robots finder")
self.follow = self.index = True
def start_element (self, tag, attrs, element_text, lineno, column):
"""Search for meta robots.txt "nofollow" and "noindex" flags."""
if tag == 'meta' and attrs.get('name') == 'robots':
val = attrs.get('content', u'').lower().split(u',')
self.follow = u'nofollow' not in val
self.index = u'noindex' not in val
raise StopParse("found <meta name=robots> tag")
elif tag == 'body':
raise StopParse("found <body> tag")
def is_meta_url (attr, attrs):
"""Check if the meta attributes contain a URL."""
res = False

View file

@@ -1,2 +1,2 @@
<meta name="robots" content="nofollow">
<meta name="robots" content="noindex, Nofollow">
<a href="do_not_check.html">bla</a>