From 4ffdbf24068eb35ba0eb2fb243b67fe10bef95e4 Mon Sep 17 00:00:00 2001
From: Chris Mayo <aklhfex@gmail.com>
Date: Wed, 29 Apr 2020 20:07:00 +0100
Subject: [PATCH] Replace MetaRobotsFinder using BeautifulSoup.find()

---
 linkcheck/checker/httpurl.py     | 18 ++++++++----------
 linkcheck/htmlutil/linkparse.py  | 25 -------------------------
 tests/checker/data/norobots.html |  2 +-
 3 files changed, 9 insertions(+), 36 deletions(-)
diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py
index e891402e..2752e3a8 100644
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -26,11 +26,12 @@ import warnings
 warnings.simplefilter('ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning)
 
 from io import BytesIO
+import re
 
 from .. import (log, LOG_CHECK, strformat, mimeutil,
     url as urlutil, LinkCheckerError, httputil)
 from . import (internpaturl, proxysupport)
-from ..htmlutil import htmlsoup, linkparse
+from ..htmlutil import htmlsoup
 # import warnings
 from .const import WARN_HTTP_EMPTY_CONTENT
 from requests.sessions import REDIRECT_STATI
@@ -42,6 +43,9 @@ HTTP_SCHEMAS = ('http://', 'https://')
 # helper alias
 unicode_safe = strformat.unicode_safe
 
+# matches a "nofollow" directive in a robots meta element's content attribute
+nofollow_re = re.compile(r"\bnofollow\b", re.IGNORECASE)
+
 class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
     """
     Url link with http scheme.
@@ -78,15 +82,9 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         """
         if not self.is_html():
             return True
-        # construct handler object
-        handler = linkparse.MetaRobotsFinder()
-        # parse
-        try:
-            htmlsoup.process_soup(handler, self.get_soup())
-        except linkparse.StopParse as msg:
-            log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
-            pass
-        return handler.follow
+
+        soup = self.get_soup()
+        return not soup.find("meta", attrs={"name": "robots", "content": nofollow_re})
 
     def add_size_info (self):
         """Get size of URL content from HTTP header."""
diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py
index 0f5bc7ef..968f5b20 100644
--- a/linkcheck/htmlutil/linkparse.py
+++ b/linkcheck/htmlutil/linkparse.py
@@ -98,11 +98,6 @@ def strip_c_comments (text):
     return c_comment_re.sub('', text)
 
 
-class StopParse(Exception):
-    """Raised when parsing should stop."""
-    pass
-
-
 class TagFinder (object):
     """Base class handling HTML start elements.
     TagFinder instances are used as HTML parser handlers."""
@@ -116,26 +111,6 @@ class TagFinder (object):
         pass
 
 
-class MetaRobotsFinder (TagFinder):
-    """Class for finding robots.txt meta values in HTML."""
-
-    def __init__ (self):
-        """Initialize follow and index flags."""
-        super(MetaRobotsFinder, self).__init__()
-        log.debug(LOG_CHECK, "meta robots finder")
-        self.follow = self.index = True
-
-    def start_element (self, tag, attrs, element_text, lineno, column):
-        """Search for meta robots.txt "nofollow" and "noindex" flags."""
-        if tag == 'meta' and attrs.get('name') == 'robots':
-            val = attrs.get('content', u'').lower().split(u',')
-            self.follow = u'nofollow' not in val
-            self.index = u'noindex' not in val
-            raise StopParse("found <meta name=robots> tag")
-        elif tag == 'body':
-            raise StopParse("found <body> tag")
-
-
 def is_meta_url (attr, attrs):
     """Check if the meta attributes contain a URL."""
     res = False
diff --git a/tests/checker/data/norobots.html b/tests/checker/data/norobots.html
index 795e8174..17c6826b 100644
--- a/tests/checker/data/norobots.html
+++ b/tests/checker/data/norobots.html
@@ -1,2 +1,2 @@
-<meta name="robots" content="nofollow">
+<meta name="robots" content="noindex, Nofollow">
 <a href="do_not_check.html">bla</a>