From 4ffdbf24068eb35ba0eb2fb243b67fe10bef95e4 Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Wed, 29 Apr 2020 20:07:00 +0100 Subject: [PATCH] Replace MetaRobotsFinder using BeautifulSoup.find() --- linkcheck/checker/httpurl.py | 18 ++++++++---------- linkcheck/htmlutil/linkparse.py | 25 ------------------------- tests/checker/data/norobots.html | 2 +- 3 files changed, 9 insertions(+), 36 deletions(-) diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index e891402e..2752e3a8 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -26,11 +26,12 @@ import warnings warnings.simplefilter('ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning) from io import BytesIO +import re from .. import (log, LOG_CHECK, strformat, mimeutil, url as urlutil, LinkCheckerError, httputil) from . import (internpaturl, proxysupport) -from ..htmlutil import htmlsoup, linkparse +from ..htmlutil import htmlsoup # import warnings from .const import WARN_HTTP_EMPTY_CONTENT from requests.sessions import REDIRECT_STATI @@ -42,6 +43,9 @@ HTTP_SCHEMAS = ('http://', 'https://') # helper alias unicode_safe = strformat.unicode_safe +# match for robots meta element content attribute +nofollow_re = re.compile(r"\bnofollow\b", re.IGNORECASE) + class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): """ Url link with http scheme. @@ -78,15 +82,9 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): """ if not self.is_html(): return True - # construct handler object - handler = linkparse.MetaRobotsFinder() - # parse - try: - htmlsoup.process_soup(handler, self.get_soup()) - except linkparse.StopParse as msg: - log.debug(LOG_CHECK, "Stopped parsing: %s", msg) - pass - return handler.follow + + soup = self.get_soup() + return not soup.find("meta", attrs={"name": "robots", "content": nofollow_re}) def add_size_info (self): """Get size of URL content from HTTP header.""" diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py index 0f5bc7ef..968f5b20 100644 --- a/linkcheck/htmlutil/linkparse.py +++ b/linkcheck/htmlutil/linkparse.py @@ -98,11 +98,6 @@ def strip_c_comments (text): return c_comment_re.sub('', text) -class StopParse(Exception): - """Raised when parsing should stop.""" - pass - - class TagFinder (object): """Base class handling HTML start elements. TagFinder instances are used as HTML parser handlers.""" @@ -116,26 +111,6 @@ class TagFinder (object): pass -class MetaRobotsFinder (TagFinder): - """Class for finding robots.txt meta values in HTML.""" - - def __init__ (self): - """Initialize follow and index flags.""" - super(MetaRobotsFinder, self).__init__() - log.debug(LOG_CHECK, "meta robots finder") - self.follow = self.index = True - - def start_element (self, tag, attrs, element_text, lineno, column): - """Search for meta robots.txt "nofollow" and "noindex" flags.""" - if tag == 'meta' and attrs.get('name') == 'robots': - val = attrs.get('content', u'').lower().split(u',') - self.follow = u'nofollow' not in val - self.index = u'noindex' not in val - raise StopParse("found tag") - elif tag == 'body': - raise StopParse("found tag") - - def is_meta_url (attr, attrs): """Check if the meta attributes contain a URL.""" res = False diff --git a/tests/checker/data/norobots.html b/tests/checker/data/norobots.html index 795e8174..17c6826b 100644 --- a/tests/checker/data/norobots.html +++ b/tests/checker/data/norobots.html @@ -1,2 +1,2 @@ - + bla