Replace MetaRobotsFinder using BeautifulSoup.find()

This commit is contained in:
Chris Mayo 2020-04-29 20:07:00 +01:00
parent 350f8bfef9
commit 4ffdbf2406
3 changed files with 9 additions and 36 deletions

View file

@@ -26,11 +26,12 @@ import warnings
warnings.simplefilter('ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning)
from io import BytesIO
import re
from .. import (log, LOG_CHECK, strformat, mimeutil,
url as urlutil, LinkCheckerError, httputil)
from . import (internpaturl, proxysupport)
from ..htmlutil import htmlsoup, linkparse
from ..htmlutil import htmlsoup
# import warnings
from .const import WARN_HTTP_EMPTY_CONTENT
from requests.sessions import REDIRECT_STATI
@@ -42,6 +43,9 @@ HTTP_SCHEMAS = ('http://', 'https://')
# helper alias
unicode_safe = strformat.unicode_safe
# match for robots meta element content attribute
nofollow_re = re.compile(r"\bnofollow\b", re.IGNORECASE)
class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
Url link with http scheme.
@@ -78,15 +82,9 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
if not self.is_html():
return True
# construct handler object
handler = linkparse.MetaRobotsFinder()
# parse
try:
htmlsoup.process_soup(handler, self.get_soup())
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
pass
return handler.follow
soup = self.get_soup()
return not soup.find("meta", attrs={"name": "robots", "content": nofollow_re})
def add_size_info (self):
"""Get size of URL content from HTTP header."""

View file

@@ -98,11 +98,6 @@ def strip_c_comments (text):
return c_comment_re.sub('', text)
class StopParse(Exception):
"""Raised when parsing should stop."""
pass
class TagFinder (object):
"""Base class handling HTML start elements.
TagFinder instances are used as HTML parser handlers."""
@@ -116,26 +111,6 @@ class TagFinder (object):
pass
class MetaRobotsFinder (TagFinder):
"""Class for finding robots.txt meta values in HTML."""
def __init__ (self):
"""Initialize follow and index flags."""
super(MetaRobotsFinder, self).__init__()
log.debug(LOG_CHECK, "meta robots finder")
self.follow = self.index = True
def start_element (self, tag, attrs, element_text, lineno, column):
"""Search for meta robots.txt "nofollow" and "noindex" flags."""
if tag == 'meta' and attrs.get('name') == 'robots':
val = attrs.get('content', u'').lower().split(u',')
self.follow = u'nofollow' not in val
self.index = u'noindex' not in val
raise StopParse("found <meta name=robots> tag")
elif tag == 'body':
raise StopParse("found <body> tag")
def is_meta_url (attr, attrs):
"""Check if the meta attributes contain a URL."""
res = False

View file

@@ -1,2 +1,2 @@
<meta name="robots" content="nofollow">
<meta name="robots" content="noindex, Nofollow">
<a href="do_not_check.html">bla</a>