mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-17 06:20:27 +00:00
Replace MetaRobotsFinder using BeautifulSoup.find()
This commit is contained in:
parent
350f8bfef9
commit
4ffdbf2406
3 changed files with 9 additions and 36 deletions
|
|
@ -26,11 +26,12 @@ import warnings
|
|||
warnings.simplefilter('ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
from io import BytesIO
|
||||
import re
|
||||
|
||||
from .. import (log, LOG_CHECK, strformat, mimeutil,
|
||||
url as urlutil, LinkCheckerError, httputil)
|
||||
from . import (internpaturl, proxysupport)
|
||||
from ..htmlutil import htmlsoup, linkparse
|
||||
from ..htmlutil import htmlsoup
|
||||
# import warnings
|
||||
from .const import WARN_HTTP_EMPTY_CONTENT
|
||||
from requests.sessions import REDIRECT_STATI
|
||||
|
|
@ -42,6 +43,9 @@ HTTP_SCHEMAS = ('http://', 'https://')
|
|||
# helper alias
|
||||
unicode_safe = strformat.unicode_safe
|
||||
|
||||
# match for robots meta element content attribute
|
||||
nofollow_re = re.compile(r"\bnofollow\b", re.IGNORECASE)
|
||||
|
||||
class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
||||
"""
|
||||
Url link with http scheme.
|
||||
|
|
@ -78,15 +82,9 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
"""
|
||||
if not self.is_html():
|
||||
return True
|
||||
# construct handler object
|
||||
handler = linkparse.MetaRobotsFinder()
|
||||
# parse
|
||||
try:
|
||||
htmlsoup.process_soup(handler, self.get_soup())
|
||||
except linkparse.StopParse as msg:
|
||||
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
|
||||
pass
|
||||
return handler.follow
|
||||
|
||||
soup = self.get_soup()
|
||||
return not soup.find("meta", attrs={"name": "robots", "content": nofollow_re})
|
||||
|
||||
def add_size_info (self):
|
||||
"""Get size of URL content from HTTP header."""
|
||||
|
|
|
|||
|
|
@ -98,11 +98,6 @@ def strip_c_comments (text):
|
|||
return c_comment_re.sub('', text)
|
||||
|
||||
|
||||
class StopParse(Exception):
    """Raised to abort HTML parsing once the needed information is found."""
|
||||
|
||||
|
||||
class TagFinder (object):
|
||||
"""Base class handling HTML start elements.
|
||||
TagFinder instances are used as HTML parser handlers."""
|
||||
|
|
@ -116,26 +111,6 @@ class TagFinder (object):
|
|||
pass
|
||||
|
||||
|
||||
class MetaRobotsFinder (TagFinder):
    """Class for finding robots.txt meta values in HTML.

    Scans start elements until a ``<meta name="robots">`` element is seen
    (or ``<body>`` begins, after which no such meta element can occur) and
    records whether the page allows following links and indexing.
    """

    def __init__ (self):
        """Initialize follow and index flags."""
        super(MetaRobotsFinder, self).__init__()
        log.debug(LOG_CHECK, "meta robots finder")
        # Until a robots meta element says otherwise, assume the page
        # may be both followed and indexed.
        self.follow = self.index = True

    def start_element (self, tag, attrs, element_text, lineno, column):
        """Search for meta robots.txt "nofollow" and "noindex" flags.

        Raises StopParse as soon as the answer is known, since nothing
        after the robots meta element (or the start of <body>) matters.
        """
        if tag == 'meta' and attrs.get('name') == 'robots':
            # Strip whitespace around each comma-separated token: the
            # common form content="noindex, nofollow" would otherwise
            # produce the token " nofollow" and be missed.
            val = [token.strip() for token in
                   attrs.get('content', u'').lower().split(u',')]
            self.follow = u'nofollow' not in val
            self.index = u'noindex' not in val
            raise StopParse("found <meta name=robots> tag")
        elif tag == 'body':
            # Robots meta elements belong in <head>; once <body> starts
            # there is nothing more to look for.
            raise StopParse("found <body> tag")
|
||||
|
||||
|
||||
def is_meta_url (attr, attrs):
|
||||
"""Check if the meta attributes contain a URL."""
|
||||
res = False
|
||||
|
|
|
|||
|
|
@ -1,2 +1,2 @@
|
|||
<meta name="robots" content="nofollow">
|
||||
<meta name="robots" content="noindex, Nofollow">
|
||||
<a href="do_not_check.html">bla</a>
|
||||
|
|
|
|||
Loading…
Reference in a new issue