Don't check for robots.txt directives in local HTML files.

This commit is contained in:
Bastian Kleineidam 2014-03-19 16:33:22 +01:00
parent e528d5f7db
commit ce733ae76b
3 changed files with 30 additions and 58 deletions

View file

@ -24,6 +24,8 @@ from cStringIO import StringIO
from .. import (log, LOG_CHECK, strformat, fileutil,
url as urlutil, LinkCheckerError)
from . import (internpaturl, proxysupport, httpheaders as headers)
from ..HtmlParser import htmlsax
from ..htmlutil import linkparse
# import warnings
from .const import WARN_HTTP_EMPTY_CONTENT
@ -60,6 +62,31 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
return self.aggregate.robots_txt.allows_url(self)
def content_allows_robots (self):
    """Check meta robots directives in the HTML content of this URL.

    Non-HTML content can never restrict robots, so it is always
    allowed. For HTML, the content is fed through the SAX parser with
    a MetaRobotsFinder handler; the handler's verdict is returned.

    @return: False if a meta robots tag forbids following links
    @rtype: bool
    """
    if not self.is_html():
        # Only HTML documents can carry meta robots directives.
        return True
    # Wire up the SAX parser and its handler; they reference each other.
    finder = linkparse.MetaRobotsFinder()
    robots_parser = htmlsax.parser(finder)
    finder.parser = robots_parser
    if self.charset:
        robots_parser.encoding = self.charset
    try:
        robots_parser.feed(self.get_content())
        robots_parser.flush()
    except linkparse.StopParse as err:
        # The finder aborts parsing as soon as it has its answer.
        log.debug(LOG_CHECK, "Stopped parsing: %s", err)
    # Undo the mutual references so the objects can be collected.
    finder.parser = None
    robots_parser.handler = None
    return finder.follow
def add_size_info (self):
"""Get size of URL content from HTTP header."""
if self.headers and "Content-Length" in self.headers and \

View file

@ -31,8 +31,6 @@ from cStringIO import StringIO
from . import absolute_url, get_url_from
from .. import (log, LOG_CHECK,
strformat, LinkCheckerError, url as urlutil, trace, get_link_pat)
from ..HtmlParser import htmlsax
from ..htmlutil import linkparse
from ..network import iputil
from .const import (WARN_URL_EFFECTIVE_URL,
WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
@ -551,33 +549,9 @@ class UrlBase (object):
log.debug(LOG_CHECK, "... yes, recursion.")
return True
def content_allows_robots (self):
    """Check whether meta robots directives permit recursing into
    the content of this URL.

    Only HTML documents fetched over HTTP or from a local file are
    inspected; everything else is allowed unconditionally.

    @return: False if a meta robots tag forbids following links
    @rtype: bool
    """
    # XXX cleanup
    # Non-HTML content, and schemes other than http/file, carry no
    # meta robots directives.
    if not self.is_html() or not (self.is_http() or self.is_file()):
        return True
    # Build the SAX parser and its handler; they reference each other.
    finder = linkparse.MetaRobotsFinder()
    robots_parser = htmlsax.parser(finder)
    finder.parser = robots_parser
    if self.charset:
        robots_parser.encoding = self.charset
    try:
        robots_parser.feed(self.get_content())
        robots_parser.flush()
    except linkparse.StopParse as err:
        # The finder stops parsing once it has found its answer.
        log.debug(LOG_CHECK, "Stopped parsing: %s", err)
    # Undo the mutual references so the objects can be collected.
    finder.parser = None
    robots_parser.handler = None
    return finder.follow
def content_allows_robots(self):
    """Always permit recursion at this level.

    Robots directives are only honored for HTTP links; the HTTP
    subclass overrides this method with a real check.
    """
    return True
def set_extern (self, url):
"""

View file

@ -1,29 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2009 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Test HTML robots.txt parsing.
"""
from . import LinkCheckTest
class TestRobotsTxt (LinkCheckTest):
    """Check handling of meta robots directives in HTML files."""

    def test_norobot (self):
        """Run the standard file check against a page carrying a
        norobots meta directive."""
        self.file_test("norobots.html")