mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-22 08:50:24 +00:00
Don't check for robots.txt directives in local html files.
This commit is contained in:
parent
e528d5f7db
commit
ce733ae76b
3 changed files with 30 additions and 58 deletions
|
|
@ -24,6 +24,8 @@ from cStringIO import StringIO
|
|||
from .. import (log, LOG_CHECK, strformat, fileutil,
|
||||
url as urlutil, LinkCheckerError)
|
||||
from . import (internpaturl, proxysupport, httpheaders as headers)
|
||||
from ..HtmlParser import htmlsax
|
||||
from ..htmlutil import linkparse
|
||||
# import warnings
|
||||
from .const import WARN_HTTP_EMPTY_CONTENT
|
||||
|
||||
|
|
@ -60,6 +62,31 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
"""
|
||||
return self.aggregate.robots_txt.allows_url(self)
|
||||
|
||||
def content_allows_robots (self):
    """
    Return False if the content of this URL forbids robots to
    search for recursive links.

    Non-HTML content is always allowed. For HTML content the data
    returned by self.get_content() is fed to an HTML parser driven by
    a linkparse.MetaRobotsFinder handler, and the handler's ``follow``
    attribute is returned.
    """
    if not self.is_html():
        return True
    # construct parser object
    handler = linkparse.MetaRobotsFinder()
    parser = htmlsax.parser(handler)
    handler.parser = parser
    try:
        if self.charset:
            parser.encoding = self.charset
        # parse
        parser.feed(self.get_content())
        parser.flush()
    except linkparse.StopParse as msg:
        # parsing was stopped early; handler.follow is still returned
        log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
    finally:
        # break cyclic dependencies, also when fetching or parsing the
        # content raised an error other than StopParse
        handler.parser = None
        parser.handler = None
    return handler.follow
|
||||
|
||||
def add_size_info (self):
|
||||
"""Get size of URL content from HTTP header."""
|
||||
if self.headers and "Content-Length" in self.headers and \
|
||||
|
|
|
|||
|
|
@ -31,8 +31,6 @@ from cStringIO import StringIO
|
|||
from . import absolute_url, get_url_from
|
||||
from .. import (log, LOG_CHECK,
|
||||
strformat, LinkCheckerError, url as urlutil, trace, get_link_pat)
|
||||
from ..HtmlParser import htmlsax
|
||||
from ..htmlutil import linkparse
|
||||
from ..network import iputil
|
||||
from .const import (WARN_URL_EFFECTIVE_URL,
|
||||
WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
|
||||
|
|
@ -551,33 +549,9 @@ class UrlBase (object):
|
|||
log.debug(LOG_CHECK, "... yes, recursion.")
|
||||
return True
|
||||
|
||||
def content_allows_robots (self):
    """
    Return False if the content of this URL forbids robots to
    search for recursive links.

    Content that is not HTML, or that belongs to a URL which is
    neither an HTTP nor a file URL, is always allowed. Otherwise the
    data returned by self.get_content() is fed to an HTML parser
    driven by a linkparse.MetaRobotsFinder handler, and the handler's
    ``follow`` attribute is returned.
    """
    # XXX cleanup
    if not self.is_html():
        return True
    if not (self.is_http() or self.is_file()):
        return True
    # construct parser object
    handler = linkparse.MetaRobotsFinder()
    parser = htmlsax.parser(handler)
    handler.parser = parser
    try:
        if self.charset:
            parser.encoding = self.charset
        # parse
        parser.feed(self.get_content())
        parser.flush()
    except linkparse.StopParse as msg:
        # parsing was stopped early; handler.follow is still returned
        log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
    finally:
        # break cyclic dependencies, also when fetching or parsing the
        # content raised an error other than StopParse
        handler.parser = None
        parser.handler = None
    return handler.follow
|
||||
def content_allows_robots(self):
    """Return True unconditionally.

    The content is not inspected here; per the original note, robots.txt
    is only checked on HTTP links.
    """
    return True
|
||||
|
||||
def set_extern (self, url):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -1,29 +0,0 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2004-2009 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
Test HTML robots.txt parsing.
|
||||
"""
|
||||
from . import LinkCheckTest
|
||||
|
||||
|
||||
class TestRobotsTxt (LinkCheckTest):
    """
    Test robots.txt directive parsing in HTML files.
    """

    def test_norobot (self):
        """Run the file test for the norobots.html input."""
        self.file_test("norobots.html")
|
||||
Loading…
Reference in a new issue