2004-08-16 19:20:06 +00:00
|
|
|
# -*- coding: iso-8859-1 -*-
|
2009-01-08 14:18:03 +00:00
|
|
|
# Copyright (C) 2004-2009 Bastian Kleineidam
|
2004-08-16 19:20:06 +00:00
|
|
|
#
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
|
|
|
# (at your option) any later version.
|
|
|
|
|
#
|
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
|
#
|
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
|
# along with this program; if not, write to the Free Software
|
|
|
|
|
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
2005-01-19 15:08:02 +00:00
|
|
|
"""
|
|
|
|
|
Test robots.txt parsing.
|
|
|
|
|
"""
|
2004-08-16 19:20:06 +00:00
|
|
|
|
|
|
|
|
import unittest
|
2009-01-28 23:12:03 +00:00
|
|
|
from tests import has_network
|
|
|
|
|
from nose import SkipTest
|
2004-08-16 19:20:06 +00:00
|
|
|
import linkcheck.robotparser2
|
|
|
|
|
|
|
|
|
|
|
2006-04-24 20:16:57 +00:00
|
|
|
class TestRobotParser (unittest.TestCase):
    """
    Test robots.txt parser (needs internet access).

    Every test fetches a real robots.txt over HTTP; tests skip themselves
    when has_network() reports that no network is available.
    """

    def setUp (self):
        """
        Initialize self.rp as a fresh robots.txt parser for each test.
        """
        self.rp = linkcheck.robotparser2.RobotFileParser()

    def _require_network (self):
        """
        Skip the running test when has_network() returns a false value.
        """
        if not has_network():
            raise SkipTest("no network access")

    def check (self, a, b):
        """
        Helper function comparing the actual result a with the expected
        result b.

        Fails the test when both differ. The failure message shows both
        values plus a label derived from the expected value b
        ("access allowed" if b is true, else "access denied").
        """
        if a != b:
            if b:
                ac = "access allowed"
            else:
                ac = "access denied"
            self.fail("%s != %s (%s)" % (a, b, ac))

    def test_existing_robots (self):
        """
        Test parsing and access of an existing robots.txt file.
        """
        self._require_network()
        # robots.txt that exists (use web archive to be sure to have the
        # same robots.txt every time).
        self.rp.set_url('http://web.archive.org/web/20050312093828/http://www.musi-cal.com/robots.txt')
        self.rp.read()
        root_url = 'http://www.musi-cal.com/'
        search_url = ('http://www.musi-cal.com/cgi-bin/event-search'
                      '?city=San+Francisco')
        blubba_url = 'http://www.musi-cal.com/blubba'
        # test for re.escape
        self.check(self.rp.can_fetch('*', root_url), True)
        # this should match the first rule, which is a disallow
        self.check(self.rp.can_fetch('', root_url), False)
        # various cherry pickers
        for agent in ('CherryPickerSE',
                      'CherryPickerSE/1.0',
                      'CherryPickerSE/1.5'):
            self.check(self.rp.can_fetch(agent, search_url), False)
        # case sensitivity
        self.check(self.rp.can_fetch('ExtractorPro', blubba_url), False)
        self.check(self.rp.can_fetch('extractorpro', blubba_url), False)
        # substring test
        self.check(self.rp.can_fetch('toolpak/1.1', blubba_url), False)
        # tests for catch-all * agent
        self.check(self.rp.can_fetch('spam',
                                     'http://www.musi-cal.com/vsearch'), False)
        self.check(self.rp.can_fetch('spam',
                                     'http://www.musi-cal.com/Musician/me'), True)
        self.check(self.rp.can_fetch('spam', root_url), True)
        # Same query asked a second time; the expected answer is unchanged.
        self.check(self.rp.can_fetch('spam', root_url), True)

    def test_nonexisting_robots (self):
        """
        Test access of a non-existing robots.txt file.
        """
        self._require_network()
        # robots.txt that does not exist
        self.rp.set_url('http://www.lycos.com/robots.txt')
        self.rp.read()
        self.check(self.rp.can_fetch('Mozilla',
                                     'http://www.lycos.com/search'), True)

    def test_password_robots (self):
        """
        Test access when the whole site is password-protected.
        """
        self._require_network()
        self.rp.set_url('http://mueblesmoraleda.com/robots.txt')
        self.rp.read()
        self.check(self.rp.can_fetch("*",
                                     "http://mueblesmoraleda.com/"), False)
|