# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2009 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Test robots.txt parsing.
"""

import unittest

from tests import has_network
from nose import SkipTest

import linkcheck.robotparser2


class TestRobotParser (unittest.TestCase):
    """
    Test robots.txt parser (needs internet access).
    """

    def setUp (self):
        """
        Initialize self.rp as a robots.txt parser.
        """
        self.rp = linkcheck.robotparser2.RobotFileParser()

    def check (self, a, b):
        """
        Helper function comparing actual result a against expected
        result b.  Fails the test with a message noting whether access
        should have been allowed or denied.
        """
        ac = "access allowed" if b else "access denied"
        if a != b:
            self.fail("%s != %s (%s)" % (a, b, ac))

    def test_existing_robots (self):
        """
        Test parsing and access of an existing robots.txt file.
        """
        if not has_network():
            raise SkipTest()
        # robots.txt that exists (use web archive to be sure to have the
        # same robots.txt every time).
        self.rp.set_url('http://web.archive.org/web/20050312093828/'
                        'http://www.musi-cal.com/robots.txt')
        self.rp.read()
        # test for re.escape
        self.check(self.rp.can_fetch('*', 'http://www.musi-cal.com/'), True)
        # this should match the first rule, which is a disallow
        self.check(self.rp.can_fetch('', 'http://www.musi-cal.com/'), False)
        # various cherry pickers: all variants of the agent string are
        # expected to be disallowed
        for agent in ('CherryPickerSE', 'CherryPickerSE/1.0',
                      'CherryPickerSE/1.5'):
            self.check(self.rp.can_fetch(agent,
                           'http://www.musi-cal.com/cgi-bin/event-search'
                           '?city=San+Francisco'), False)
        # case sensitivity
        self.check(self.rp.can_fetch('ExtractorPro',
                       'http://www.musi-cal.com/blubba'), False)
        self.check(self.rp.can_fetch('extractorpro',
                       'http://www.musi-cal.com/blubba'), False)
        # substring test
        self.check(self.rp.can_fetch('toolpak/1.1',
                       'http://www.musi-cal.com/blubba'), False)
        # tests for catch-all * agent
        self.check(self.rp.can_fetch('spam',
                       'http://www.musi-cal.com/vsearch'), False)
        self.check(self.rp.can_fetch('spam',
                       'http://www.musi-cal.com/Musician/me'), True)
        # NOTE(review): the original asserted this identical check twice;
        # the redundant duplicate was removed.
        self.check(self.rp.can_fetch('spam',
                       'http://www.musi-cal.com/'), True)

    def test_nonexisting_robots (self):
        """
        Test access of a non-existing robots.txt file.
        """
        if not has_network():
            raise SkipTest()
        # robots.txt that does not exist
        self.rp.set_url('http://www.lycos.com/robots.txt')
        self.rp.read()
        self.check(self.rp.can_fetch('Mozilla',
                       'http://www.lycos.com/search'), True)

    def test_password_robots (self):
        """
        Test a site whose robots.txt is password-protected; access to
        the whole site is expected to be denied.
        """
        if not has_network():
            raise SkipTest()
        self.rp.set_url('http://mueblesmoraleda.com/robots.txt')
        self.rp.read()
        self.check(self.rp.can_fetch("*", "http://mueblesmoraleda.com/"),
                   False)