From f84023c4011460ffaa0663952efd1d33962b8102 Mon Sep 17 00:00:00 2001
From: calvin
Date: Thu, 4 Jan 2001 23:16:46 +0000
Subject: [PATCH] robotparser2

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@215 e7d03fd6-7b0d-0410-9947-9c21f3af8025
---
 linkcheck/HttpUrlData.py  |  3 ++-
 linkcheck/robotparser2.py | 32 +++++++++++++++++++++++++++++---
 2 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/linkcheck/HttpUrlData.py b/linkcheck/HttpUrlData.py
index af28888f..016dd2f5 100644
--- a/linkcheck/HttpUrlData.py
+++ b/linkcheck/HttpUrlData.py
@@ -91,7 +91,8 @@ class HttpUrlData(UrlData):
         redirected = self.urlName
         while status in [301,302] and self.mime and tries < 5:
             has301status = (status==301)
-            redirected = urlparse.urljoin(redirected, self.mime.getheader("Location"))
+            newurl = self.mime.get("Location", self.mime.get("Uri", ""))
+            redirected = urlparse.urljoin(redirected, newurl)
             self.urlTuple = urlparse.urlparse(redirected)
             status, statusText, self.mime = self._getHttpRequest()
             Config.debug("DEBUG: Redirected\n"+str(self.mime))
diff --git a/linkcheck/robotparser2.py b/linkcheck/robotparser2.py
index 5dc7f9c3..63c1ba00 100755
--- a/linkcheck/robotparser2.py
+++ b/linkcheck/robotparser2.py
@@ -1,6 +1,23 @@
 """ robotparser2.py
 
     Copyright (C) 2000 Bastian Kleineidam
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+    The robots.txt Exclusion Protocol is implemented as specified in
+    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
 """
 
 import re,string,urlparse,urllib
@@ -26,6 +43,7 @@ class RobotFileParser:
         self.last_checked = time.time()
 
     def set_url(self, url):
+        self.url = url
         self.host, self.path = urlparse.urlparse(url)[1:3]
 
     def read(self):
@@ -38,7 +56,10 @@
             status, text, mime = connection.getreply()
             if status in [301,302]:
                 tries = tries + 1
-                self.set_url(mime.getheader("Location"))
+                newurl = mime.get("Location", mime.get("Uri", ""))
+                newurl = urlparse.urljoin(self.url, newurl)
+                _debug(newurl)
+                self.set_url(newurl)
             else:
                 break
         if status==401 or status==403:
@@ -177,10 +198,15 @@ class Entry:
 
 def _test():
+    global debug
+    import sys
     rp = RobotFileParser()
     debug = 1
-    rp.set_url('http://www.musi-cal.com/robots.txt')
-    rp.read()
+    if len(sys.argv) <= 1:
+        rp.set_url('http://www.musi-cal.com/robots.txt')
+        rp.read()
+    else:
+        rp.parse(open(sys.argv[1]).readlines())
     print rp
     print rp.can_fetch('*', 'http://www.musi-cal.com.com/')
     print rp.can_fetch('Musi-Cal-Robot',