From f84023c4011460ffaa0663952efd1d33962b8102 Mon Sep 17 00:00:00 2001
From: calvin
Date: Thu, 4 Jan 2001 23:16:46 +0000
Subject: [PATCH] robotparser2

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@215 e7d03fd6-7b0d-0410-9947-9c21f3af8025
---
 linkcheck/HttpUrlData.py  |  3 ++-
 linkcheck/robotparser2.py | 32 +++++++++++++++++++++++++++++---
 2 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/linkcheck/HttpUrlData.py b/linkcheck/HttpUrlData.py
index af28888f..016dd2f5 100644
--- a/linkcheck/HttpUrlData.py
+++ b/linkcheck/HttpUrlData.py
@@ -91,7 +91,8 @@ class HttpUrlData(UrlData):
         redirected = self.urlName
         while status in [301,302] and self.mime and tries < 5:
             has301status = (status==301)
-            redirected = urlparse.urljoin(redirected, self.mime.getheader("Location"))
+            newurl = self.mime.get("Location", self.mime.get("Uri", ""))
+            redirected = urlparse.urljoin(redirected, newurl)
             self.urlTuple = urlparse.urlparse(redirected)
             status, statusText, self.mime = self._getHttpRequest()
             Config.debug("DEBUG: Redirected\n"+str(self.mime))
diff --git a/linkcheck/robotparser2.py b/linkcheck/robotparser2.py
index 5dc7f9c3..63c1ba00 100755
--- a/linkcheck/robotparser2.py
+++ b/linkcheck/robotparser2.py
@@ -1,6 +1,23 @@
 """ robotparser2.py
 
     Copyright (C) 2000 Bastian Kleineidam
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+    The robots.txt Exclusion Protocol is implemented as specified in
+    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
 """
 
 import re,string,urlparse,urllib
@@ -26,6 +43,7 @@ class RobotFileParser:
         self.last_checked = time.time()
 
     def set_url(self, url):
+        self.url = url
         self.host, self.path = urlparse.urlparse(url)[1:3]
 
     def read(self):
@@ -38,7 +56,10 @@
             status, text, mime = connection.getreply()
             if status in [301,302]:
                 tries = tries + 1
-                self.set_url(mime.getheader("Location"))
+                newurl = mime.get("Location", mime.get("Uri", ""))
+                newurl = urlparse.urljoin(self.url, newurl)
+                _debug(newurl)
+                self.set_url(newurl)
             else:
                 break
         if status==401 or status==403:
@@ -177,10 +198,15 @@ class Entry:
 
 def _test():
+    global debug
+    import sys
     rp = RobotFileParser()
     debug = 1
-    rp.set_url('http://www.musi-cal.com/robots.txt')
-    rp.read()
+    if len(sys.argv) <= 1:
+        rp.set_url('http://www.musi-cal.com/robots.txt')
+        rp.read()
+    else:
+        rp.parse(open(sys.argv[1]).readlines())
     print rp
     print rp.can_fetch('*', 'http://www.musi-cal.com.com/')
     print rp.can_fetch('Musi-Cal-Robot',