robotparser2

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@215 e7d03fd6-7b0d-0410-9947-9c21f3af8025
2026-05-10 23:53:11 +00:00 · 2001-01-04 23:16:46 +00:00 · 2001-01-04 23:16:46 +00:00 · f84023c401
commit f84023c401
parent 0ea3aae096
2 changed files with 31 additions and 4 deletions
--- a/linkcheck/HttpUrlData.py
+++ b/linkcheck/HttpUrlData.py
@@ -91,7 +91,8 @@ class HttpUrlData(UrlData):
            redirected = self.urlName
            while status in [301,302] and self.mime and tries < 5:
                has301status = (status==301)
-                redirected = urlparse.urljoin(redirected, self.mime.getheader("Location"))
+                newurl = self.mime.get("Location", self.mime.get("Uri", ""))
+                redirected = urlparse.urljoin(redirected, newurl)
                self.urlTuple = urlparse.urlparse(redirected)
                status, statusText, self.mime = self._getHttpRequest()
                Config.debug("DEBUG: Redirected\n"+str(self.mime))
--- a/linkcheck/robotparser2.py
+++ b/linkcheck/robotparser2.py
@@ -1,6 +1,23 @@
 """ robotparser2.py

    Copyright (C) 2000  Bastian Kleineidam
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+    The robots.txt Exclusion Protocol is implemented as specified in
+    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
 """
 import re,string,urlparse,urllib

@@ -26,6 +43,7 @@ class RobotFileParser:
        self.last_checked = time.time()

    def set_url(self, url):
+        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
@@ -38,7 +56,10 @@ class RobotFileParser:
            status, text, mime = connection.getreply()
            if status in [301,302]:
                tries = tries + 1
-                self.set_url(mime.getheader("Location"))
+                newurl = self.mime.get("Location", self.mime.get("Uri", ""))
+                newurl = urlparse.urljoin(self.url, newurl)
+                _debug(newurl)
+                self.set_url(newurl)
            else:
                break
        if status==401 or status==403:
@@ -177,10 +198,15 @@ class Entry:


 def _test():
+    global debug
+    import sys
    rp = RobotFileParser()
    debug = 1
-    rp.set_url('http://www.musi-cal.com/robots.txt')
-    rp.read()
+    if len(sys.argv) <= 1:
+        rp.set_url('http://www.musi-cal.com/robots.txt')
+        rp.read()
+    else:
+        rp.parse(open(sys.argv[1]).readlines())
    print rp
    print rp.can_fetch('*', 'http://www.musi-cal.com.com/')
    print rp.can_fetch('Musi-Cal-Robot',