avoid double timeouts by raising timeout errors in robots.txt retrieval

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3171 e7d03fd6-7b0d-0410-9947-9c21f3af8025
calvin 2006-05-14 12:58:31 +00:00
parent 9a431fde40
commit 2cfcb5c0bb
2 changed files with 17 additions and 1 deletion


@@ -25,6 +25,12 @@
Type: feature
Changed: linkcheck/checker/internpaturl.py
* If the robots.txt connection times out, don't bother to check
  the URL but report an error immediately. Avoids having the
  timeout twice.
Type: feature
Changed: linkcheck/robotparser2.py
3.4 "The Chumscrubbers" (released 4.2.2006)
* Ignore decoding errors when retrieving the robots.txt URL.
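
The changelog entry above describes the checker-side half of the change. A minimal sketch of that interaction follows; it is hypothetical caller code, not the actual internpaturl.py, and it assumes the parser mirrors the standard robotparser API (read(), can_fetch()), while fetch_and_check() and the "LinkChecker" user agent string are made-up placeholders. The point is that once read() re-raises socket.timeout, the checker reports a single error and never opens the URL itself, which would otherwise block for a second full timeout on the same unresponsive host.

# Hypothetical checker-side sketch; not the actual internpaturl.py code.
import socket

def fetch_and_check(url):
    # Stand-in for the real URL check (the second network access).
    return "checked %s" % url

def check_url(url, robots_parser):
    # robots_parser stands in for a linkcheck.robotparser2.RobotFileParser
    # whose robots.txt URL has already been set for this host.
    try:
        # read() downloads robots.txt; after this commit it re-raises
        # socket.timeout instead of silently switching to "allow all".
        robots_parser.read()
    except socket.timeout:
        # The host already failed to answer once: report the error right
        # away and skip the URL instead of waiting for the same timeout again.
        return "error: connection to %s timed out" % url
    if not robots_parser.can_fetch("LinkChecker", url):
        return "warning: disallowed by robots.txt"
    return fetch_and_check(url)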


@@ -30,6 +30,7 @@ import socket
import re
import zlib
import gzip
import sys
import cStringIO as StringIO
import linkcheck
import linkcheck.configuration
@@ -174,7 +175,16 @@ class RobotFileParser (object):
            self.allow_all = True
            assert None == linkcheck.log.debug(linkcheck.LOG_CHECK,
                "%s allow all", self.url)
        except (socket.gaierror, socket.error, urllib2.URLError), x:
        except socket.timeout:
            raise
        except urllib2.URLError:
            x = sys.exc_info()[1]
            if isinstance(x.reason, socket.timeout):
                raise
            self.allow_all = True
            assert None == linkcheck.log.debug(linkcheck.LOG_CHECK,
                "%s allow all", self.url)
        except (socket.gaierror, socket.error):
            # no network
            self.allow_all = True
            assert None == linkcheck.log.debug(linkcheck.LOG_CHECK,