avoid double timeouts by raising timeout errors in robots.txt retrieval

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3171 e7d03fd6-7b0d-0410-9947-9c21f3af8025
calvin 2006-05-14 12:58:31 +00:00
parent 9a431fde40
commit 2cfcb5c0bb
2 changed files with 17 additions and 1 deletion


@@ -25,6 +25,12 @@
Type: feature
Changed: linkcheck/checker/internpaturl.py
* If the robots.txt connection times out, don't bother to check
  the URL but report an error immediately. Avoids having the
  timeout twice.
Type: feature
Changed: linkcheck/robotparser2.py
3.4 "The Chumscrubbers" (released 4.2.2006)
* Ignore decoding errors when retrieving the robots.txt URL.
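
The changelog entry above describes the checker-side half of the change. A minimal sketch of that interaction follows; it is hypothetical caller code, not the actual internpaturl.py, and it assumes the parser mirrors the standard robotparser API (read(), can_fetch()), while fetch_and_check() and the "LinkChecker" user agent string are made-up placeholders. The point is that once read() re-raises socket.timeout, the checker reports a single error and never opens the URL itself, which would otherwise block for a second full timeout on the same unresponsive host.

# Hypothetical checker-side sketch; not the actual internpaturl.py code.
import socket

def fetch_and_check(url):
    # Stand-in for the real URL check (the second network access).
    return "checked %s" % url

def check_url(url, robots_parser):
    # robots_parser stands in for a linkcheck.robotparser2.RobotFileParser
    # whose robots.txt URL has already been set for this host.
    try:
        # read() downloads robots.txt; after this commit it re-raises
        # socket.timeout instead of silently switching to "allow all".
        robots_parser.read()
    except socket.timeout:
        # The host already failed to answer once: report the error right
        # away and skip the URL instead of waiting for the same timeout again.
        return "error: connection to %s timed out" % url
    if not robots_parser.can_fetch("LinkChecker", url):
        return "warning: disallowed by robots.txt"
    return fetch_and_check(url)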


@@ -30,6 +30,7 @@ import socket
import re
import zlib
import gzip
import sys
import cStringIO as StringIO
import linkcheck
import linkcheck.configuration
@@ -174,7 +175,16 @@ class RobotFileParser (object):
            self.allow_all = True
            assert None == linkcheck.log.debug(linkcheck.LOG_CHECK,
                "%s allow all", self.url)
        except (socket.gaierror, socket.error, urllib2.URLError), x:
        except socket.timeout:
            raise
        except urllib2.URLError:
            x = sys.exc_info()[1]
            if isinstance(x.reason, socket.timeout):
                raise
            self.allow_all = True
            assert None == linkcheck.log.debug(linkcheck.LOG_CHECK,
                "%s allow all", self.url)
        except (socket.gaierror, socket.error):
            # no network
            self.allow_all = True
            assert None == linkcheck.log.debug(linkcheck.LOG_CHECK,