mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-29 10:34:42 +00:00
avoid double timeouts by raising timeout errors in robots.txt retrieval
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3171 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
9a431fde40
commit
2cfcb5c0bb
2 changed files with 17 additions and 1 deletions
|
|
@ -25,6 +25,12 @@
|
|||
Type: feature
|
||||
Changed: linkcheck/checker/internpaturl.py
|
||||
|
||||
* If the robots.txt connection times out, don't bother to check
|
||||
the URL but report an error immediately. This avoids waiting for the
|
||||
same timeout twice.
|
||||
Type: feature
|
||||
Changed: linkcheck/robotparser2.py
|
||||
|
||||
3.4 "The Chumscrubbers" (released 4.2.2006)
|
||||
|
||||
* Ignore decoding errors when retrieving the robots.txt URL.
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ import socket
|
|||
import re
|
||||
import zlib
|
||||
import gzip
|
||||
import sys
|
||||
import cStringIO as StringIO
|
||||
import linkcheck
|
||||
import linkcheck.configuration
|
||||
|
|
@ -174,7 +175,16 @@ class RobotFileParser (object):
|
|||
self.allow_all = True
|
||||
assert None == linkcheck.log.debug(linkcheck.LOG_CHECK,
|
||||
"%s allow all", self.url)
|
||||
except (socket.gaierror, socket.error, urllib2.URLError), x:
|
||||
except socket.timeout:
|
||||
raise
|
||||
except urllib2.URLError:
|
||||
x = sys.exc_info()[1]
|
||||
if isinstance(x.reason, socket.timeout):
|
||||
raise
|
||||
self.allow_all = True
|
||||
assert None == linkcheck.log.debug(linkcheck.LOG_CHECK,
|
||||
"%s allow all", self.url)
|
||||
except (socket.gaierror, socket.error):
|
||||
# no network
|
||||
self.allow_all = True
|
||||
assert None == linkcheck.log.debug(linkcheck.LOG_CHECK,
|
||||
|
|
|
|||
Loading…
Reference in a new issue