fix amazon timeouts

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@785 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2003-02-06 01:23:36 +00:00
parent fbb2858312
commit b570df925b
2 changed files with 17 additions and 4 deletions

View file

@ -1,3 +1,8 @@
1.8.8
* All Amazon servers block HTTP HEAD requests, which causes timeouts.
Use GET as a workaround, but issue a warning.
Changed files: linkcheck/HttpUrlData.py
1.8.7
* #define YY_NO_UNISTD_H on Windows systems, fixes build error with
Visual Studio compiler

View file

@ -26,6 +26,8 @@ from ProxyUrlData import ProxyUrlData
# Encoding names handled by this module -- presumably HTTP Content-Encoding
# values; confirm against the code that reads this tuple.
_supported_encodings = ('gzip', 'x-gzip', 'deflate')
# Bound ``match`` method of a compiled pattern: returns a match object when
# the given string starts with www.amazon.<com|de|ca|fr|co.uk|co.jp>, and
# None otherwise. It is called with self.urlparts[1] to decide whether to
# warn about HEAD and to force the request method to GET.
# NOTE(review): the pattern is anchored only at the start, so any trailing
# text after the TLD (e.g. a port, or a longer host name such as
# www.amazon.com.example.org) also matches -- confirm this is intended.
_isAmazonHost = re.compile(r'www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').match
class HttpUrlData (ProxyUrlData):
"Url link with http scheme"
netscape_re = re.compile("Netscape-Enterprise/")
@ -93,6 +95,10 @@ class HttpUrlData (ProxyUrlData):
self.setWarning(i18n._("Access denied by robots.txt, checked only syntax"))
return
# Amazon servers block HTTP HEAD requests (they time out), so warn that GET is used instead
if _isAmazonHost(self.urlparts[1]):
self.setWarning(i18n._("Amazon servers block HTTP HEAD requests, "
"using GET instead"))
# first try
response = self._getHttpResponse()
self.headers = response.msg
@ -156,11 +162,11 @@ class HttpUrlData (ProxyUrlData):
self.headers = response.msg
elif self.headers:
type = self.headers.gettype()
poweredby = self.headers.getheader('X-Powered-By')
server = self.headers.getheader('Server')
poweredby = self.headers.get('X-Powered-By', '')
server = self.headers.get('Server', '')
if type=='application/octet-stream' and \
((poweredby and poweredby[:4]=='Zope') or \
(server and server[:4]=='Zope')):
(poweredby.startswith('Zope') or \
server.startswith('Zope')):
self.setWarning(i18n._("Zope Server cannot determine"
" MIME type with HEAD, falling back to GET"))
response = self._getHttpResponse("GET")
@ -205,6 +211,8 @@ class HttpUrlData (ProxyUrlData):
"""Put request and return (status code, status text, mime object).
host can be host:port format
"""
if _isAmazonHost(self.urlparts[1]):
method = "GET"
if self.proxy:
host = self.proxy
else: