fix the anchor fix

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@858 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2003-04-29 09:40:33 +00:00
parent 20c7f7267b
commit fc35e1f97f
2 changed files with 19 additions and 14 deletions

View file

@@ -30,11 +30,15 @@ ExcList.extend([httplib.error,])
_supported_encodings = ('gzip', 'x-gzip', 'deflate')
_isAmazonHost = re.compile(r'www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').match
# Amazon blocks HEAD requests at all
_isAmazonHost = re.compile(r'^www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').search
# Servers not supporting HEAD request (eg returning 404 errors)
_isBrokenHeadServer = re.compile(r'Netscape-Enterprise/').search
# Server not supporting anchors in urls (eg returning 404 errors)
_isBrokenAnchorServer = re.compile(r'Microsoft-IIS/').search
class HttpUrlData (ProxyUrlData):
"Url link with http scheme"
netscape_re = re.compile("Netscape-Enterprise/")
def buildUrl (self):
@@ -45,7 +49,7 @@ class HttpUrlData (ProxyUrlData):
if not self.urlparts[2]:
self.setWarning(i18n._("URL path is empty, assuming '/' as path"))
self.urlparts[2] = '/'
self.url = urlparse.urlunsplit(self.urlparts[:4]+[self.anchor])
self.url = urlparse.urlunsplit(self.urlparts)
def checkConnection (self):
@@ -129,8 +133,6 @@ class HttpUrlData (ProxyUrlData):
redirected = unquote(redirected)
# note: urlparts has to be a list
self.urlparts = list(urlparse.urlsplit(redirected))
# we saved the anchor already, this one gets removed
self.urlparts[4] = ''
# new response data
response = self._getHttpResponse()
self.headers = response.msg
@@ -163,12 +165,16 @@ class HttpUrlData (ProxyUrlData):
response = self._getHttpResponse("GET")
self.headers = response.msg
elif response.status>=400 and self.headers:
server = self.headers.getheader("Server")
if server and self.netscape_re.search(server):
self.setWarning(i18n._("Netscape Enterprise Server"
" with no HEAD support, falling back to GET"))
server = self.headers.get('Server', '')
if _isBrokenHeadServer(server):
self.setWarning(i18n._("Server %s has no HEAD support, falling back to GET") % `server`)
response = self._getHttpResponse("GET")
self.headers = response.msg
elif _isBrokenAnchorServer(server):
self.setWarning(i18n._("Server %s has no anchor support, removing anchor from request") % `server`)
self.urlparts[4] = ''
response = self._getHttpResponse()
self.headers = response.msg
elif self.headers:
type = self.headers.gettype()
poweredby = self.headers.get('X-Powered-By', '')
@@ -182,7 +188,7 @@ class HttpUrlData (ProxyUrlData):
self.headers = response.msg
if response.status not in [301,302]: break
effectiveurl = urlparse.urlunsplit(self.urlparts[:4]+[self.anchor])
effectiveurl = urlparse.urlunsplit(self.urlparts)
if self.url != effectiveurl:
self.setWarning(i18n._("Effective URL %s") % effectiveurl)
self.url = effectiveurl

View file

@@ -186,9 +186,8 @@ class UrlData:
% str(port))
# set host lowercase and without userinfo
self.urlparts[1] = host.lower()
# safe anchor for later checking and delete it from url parts
# safe anchor for later checking
self.anchor = self.urlparts[4]
self.urlparts[4] = ''
def logMe (self):
@@ -314,10 +313,10 @@ class UrlData:
if self.urlparts:
if self.config["noanchorcaching"]:
# removed anchor from cache key
return urlparse.urlunsplit(self.urlparts)
return urlparse.urlunsplit(self.urlparts[:4]+[''])
else:
# do not ignore anchor
return urlparse.urlunsplit(self.urlparts[:4]+[self.anchor])
return urlparse.urlunsplit(self.urlparts)
return None