mirror of https://github.com/Hopiu/linkchecker.git
synced 2026-04-13 10:51:03 +00:00

fix the anchor fix

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@858 e7d03fd6-7b0d-0410-9947-9c21f3af8025

This commit is contained in:
parent 20c7f7267b
commit fc35e1f97f

2 changed files with 19 additions and 14 deletions
@@ -30,11 +30,15 @@ ExcList.extend([httplib.error,])
 
 _supported_encodings = ('gzip', 'x-gzip', 'deflate')
 
-_isAmazonHost = re.compile(r'www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').match
+# Amazon blocks HEAD requests at all
+_isAmazonHost = re.compile(r'^www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').search
+# Servers not supporting HEAD request (eg returning 404 errors)
+_isBrokenHeadServer = re.compile(r'Netscape-Enterprise/').search
+# Server not supporting anchors in urls (eg returning 404 errors)
+_isBrokenAnchorServer = re.compile(r'Microsoft-IIS/').search
 
 class HttpUrlData (ProxyUrlData):
     "Url link with http scheme"
-    netscape_re = re.compile("Netscape-Enterprise/")
 
 
     def buildUrl (self):
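Note: the host predicate switches from an unanchored .match to an explicit '^' with .search, matching the style of the two new predicates added below it. The two forms are functionally equivalent for single-line host strings, since .match is implicitly anchored at the start. A small illustrative check (the hostnames are examples only):

import re

# Same pattern as the diff; '^' + .search behaves like the old
# unanchored .match for single-line host strings.
_isAmazonHost = re.compile(r'^www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').search

for host in ('www.amazon.com', 'www.amazon.co.uk', 'images.amazon.com'):
    print(host, bool(_isAmazonHost(host)))
# www.amazon.com True
# www.amazon.co.uk True
# images.amazon.com False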
@@ -45,7 +49,7 @@ class HttpUrlData (ProxyUrlData):
         if not self.urlparts[2]:
             self.setWarning(i18n._("URL path is empty, assuming '/' as path"))
             self.urlparts[2] = '/'
-        self.url = urlparse.urlunsplit(self.urlparts[:4]+[self.anchor])
+        self.url = urlparse.urlunsplit(self.urlparts)
 
 
     def checkConnection (self):
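With the anchor now kept inside self.urlparts (see the UrlData hunk below), buildUrl can pass the parts straight to urlunsplit instead of splicing the saved anchor back in. A minimal sketch of that normalization, using Python 3's urllib.parse in place of the Python 2 urlparse module the diff targets:

from urllib.parse import urlsplit, urlunsplit

urlparts = list(urlsplit("http://example.com#intro"))
if not urlparts[2]:          # URL path is empty
    urlparts[2] = '/'        # assume '/' as path
print(urlunsplit(urlparts))  # http://example.com/#intro (anchor preserved)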
@@ -129,8 +133,6 @@ class HttpUrlData (ProxyUrlData):
         redirected = unquote(redirected)
         # note: urlparts has to be a list
         self.urlparts = list(urlparse.urlsplit(redirected))
-        # we saved the anchor already, this one gets removed
-        self.urlparts[4] = ''
         # new response data
         response = self._getHttpResponse()
         self.headers = response.msg
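The "urlparts has to be a list" note is load-bearing: urlsplit returns an immutable tuple-like result, so the parts must be copied into a list before any slot (such as the fragment) can be reassigned. A quick illustration in Python 3 terms:

from urllib.parse import urlsplit

parts = urlsplit("http://example.com/new#frag")
try:
    parts[4] = ''        # SplitResult is a tuple subclass
except TypeError as err:
    print(err)           # item assignment is not supported

urlparts = list(parts)   # mutable copy, as the code above makes
urlparts[4] = ''         # now allowed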
@@ -163,12 +165,16 @@ class HttpUrlData (ProxyUrlData):
             response = self._getHttpResponse("GET")
             self.headers = response.msg
         elif response.status>=400 and self.headers:
-            server = self.headers.getheader("Server")
-            if server and self.netscape_re.search(server):
-                self.setWarning(i18n._("Netscape Enterprise Server"
-                    " with no HEAD support, falling back to GET"))
+            server = self.headers.get('Server', '')
+            if _isBrokenHeadServer(server):
+                self.setWarning(i18n._("Server %s has no HEAD support, falling back to GET") % `server`)
                 response = self._getHttpResponse("GET")
                 self.headers = response.msg
+            elif _isBrokenAnchorServer(server):
+                self.setWarning(i18n._("Server %s has no anchor support, removing anchor from request") % `server`)
+                self.urlparts[4] = ''
+                response = self._getHttpResponse()
+                self.headers = response.msg
         elif self.headers:
             type = self.headers.gettype()
             poweredby = self.headers.get('X-Powered-By', '')
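This rewrite generalizes the old Netscape-only special case: a server whose Server header matches _isBrokenHeadServer gets the request retried as GET, and one matching _isBrokenAnchorServer gets the anchor dropped before retrying. A rough self-contained sketch of that decision flow, using Python 3's http.client instead of linkchecker's own _getHttpResponse (check_url and its helpers are illustrative assumptions, not the project's API):

import http.client
import re

_isBrokenHeadServer = re.compile(r'Netscape-Enterprise/').search
_isBrokenAnchorServer = re.compile(r'Microsoft-IIS/').search

def check_url(host, path):
    # hypothetical helper: issue HEAD, then fall back as the diff does
    def request(method, p):
        conn = http.client.HTTPConnection(host)
        conn.request(method, p)
        return conn.getresponse()

    response = request("HEAD", path)
    if response.status >= 400:
        server = response.getheader("Server", "")
        if _isBrokenHeadServer(server):
            # server mishandles HEAD: retry the same URL with GET
            response = request("GET", path)
        elif _isBrokenAnchorServer(server):
            # server 404s on anchors: strip the fragment and retry
            response = request("HEAD", path.split('#', 1)[0])
    return response.status

print(check_url("example.com", "/"))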
@@ -182,7 +188,7 @@ class HttpUrlData (ProxyUrlData):
             self.headers = response.msg
             if response.status not in [301,302]: break
 
-        effectiveurl = urlparse.urlunsplit(self.urlparts[:4]+[self.anchor])
+        effectiveurl = urlparse.urlunsplit(self.urlparts)
         if self.url != effectiveurl:
             self.setWarning(i18n._("Effective URL %s") % effectiveurl)
             self.url = effectiveurl
@@ -186,9 +186,8 @@ class UrlData:
                            % str(port))
         # set host lowercase and without userinfo
         self.urlparts[1] = host.lower()
-        # safe anchor for later checking and delete it from url parts
+        # safe anchor for later checking
         self.anchor = self.urlparts[4]
-        self.urlparts[4] = ''
 
 
     def logMe (self):
@@ -314,10 +313,10 @@ class UrlData:
         if self.urlparts:
             if self.config["noanchorcaching"]:
                 # removed anchor from cache key
-                return urlparse.urlunsplit(self.urlparts)
+                return urlparse.urlunsplit(self.urlparts[:4]+[''])
             else:
                 # do not ignore anchor
-                return urlparse.urlunsplit(self.urlparts[:4]+[self.anchor])
+                return urlparse.urlunsplit(self.urlparts)
         return None
 
 
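Since urlparts now retains the fragment, the cache-key logic inverts: the anchor must be actively stripped when noanchorcaching is set, while the full parts already carry it otherwise. Illustrative only, again with Python 3's urllib.parse:

from urllib.parse import urlsplit, urlunsplit

urlparts = list(urlsplit("http://example.com/docs/page.html#section-2"))

print(urlunsplit(urlparts[:4] + ['']))  # http://example.com/docs/page.html
print(urlunsplit(urlparts))             # http://example.com/docs/page.html#section-2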