fallback to GET

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1231 e7d03fd6-7b0d-0410-9947-9c21f3af8025
2026-05-08 14:44:46 +00:00 · 2004-01-29 21:20:28 +00:00 · 2004-01-29 21:20:28 +00:00 · 967cadaa26
commit 967cadaa26
parent 1d7091c880
1 changed files with 113 additions and 107 deletions
--- a/linkcheck/HttpUrlData.py
+++ b/linkcheck/HttpUrlData.py
@ -46,6 +46,8 @@ class HttpUrlData (ProxyUrlData):
 	                 parentName=parentName, baseRef=baseRef, line=line,
 		         column=column, name=name)
        self.aliases = []
+        self.max_redirects = 5
+        self.has301status = False


    def buildUrl (self):
@ -111,17 +113,19 @@ class HttpUrlData (ProxyUrlData):
            self.setWarning(i18n._("Access denied by robots.txt, checked only syntax"))
            return

-        # amazon servers suck
        if _isAmazonHost(self.urlparts[1]):
            self.setWarning(i18n._("Amazon servers block HTTP HEAD requests, "
                                   "using GET instead"))
-        # first try
+            self.method = "GET"
+        else:
+            # first try with HEAD
+            self.method = "HEAD"
+        fallback = False
        redirectCache = [self.url]
-        response = self._getHttpResponse()
-        self.headers = response.msg
-        debug(BRING_IT_ON, response.status, response.reason, self.headers)
-        has301status = False
-        while 1:
+        while True:
+            response = self._getHttpResponse()
+            self.headers = response.msg
+            debug(BRING_IT_ON, response.status, response.reason, self.headers)
            # proxy enforcement (overrides standard proxy)
            if response.status == 305 and self.headers:
                oldproxy = (self.proxy, self.proxyauth)
@ -130,126 +134,129 @@ class HttpUrlData (ProxyUrlData):
                response = self._getHttpResponse()
                self.headers = response.msg
                self.proxy, self.proxyauth = oldproxy
-            # follow redirections
-            tries = 0
-            redirected = self.url
-            while response.status in [301,302] and self.headers and tries < 5:
-                newurl = self.headers.getheader("Location",
-                             self.headers.getheader("Uri", ""))
-                redirected = unquote(urlparse.urljoin(redirected, newurl))
-                # note: urlparts has to be a list
-                self.urlparts = list(urlparse.urlsplit(redirected))
-                # check internal redirect cache to avoid recursion
-                if redirected in redirectCache:
-                    redirectCache.append(redirected)
-                    self.setError(
-                         i18n._("recursive redirection encountered:\n %s") % \
-                                "\n  => ".join(redirectCache))
-                    return
-                redirectCache.append(redirected)
-                # remember this alias
-                if response.status == 301:
-                    if not has301status:
-                        self.setWarning(i18n._("HTTP 301 (moved permanent) encountered: you "
-                                               "should update this link."))
-                        if not (self.url.endswith('/') or self.url.endswith('.html')):
-                            self.setWarning(i18n._("A HTTP 301 redirection occured and the url has no "
-                                                   "trailing / at the end. All urls which point to (home) "
-                                                   "directories should end with a / to avoid redirection."))
-                        has301status = True
-                    self.aliases.append(redirected)
-                # check cache again on possibly changed URL
-                key = self.getCacheKey()
-                if self.config.urlCache_has_key(key):
-                    self.copyFromCache(self.config.urlCache_get(key))
-                    self.cached = True
-                    self.logMe()
-                    return
-                # check if we still have a http url, it could be another
-                # scheme, eg https or news
-                if self.urlparts[0]!="http":
-                    self.setWarning(i18n._("HTTP redirection to non-http url encountered; "
-                                    "the original url was %r.")%self.url)
-                    # make new UrlData object
-                    newobj = GetUrlDataFrom(redirected, self.recursionLevel, self.config,
-                                            parentName=self.parentName, baseRef=self.baseRef,
-                                            line=self.line, column=self.column, name=self.name)
-                    newobj.warningString = self.warningString
-                    newobj.infoString = self.infoString
-                    # append new object to queue
-                    self.config.appendUrl(newobj)
-                    # pretend to be finished and logged
-                    self.cached = True
-                    return
-                # new response data
-                response = self._getHttpResponse()
-                self.headers = response.msg
-                debug(BRING_IT_ON, "Redirected", self.headers)
-                tries += 1
-            if tries >= 5:
-                self.setError(i18n._("more than five redirections, aborting"))
+            # follow all redirections
+            tries, response = self.followRedirections(response)
+            if tries == -1:
+                # already handled
+                return
+            if tries >= self.max_redirects:
+                self.setError(i18n._("more than %d redirections, aborting")%self.max_redirections)
                return
            # user authentication
-            if response.status==401:
+            if response.status == 401:
 	        if not self.auth:
                    import base64
                    _user, _password = self.getUserPassword()
                    self.auth = "Basic "+\
                        base64.encodestring("%s:%s" % (_user, _password))
-                response = self._getHttpResponse()
-                self.headers = response.msg
-                debug(BRING_IT_ON, "Authentication", _user, "/", _password)
-            # some servers get the HEAD request wrong:
-            # - Netscape Enterprise Server (no HEAD implemented, 404 error)
-            # - Hyperwave Information Server (501 error)
-            # - Apache/1.3.14 (Unix) (500 error, http://www.rhino3d.de/)
-            # - some advertisings (they want only GET, dont ask why ;)
-            # - Zope server (it has to render the page to get the correct
-            #   content-type)
-            elif response.status in [405,501,500]:
-                # HEAD method not allowed ==> try get
-                self.setWarning(i18n._("Server does not support HEAD "
-             "request (got %d status), falling back to GET")%response.status)
-                response = self._getHttpResponse("GET")
-                self.headers = response.msg
-            elif response.status>=400 and self.headers:
-                server = self.headers.get('Server', '')
-                if _isBrokenHeadServer(server):
-                    self.setWarning(i18n._("Server %r has no HEAD support, falling back to GET")%server)
-                    response = self._getHttpResponse("GET")
-                    self.headers = response.msg
-                elif _isBrokenAnchorServer(server):
-                    self.setWarning(i18n._("Server %r has no anchor support, removing anchor from request")%server)
-                    self.urlparts[4] = ''
-                    response = self._getHttpResponse()
-                    self.headers = response.msg
-            elif self.headers:
-                type = self.headers.gettype()
+                    debug(BRING_IT_ON, "Authentication", _user, "/", _password)
+                continue
+            elif response.status >= 400:
+                if self.headers:
+                    # test for anchor support
+                    server = self.headers.get('Server', '')
+                    if _isBrokenAnchorServer(server) and self.urlparts[4]:
+                        self.setWarning(i18n._("Server %r has no anchor support, removing anchor from request")%server)
+                        self.urlparts[4] = ''
+                        continue
+                if self.method=="HEAD":
+                    # fall back to GET
+                    self.method = "GET"
+                    fallback = True
+                    continue
+            elif self.headers and self.method!="GET":
+                # test for HEAD support
+                mime = self.headers.gettype()
                poweredby = self.headers.get('X-Powered-By', '')
                server = self.headers.get('Server', '')
-                if type=='application/octet-stream' and \
+                if mime=='application/octet-stream' and \
                   (poweredby.startswith('Zope') or \
                    server.startswith('Zope')):
                    self.setWarning(i18n._("Zope Server cannot determine"
                                " MIME type with HEAD, falling back to GET"))
-                    response = self._getHttpResponse("GET")
-                    self.headers = response.msg
-            if response.status not in [301,302]: break
-
+                    continue
+            break
        # check url warnings
        effectiveurl = urlparse.urlunsplit(self.urlparts)
        if self.url != effectiveurl:
            self.setWarning(i18n._("Effective URL %s") % effectiveurl)
            self.url = effectiveurl
        # check response
-        self.checkResponse(response)
+        self.checkResponse(response, fallback)


-    def checkResponse (self, response):
+    def followRedirections (self, response, redirectCache=None):
+        """Follow HTTP 301/302 redirections of the given response.
+
+        response is the response of the most recent request. While its
+        status is 301 or 302 (and self.headers is set), the target named
+        by the Location (or Uri) header is requested, at most
+        self.max_redirects times.
+
+        redirectCache is an optional list of the URLs visited so far; it
+        is used to detect redirection loops and is extended in place.
+        When omitted, a new list holding only self.url is used.
+
+        Returns a tuple (tries, response). tries is the number of
+        redirections that were followed, or -1 if the URL has been
+        completely handled here (recursion error, cache hit, or
+        redirection to a non-http URL) and the caller should stop.
+        """
+        if redirectCache is None:
+            # no cache supplied: start one with the original URL
+            redirectCache = [self.url]
+        redirected = self.url
+        tries = 0
+        while response.status in [301,302] and self.headers and \
+              tries < self.max_redirects:
+            # prefer the Location header, fall back to Uri
+            newurl = self.headers.getheader("Location",
+                         self.headers.getheader("Uri", ""))
+            # the new location may be relative to the previous one
+            redirected = unquote(urlparse.urljoin(redirected, newurl))
+            # note: urlparts has to be a list
+            self.urlparts = list(urlparse.urlsplit(redirected))
+            # check internal redirect cache to avoid recursion
+            if redirected in redirectCache:
+                redirectCache.append(redirected)
+                self.setError(
+                     i18n._("recursive redirection encountered:\n %s") % \
+                            "\n  => ".join(redirectCache))
+                return -1, response
+            redirectCache.append(redirected)
+            # remember this alias
+            if response.status == 301:
+                # warn only once per URL about permanent redirections
+                if not self.has301status:
+                    self.setWarning(i18n._("HTTP 301 (moved permanent) encountered: you "
+                                           "should update this link."))
+                    if not (self.url.endswith('/') or self.url.endswith('.html')):
+                        self.setWarning(i18n._("A HTTP 301 redirection occured and the url has no "
+                                               "trailing / at the end. All urls which point to (home) "
+                                               "directories should end with a / to avoid redirection."))
+                    self.has301status = True
+                self.aliases.append(redirected)
+            # check cache again on possibly changed URL
+            key = self.getCacheKey()
+            if self.config.urlCache_has_key(key):
+                self.copyFromCache(self.config.urlCache_get(key))
+                self.cached = True
+                self.logMe()
+                return -1, response
+            # check if we still have a http url, it could be another
+            # scheme, eg https or news
+            if self.urlparts[0]!="http":
+                self.setWarning(i18n._("HTTP redirection to non-http url encountered; "
+                                "the original url was %r.")%self.url)
+                # make new UrlData object
+                newobj = GetUrlDataFrom(redirected, self.recursionLevel, self.config,
+                                        parentName=self.parentName, baseRef=self.baseRef,
+                                        line=self.line, column=self.column, name=self.name)
+                # carry over the messages collected so far
+                newobj.warningString = self.warningString
+                newobj.infoString = self.infoString
+                # append new object to queue
+                self.config.appendUrl(newobj)
+                # pretend to be finished and logged
+                self.cached = True
+                return -1, response
+            # new response data
+            response = self._getHttpResponse()
+            self.headers = response.msg
+            debug(BRING_IT_ON, "Redirected", self.headers)
+            tries += 1
+        return tries, response
+
+
+    def checkResponse (self, response, fallback):
        """check final result"""
        if response.status >= 400:
            self.setError("%r %s"%(response.status, response.reason))
        else:
+            if fallback:
+                if self.headers and self.headers.has_key("Server"):
+                    server = self.headers['Server']
+                else:
+                    server = i18n._("unknown")
+                self.setWarning(i18n._("Server %r did not support HEAD request, used GET for checking")%server)
            if response.status == 204:
                # no content
                self.setWarning(response.reason)
@ -275,12 +282,10 @@ class HttpUrlData (ProxyUrlData):
        return keys


-    def _getHttpResponse (self, method="HEAD"):
+    def _getHttpResponse (self):
        """Put request and return (status code, status text, mime object).
           host can be host:port format
 	"""
-        if _isAmazonHost(self.urlparts[1]):
-            method = "GET"
        if self.proxy:
            host = self.proxy
            scheme = "http"
@ -299,7 +304,7 @@ class HttpUrlData (ProxyUrlData):
        else:
            path = urlparse.urlunsplit(('', '', qurlparts[2],
            qurlparts[3], qurlparts[4]))
-        self.urlConnection.putrequest(method, path, skip_host=True)
+        self.urlConnection.putrequest(self.method, path, skip_host=True)
        self.urlConnection.putheader("Host", host)
        # userinfo is from http://user@pass:host/
        if self.userinfo:
@ -337,10 +342,11 @@ class HttpUrlData (ProxyUrlData):

    def getContent (self):
        if not self.has_content:
+            self.method = "GET"
            self.has_content = True
            self.closeConnection()
            t = time.time()
-            response = self._getHttpResponse("GET")
+            response = self._getHttpResponse()
            self.headers = response.msg
            self.data = response.read()
            encoding = self.headers.get("Content-Encoding")