"""Handle http links""" # Copyright (C) 2000,2001 Bastian Kleineidam # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. import urlparse, sys, time, re import httplib import Config, StringUtil, robotparser, linkcheck if Config.DebugLevel > 0: robotparser.debug = 1 from ProxyUrlData import ProxyUrlData from debuglevels import * _supported_encodings = ('gzip', 'x-gzip', 'deflate') class HttpUrlData (ProxyUrlData): "Url link with http scheme" netscape_re = re.compile("Netscape-Enterprise/") def buildUrl (self): ProxyUrlData.buildUrl(self) if not self.urlTuple[2]: self.setWarning(linkcheck._("Path is empty")) self.urlTuple = (self.urlTuple[0], self.urlTuple[1], "/", self.urlTuple[3], self.urlTuple[4], self.urlTuple[5]) self.url = urlparse.urlunparse(self.urlTuple) # resolve HTML entities self.url = StringUtil.unhtmlify(self.url) def checkConnection (self): """ Check a URL with HTTP protocol. Here is an excerpt from RFC 1945 with common response codes: The first digit of the Status-Code defines the class of response. The last two digits do not have any categorization role. There are 5 values for the first digit: o 1xx: Informational - Not used, but reserved for future use o 2xx: Success - The action was successfully received, understood, and accepted. o 3xx: Redirection - Further action must be taken in order to complete the request o 4xx: Client Error - The request contains bad syntax or cannot be fulfilled o 5xx: Server Error - The server failed to fulfill an apparently valid request The individual values of the numeric status codes defined for HTTP/1.0, and an example set of corresponding Reason-Phrase's, are presented below. The reason phrases listed here are only recommended -- they may be replaced by local equivalents without affecting the protocol. These codes are fully defined in Section 9. Status-Code = "200" ; OK | "201" ; Created | "202" ; Accepted | "204" ; No Content | "301" ; Moved Permanently | "302" ; Moved Temporarily | "304" ; Not Modified | "305" ; Use Proxy | "400" ; Bad Request | "401" ; Unauthorized | "403" ; Forbidden | "404" ; Not Found | "405" ; Method not allowed | "407" ; Proxy Authentication Required | "500" ; Internal Server Error | "501" ; Not Implemented | "502" ; Bad Gateway | "503" ; Service Unavailable | extension-code """ # set the proxy, so a 407 status after this is an error self.setProxy(self.config["proxy"].get(self.scheme)) if self.proxy: self.setInfo(linkcheck._("Using Proxy %s")%`self.proxy`) self.headers = None self.auth = None self.cookies = [] if self.config["robotstxt"] and not self.robotsTxtAllowsUrl(): self.setWarning(linkcheck._("Access denied by robots.txt, checked only syntax")) return # first try response = self._getHttpResponse() self.headers = response.msg Config.debug(BRING_IT_ON, response.status, response.reason, self.headers) has301status = 0 while 1: # proxy enforcement (overrides standard proxy) if response.status == 305 and self.headers: oldproxy = (self.proxy, self.proxyauth) self.setProxy(self.headers.getheader("Location")) self.setInfo(linkcheck._("Enforced Proxy %s")%`self.proxy`) response = self._getHttpResponse() self.headers = response.msg self.proxy, self.proxyauth = oldproxy # follow redirections tries = 0 redirected = self.urlName while response.status in [301,302] and self.headers and tries < 5: has301status = (response.status==301) newurl = self.headers.getheader("Location", self.headers.getheader("Uri", "")) redirected = urlparse.urljoin(redirected, newurl) self.urlTuple = urlparse.urlparse(redirected) response = self._getHttpResponse() self.headers = response.msg Config.debug(BRING_IT_ON, "Redirected", self.headers) tries += 1 if tries >= 5: self.setError(linkcheck._("too much redirections (>= 5)")) return # user authentication if response.status==401: if not self.auth: import base64 _user, _password = self._getUserPassword() self.auth = "Basic "+\ base64.encodestring("%s:%s" % (_user, _password)) response = self._getHttpResponse() self.headers = response.msg Config.debug(BRING_IT_ON, "Authentication", _user, "/", _password) # some servers get the HEAD request wrong: # - Netscape Enterprise Server (no HEAD implemented, 404 error) # - Hyperwave Information Server (501 error) # - Apache/1.3.14 (Unix) (500 error, http://www.rhino3d.de/) # - some advertisings (they want only GET, dont ask why ;) # - Zope server (it has to render the page to get the correct # content-type) elif response.status in [405,501,500]: # HEAD method not allowed ==> try get self.setWarning(linkcheck._("Server does not support HEAD " "request (got %d status), falling back to GET")%response.status) response = self._getHttpResponse("GET") self.headers = response.msg elif response.status>=400 and self.headers: server = self.headers.getheader("Server") if server and self.netscape_re.search(server): self.setWarning(linkcheck._("Netscape Enterprise Server" " with no HEAD support, falling back to GET")) response = self._getHttpResponse("GET") self.headers = response.msg elif self.headers: type = self.headers.gettype() poweredby = self.headers.getheader('X-Powered-By') server = self.headers.getheader('Server') if type=='application/octet-stream' and \ ((poweredby and poweredby[:4]=='Zope') or \ (server and server[:4]=='Zope')): self.setWarning(linkcheck._("Zope Server cannot determine" " MIME type with HEAD, falling back to GET")) response = self._getHttpResponse("GET") self.headers = response.msg if response.status not in [301,302]: break effectiveurl = urlparse.urlunparse(self.urlTuple) if self.url != effectiveurl: self.setWarning(linkcheck._("Effective URL %s") % effectiveurl) self.url = effectiveurl if has301status: self.setWarning(linkcheck._("HTTP 301 (moved permanent) encountered: you " "should update this link")) if self.url[-1]!='/': self.setWarning( linkcheck._("A HTTP 301 redirection occured and the url has no " "trailing / at the end. All urls which point to (home) " "directories should end with a / to avoid redirection")) # check final result if response.status >= 400: self.setError(`response.status`+" "+response.reason) else: if response.status == 204: # no content self.setWarning(response.reason) # store cookies for valid links if self.config['cookies']: for c in self.cookies: self.setInfo("Cookie: %s"%c) out = self.config.storeCookies(self.headers, self.urlTuple[1]) for h in out: self.setInfo(h) if response.status >= 200: self.setValid(`response.status`+" "+response.reason) else: self.setValid("OK") def _getHttpResponse (self, method="HEAD"): """Put request and return (status code, status text, mime object). host can be host:port format """ if self.proxy: host = self.proxy else: host = self.urlTuple[1] Config.debug(HURT_ME_PLENTY, "host", host) if self.urlConnection: self.closeConnection() self.urlConnection = self._getHTTPObject(host) if self.proxy: path = urlparse.urlunparse(self.urlTuple) else: path = urlparse.urlunparse(('', '', self.urlTuple[2], self.urlTuple[3], self.urlTuple[4], '')) self.urlConnection.putrequest(method, path, skip_host=1) self.urlConnection.putheader("Host", host) if self.auth: self.urlConnection.putheader("Authorization", self.auth) if self.proxyauth: self.urlConnection.putheader("Proxy-Authorization", self.proxyauth) if self.parentName: self.urlConnection.putheader("Referer", self.parentName) self.urlConnection.putheader("User-Agent", Config.UserAgent) self.urlConnection.putheader("Accept-Encoding", "gzip;q=1.0, deflate;q=0.9, identity;q=0.5") if self.config['cookies']: self.cookies = self.config.getCookies(self.urlTuple[1], self.urlTuple[2]) for c in self.cookies: self.urlConnection.putheader("Cookie", c) self.urlConnection.endheaders() return self.urlConnection.getresponse() def _getHTTPObject (self, host): h = httplib.HTTPConnection(host) h.set_debuglevel(Config.DebugLevel) h.connect() return h def getContent (self): if not self.has_content: self.has_content = 1 self.closeConnection() t = time.time() response = self._getHttpResponse("GET") self.headers = response.msg self.data = response.read() encoding = self.headers.get("Content-Encoding") if encoding in _supported_encodings: from cStringIO import StringIO if encoding == 'deflate': import zlib f = StringIO(zlib.decompress(self.data)) else: import gzip f = gzip.GzipFile('', 'rb', 9, StringIO(self.data)) self.data = f.read() self.downloadtime = time.time() - t return self.data def isHtml (self): if not (self.valid and self.headers): return 0 if self.headers.gettype()[:9]!="text/html": return 0 encoding = self.headers.get("Content-Encoding") if encoding and encoding not in _supported_encodings and \ encoding!='identity': self.setWarning(linkcheck._('Unsupported content encoding %s.')%\ `encoding`) return 0 return 1 def robotsTxtAllowsUrl (self): roboturl = "%s://%s/robots.txt" % self.urlTuple[0:2] Config.debug(HURT_ME_PLENTY, "robots.txt url", roboturl) Config.debug(HURT_ME_PLENTY, "url", self.url) if not self.config.robotsTxtCache_has_key(roboturl): rp = robotparser.RobotFileParser() rp.set_url(roboturl) rp.read() self.config.robotsTxtCache_set(roboturl, rp) rp = self.config.robotsTxtCache_get(roboturl) return rp.can_fetch(Config.UserAgent, self.url)