# -*- coding: iso-8859-1 -*- """Handle http links""" # Copyright (C) 2000-2004 Bastian Kleineidam # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. import urlparse import sys import time import re import zlib import gzip import socket import cStringIO as StringIO import linkcheck from linkcheck.debug import * supportHttps = hasattr(linkcheck.httplib2, "HTTPSConnection") and \ hasattr(socket, "ssl") linkcheck.UrlData.ExcList.extend([linkcheck.httplib2.error,]) _supported_encodings = ('gzip', 'x-gzip', 'deflate') # Amazon blocks all HEAD requests _isAmazonHost = re.compile(r'^www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').search class HttpUrlData (linkcheck.ProxyUrlData.ProxyUrlData): "Url link with http scheme" def __init__ (self, urlName, recursionLevel, config, parentName=None, baseRef=None, line=0, column=0, name=""): super(HttpUrlData, self).__init__(urlName, recursionLevel, config, parentName=parentName, baseRef=baseRef, line=line, column=column, name=name) self.aliases = [] self.max_redirects = 5 self.has301status = False self.no_anchor = False # remove anchor in request url def buildUrl (self): super(HttpUrlData, self).buildUrl() # encode userinfo # XXX # check for empty paths if not self.urlparts[2]: self.setWarning(linkcheck.i18n._("URL path is empty, assuming '/' as path")) self.urlparts[2] = '/' self.url = urlparse.urlunsplit(self.urlparts) def checkConnection (self): """ Check a URL with HTTP protocol. Here is an excerpt from RFC 1945 with common response codes: The first digit of the Status-Code defines the class of response. The last two digits do not have any categorization role. There are 5 values for the first digit: o 1xx: Informational - Not used, but reserved for future use o 2xx: Success - The action was successfully received, understood, and accepted. o 3xx: Redirection - Further action must be taken in order to complete the request o 4xx: Client Error - The request contains bad syntax or cannot be fulfilled o 5xx: Server Error - The server failed to fulfill an apparently valid request The individual values of the numeric status codes defined for HTTP/1.0, and an example set of corresponding Reason-Phrase's, are presented below. The reason phrases listed here are only recommended -- they may be replaced by local equivalents without affecting the protocol. These codes are fully defined in Section 9. Status-Code = "200" ; OK | "201" ; Created | "202" ; Accepted | "204" ; No Content | "301" ; Moved Permanently | "302" ; Moved Temporarily | "304" ; Not Modified | "305" ; Use Proxy | "400" ; Bad Request | "401" ; Unauthorized | "403" ; Forbidden | "404" ; Not Found | "405" ; Method not allowed | "407" ; Proxy Authentication Required | "500" ; Internal Server Error | "501" ; Not Implemented | "502" ; Bad Gateway | "503" ; Service Unavailable | extension-code """ # set the proxy, so a 407 status after this is an error self.setProxy(self.config["proxy"].get(self.scheme)) if self.proxy: self.setInfo(linkcheck.i18n._("Using Proxy %r")%self.proxy) self.headers = None self.auth = None self.cookies = [] if not self.robotsTxtAllowsUrl(): self.setWarning(linkcheck.i18n._("Access denied by robots.txt, checked only syntax")) return if _isAmazonHost(self.urlparts[1]): self.setWarning(linkcheck.i18n._("Amazon servers block HTTP HEAD requests, " "using GET instead")) self.method = "GET" else: # first try with HEAD self.method = "HEAD" fallback_GET = False redirectCache = [self.url] while True: try: response = self._getHttpResponse() except linkcheck.httplib2.BadStatusLine: # some servers send empty HEAD replies if self.method=="HEAD": self.method = "GET" redirectCache = [self.url] fallback_GET = True continue raise self.headers = response.msg debug(BRING_IT_ON, response.status, response.reason, self.headers) # proxy enforcement (overrides standard proxy) if response.status == 305 and self.headers: oldproxy = (self.proxy, self.proxyauth) self.setProxy(self.headers.getheader("Location")) self.setInfo(linkcheck.i18n._("Enforced Proxy %r")%self.proxy) response = self._getHttpResponse() self.headers = response.msg self.proxy, self.proxyauth = oldproxy # follow all redirections tries, response = self.followRedirections(response, redirectCache) if tries == -1: # already handled return if tries >= self.max_redirects: if self.method=="HEAD": # Microsoft servers tend to recurse HEAD requests self.method = "GET" redirectCache = [self.url] fallback_GET = True continue self.setError(linkcheck.i18n._("more than %d redirections, aborting")%self.max_redirects) return # user authentication if response.status == 401: if not self.auth: import base64 _user, _password = self.getUserPassword() self.auth = "Basic "+\ base64.encodestring("%s:%s" % (_user, _password)) debug(BRING_IT_ON, "Authentication", _user, "/", _password) continue elif response.status >= 400: if self.headers and self.urlparts[4]: self.no_anchor = True continue if self.method=="HEAD": # fall back to GET self.method = "GET" redirectCache = [self.url] fallback_GET = True continue elif self.headers and self.method!="GET": # test for HEAD support mime = self.headers.gettype() poweredby = self.headers.get('X-Powered-By', '') server = self.headers.get('Server', '') if mime=='application/octet-stream' and \ (poweredby.startswith('Zope') or \ server.startswith('Zope')): self.setWarning(linkcheck.i18n._("Zope Server cannot determine" " MIME type with HEAD, falling back to GET")) self.method = "GET" continue break # check url warnings effectiveurl = urlparse.urlunsplit(self.urlparts) if self.url != effectiveurl: self.setWarning(linkcheck.i18n._("Effective URL %s") % effectiveurl) self.url = effectiveurl # check response self.checkResponse(response, fallback_GET) def followRedirections (self, response, redirectCache): """follow all redirections of http response""" redirected = self.url tries = 0 while response.status in [301,302] and self.headers and \ tries < self.max_redirects: newurl = self.headers.getheader("Location", self.headers.getheader("Uri", "")) redirected = linkcheck.url.url_norm(urlparse.urljoin(redirected, newurl)) # note: urlparts has to be a list self.urlparts = list(urlparse.urlsplit(redirected)) # check internal redirect cache to avoid recursion if redirected in redirectCache: redirectCache.append(redirected) if self.method == "HEAD": # Microsoft servers tend to recurse HEAD requests # fall back to the original url and use GET self.urlparts = list(urlparse.urlsplit(self.url)) return self.max_redirects, response self.setError( linkcheck.i18n._("recursive redirection encountered:\n %s") % \ "\n => ".join(redirectCache)) return -1, response redirectCache.append(redirected) # remember this alias if response.status == 301: if not self.has301status: self.setWarning(linkcheck.i18n._("HTTP 301 (moved permanent) encountered: you " "should update this link.")) if not (self.url.endswith('/') or self.url.endswith('.html')): self.setWarning(linkcheck.i18n._("A HTTP 301 redirection occured and the url has no " "trailing / at the end. All urls which point to (home) " "directories should end with a / to avoid redirection.")) self.has301status = True self.aliases.append(redirected) # check cache again on possibly changed URL key = self.getCacheKey() if self.config.urlCache_has_key(key): self.copyFromCache(self.config.urlCache_get(key)) self.cached = True self.logMe() return -1, response # check if we still have a http url, it could be another # scheme, eg https or news if self.urlparts[0]!="http": self.setWarning(linkcheck.i18n._("HTTP redirection to non-http url encountered; " "the original url was %r.")%self.url) # make new UrlData object newobj = linkcheck.UrlData.GetUrlDataFrom(redirected, self.recursionLevel, self.config, parentName=self.parentName, baseRef=self.baseRef, line=self.line, column=self.column, name=self.name) newobj.warningString = self.warningString newobj.infoString = self.infoString # append new object to queue self.config.appendUrl(newobj) # pretend to be finished and logged self.cached = True return -1, response # new response data response = self._getHttpResponse() self.headers = response.msg debug(BRING_IT_ON, "Redirected", self.headers) tries += 1 return tries, response def checkResponse (self, response, fallback_GET): """check final result""" if response.status >= 400: self.setError("%r %s"%(response.status, response.reason)) else: if self.headers and self.headers.has_key("Server"): server = self.headers['Server'] else: server = linkcheck.i18n._("unknown") if fallback_GET: self.setWarning(linkcheck.i18n._("Server %r did not support HEAD request, used GET for checking")%server) if self.no_anchor: self.setWarning(linkcheck.i18n._("Server %r had no anchor support, removed anchor from request")%server) if response.status == 204: # no content self.setWarning(response.reason) # store cookies for valid links if self.config['cookies']: for c in self.cookies: self.setInfo("Cookie: %s"%c) out = self.config.storeCookies(self.headers, self.urlparts[1]) for h in out: self.setInfo(h) if response.status >= 200: self.setValid("%r %s"%(response.status,response.reason)) else: self.setValid("OK") modified = self.headers.get('Last-Modified', '') if modified: self.setInfo(linkcheck.i18n._("Last modified %s") % modified) def getCacheKeys (self): keys = super(HttpUrlData, self).getCacheKeys() keys.extend(self.aliases) return keys def _getHttpResponse (self): """Put request and return (status code, status text, mime object). host can be host:port format """ if self.proxy: host = self.proxy scheme = "http" else: host = self.urlparts[1] scheme = self.urlparts[0] debug(HURT_ME_PLENTY, "host", host) if self.urlConnection: self.closeConnection() self.urlConnection = self.getHTTPObject(host, scheme) # quote url before submit url = linkcheck.url.url_quote(urlparse.urlunsplit(self.urlparts)) qurlparts = list(urlparse.urlsplit(url)) if self.no_anchor: qurlparts[4] = '' if self.proxy: path = urlparse.urlunsplit(qurlparts) else: path = urlparse.urlunsplit(('', '', qurlparts[2], qurlparts[3], qurlparts[4])) self.urlConnection.putrequest(self.method, path, skip_host=True) self.urlConnection.putheader("Host", host) # userinfo is from http://user@pass:host/ if self.userinfo: self.urlConnection.putheader("Authorization", self.userinfo) # auth is the -u and -p configuration options elif self.auth: self.urlConnection.putheader("Authorization", self.auth) if self.proxyauth: self.urlConnection.putheader("Proxy-Authorization", self.proxyauth) if self.parentName: self.urlConnection.putheader("Referer", self.parentName) self.urlConnection.putheader("User-Agent", linkcheck.Config.UserAgent) self.urlConnection.putheader("Accept-Encoding", "gzip;q=1.0, deflate;q=0.9, identity;q=0.5") if self.config['cookies']: self.cookies = self.config.getCookies(self.urlparts[1], self.urlparts[2]) for c in self.cookies: self.urlConnection.putheader("Cookie", c) self.urlConnection.endheaders() return self.urlConnection.getresponse() def getHTTPObject (self, host, scheme): if scheme=="http": h = linkcheck.httplib2.HTTPConnection(host) elif scheme=="https": h = linkcheck.httplib2.HTTPSConnection(host) else: raise linkcheck.LinkCheckerError, "invalid url scheme %s" % scheme h.set_debuglevel(get_debuglevel()) h.connect() return h def getContent (self): if not self.has_content: self.method = "GET" self.has_content = True self.closeConnection() t = time.time() response = self._getHttpResponse() self.headers = response.msg self.data = response.read() encoding = self.headers.get("Content-Encoding") if encoding in _supported_encodings: try: if encoding == 'deflate': f = StringIO.StringIO(zlib.decompress(self.data)) else: f = gzip.GzipFile('', 'rb', 9, StringIO.StringIO(self.data)) except zlib.error: f = StringIO.StringIO(self.data) self.data = f.read() self.downloadtime = time.time() - t return self.data def isHtml (self): if not (self.valid and self.headers): return False if self.headers.gettype()[:9]!="text/html": return False encoding = self.headers.get("Content-Encoding") if encoding and encoding not in _supported_encodings and \ encoding!='identity': self.setWarning(linkcheck.i18n._('Unsupported content encoding %r.')%encoding) return False return True def isHttp (self): return True def getContentType (self): ptype = self.headers.get('Content-Type', 'application/octet-stream') if ";" in ptype: ptype = ptype.split(';')[0] return ptype def isParseable (self): if not (self.valid and self.headers): return False if self.getContentType() not in ("text/html", "text/css"): return False encoding = self.headers.get("Content-Encoding") if encoding and encoding not in _supported_encodings and \ encoding!='identity': self.setWarning(linkcheck.i18n._('Unsupported content encoding %r.')%encoding) return False return True def parseUrl (self): ptype = self.getContentType() if ptype=="text/html": self.parse_html() elif ptype=="text/css": self.parse_css() return None def getRobotsTxtUrl (self): return "%s://%s/robots.txt"%tuple(self.urlparts[0:2]) def robotsTxtAllowsUrl (self): roboturl = self.getRobotsTxtUrl() debug(HURT_ME_PLENTY, "robots.txt url", roboturl) debug(HURT_ME_PLENTY, "url", self.url) if not self.config.robotsTxtCache_has_key(roboturl): rp = linkcheck.robotparser2.RobotFileParser() rp.set_url(roboturl) rp.read() self.config.robotsTxtCache_set(roboturl, rp) rp = self.config.robotsTxtCache_get(roboturl) return rp.can_fetch(linkcheck.Config.UserAgent, self.url)