linkchecker/linkcheck/HttpUrlData.py
2002-11-27 14:23:01 +00:00

290 lines
13 KiB
Python

"""Handle http links"""
# Copyright (C) 2000,2001 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
import urlparse, sys, time, re
import httplib
import Config, StringUtil, robotparser, linkcheck
# Turn on the robotparser module's debug flag whenever linkchecker
# itself runs with a debug level above zero.
if Config.DebugLevel > 0:
    robotparser.debug = 1
from ProxyUrlData import ProxyUrlData
from debuglevels import *
# Content-Encoding header values that getContent() decodes and that
# isHtml() accepts without warning.
_supported_encodings = ('gzip', 'x-gzip', 'deflate')
class HttpUrlData (ProxyUrlData):
    """Url link with http scheme.

    Checks the link with a HEAD request (falling back to GET for servers
    that mishandle HEAD), follows redirections, honours robots.txt and
    proxy settings from the configuration, and can download the content.
    """

    # matches the Server header of Netscape Enterprise servers, which
    # answer HEAD requests with an error status
    netscape_re = re.compile("Netscape-Enterprise/")

    def buildUrl (self):
        """Build the URL via ProxyUrlData.buildUrl() and normalize it.

        An empty path is replaced by "/" (a warning is recorded), and
        HTML entities in the resulting URL are resolved.
        """
        ProxyUrlData.buildUrl(self)
        if not self.urlTuple[2]:
            self.setWarning(linkcheck._("Path is empty"))
            self.urlTuple = (self.urlTuple[0], self.urlTuple[1], "/",
                self.urlTuple[3], self.urlTuple[4], self.urlTuple[5])
            # NOTE(review): the indentation of this file was lost; the
            # url is rebuilt here because the tuple just changed --
            # confirm it was not meant to run unconditionally
            self.url = urlparse.urlunparse(self.urlTuple)
        # resolve HTML entities
        self.url = StringUtil.unhtmlify(self.url)

    def checkConnection (self):
        """
        Check a URL with HTTP protocol.

        A HEAD request is sent first. Then, in a loop, the response is
        examined for proxy enforcement (305), redirections (301/302, at
        most 5 per round), required authentication (401) and signs of
        broken HEAD support (in which case a GET request is sent). The
        loop ends as soon as the response is no redirection. The outcome
        is recorded with setError() or setValid(); warnings and infos
        are recorded along the way. Nothing is returned.

        Here is an excerpt from RFC 1945 with common response codes:
        The first digit of the Status-Code defines the class of response. The
        last two digits do not have any categorization role. There are 5
        values for the first digit:
          o 1xx: Informational - Not used, but reserved for future use
          o 2xx: Success - The action was successfully received,
            understood, and accepted.
          o 3xx: Redirection - Further action must be taken in order to
            complete the request
          o 4xx: Client Error - The request contains bad syntax or cannot
            be fulfilled
          o 5xx: Server Error - The server failed to fulfill an apparently
            valid request
        The individual values of the numeric status codes defined for
        HTTP/1.0, and an example set of corresponding Reason-Phrase's, are
        presented below. The reason phrases listed here are only recommended
        -- they may be replaced by local equivalents without affecting the
        protocol. These codes are fully defined in Section 9.
        Status-Code = "200" ; OK
        | "201" ; Created
        | "202" ; Accepted
        | "204" ; No Content
        | "301" ; Moved Permanently
        | "302" ; Moved Temporarily
        | "304" ; Not Modified
        | "305" ; Use Proxy
        | "400" ; Bad Request
        | "401" ; Unauthorized
        | "403" ; Forbidden
        | "404" ; Not Found
        | "405" ; Method not allowed
        | "407" ; Proxy Authentication Required
        | "500" ; Internal Server Error
        | "501" ; Not Implemented
        | "502" ; Bad Gateway
        | "503" ; Service Unavailable
        | extension-code
        """
        # set the proxy, so a 407 status after this is an error
        self.setProxy(self.config["proxy"].get(self.scheme))
        if self.proxy:
            self.setInfo(linkcheck._("Using Proxy %s")%repr(self.proxy))
        self.headers = None
        self.auth = None
        self.cookies = []
        if self.config["robotstxt"] and not self.robotsTxtAllowsUrl():
            self.setWarning(linkcheck._("Access denied by robots.txt, checked only syntax"))
            return

        # first try
        response = self._getHttpResponse()
        self.headers = response.msg
        Config.debug(BRING_IT_ON, response.status, response.reason, self.headers)
        has301status = 0
        while 1:
            # proxy enforcement (overrides standard proxy)
            if response.status == 305 and self.headers:
                oldproxy = (self.proxy, self.proxyauth)
                self.setProxy(self.headers.getheader("Location"))
                self.setInfo(linkcheck._("Enforced Proxy %s")%repr(self.proxy))
                response = self._getHttpResponse()
                self.headers = response.msg
                # the enforced proxy is only valid for this one request
                self.proxy, self.proxyauth = oldproxy
            # follow redirections
            tries = 0
            redirected = self.urlName
            while response.status in [301,302] and self.headers and tries < 5:
                # remember a permanent redirection anywhere in the chain;
                # a later 302 must not hide an earlier 301
                if response.status == 301:
                    has301status = 1
                newurl = self.headers.getheader("Location",
                    self.headers.getheader("Uri", ""))
                redirected = urlparse.urljoin(redirected, newurl)
                self.urlTuple = urlparse.urlparse(redirected)
                response = self._getHttpResponse()
                self.headers = response.msg
                Config.debug(BRING_IT_ON, "Redirected", self.headers)
                tries += 1
            if tries >= 5:
                self.setError(linkcheck._("too much redirections (>= 5)"))
                return
            # user authentication
            if response.status==401:
                if not self.auth:
                    import base64
                    _user, _password = self._getUserPassword()
                    # encodestring() wraps long output and always appends
                    # a newline; a newline inside the header value would
                    # end the header block early, so remove them all
                    credentials = base64.encodestring(
                        "%s:%s" % (_user, _password)).replace("\n", "")
                    self.auth = "Basic "+credentials
                # NOTE(review): the indentation of this file was lost; the
                # retry is assumed to belong to the 401 branch itself, not
                # to the "if not self.auth" body -- confirm
                response = self._getHttpResponse()
                self.headers = response.msg
                Config.debug(BRING_IT_ON, "Authentication", _user, "/",
                    _password)
            # some servers get the HEAD request wrong:
            # - Netscape Enterprise Server (no HEAD implemented, 404 error)
            # - Hyperwave Information Server (501 error)
            # - Apache/1.3.14 (Unix) (500 error, http://www.rhino3d.de/)
            # - some advertisings (they want only GET, dont ask why ;)
            # - Zope server (it has to render the page to get the correct
            #   content-type)
            elif response.status in [405,501,500]:
                # HEAD method not allowed ==> try get
                self.setWarning(linkcheck._("Server does not support HEAD "
                    "request (got %d status), falling back to GET")%response.status)
                response = self._getHttpResponse("GET")
                self.headers = response.msg
            elif response.status>=400 and self.headers:
                server = self.headers.getheader("Server")
                if server and self.netscape_re.search(server):
                    self.setWarning(linkcheck._("Netscape Enterprise Server"
                        " with no HEAD support, falling back to GET"))
                    response = self._getHttpResponse("GET")
                    self.headers = response.msg
            elif self.headers:
                mimetype = self.headers.gettype()
                poweredby = self.headers.getheader('X-Powered-By')
                server = self.headers.getheader('Server')
                if mimetype=='application/octet-stream' and \
                   ((poweredby and poweredby[:4]=='Zope') or \
                    (server and server[:4]=='Zope')):
                    self.setWarning(linkcheck._("Zope Server cannot determine"
                        " MIME type with HEAD, falling back to GET"))
                    response = self._getHttpResponse("GET")
                    self.headers = response.msg
            # another round is only needed if we got redirected again
            if response.status not in [301,302]: break

        effectiveurl = urlparse.urlunparse(self.urlTuple)
        if self.url != effectiveurl:
            self.setWarning(linkcheck._("Effective URL %s") % effectiveurl)
            self.url = effectiveurl

        if has301status:
            self.setWarning(linkcheck._("HTTP 301 (moved permanent) encountered: you "
                "should update this link"))
            # endswith() instead of [-1] so an empty url cannot raise
            if not self.url.endswith('/'):
                self.setWarning(
                    linkcheck._("A HTTP 301 redirection occured and the url has no "
                    "trailing / at the end. All urls which point to (home) "
                    "directories should end with a / to avoid redirection"))
        # check final result
        if response.status >= 400:
            self.setError(repr(response.status)+" "+response.reason)
        else:
            if response.status == 204:
                # no content
                self.setWarning(response.reason)
            # store cookies for valid links
            if self.config['cookies']:
                for c in self.cookies:
                    self.setInfo("Cookie: %s"%c)
                out = self.config.storeCookies(self.headers, self.urlTuple[1])
                for h in out:
                    self.setInfo(h)
            if response.status >= 200:
                self.setValid(repr(response.status)+" "+response.reason)
            else:
                self.setValid("OK")

    def _getHttpResponse (self, method="HEAD"):
        """Send a request for the current url and return the response.

        method is the HTTP request method, "HEAD" by default. Any open
        connection is closed first and a new one is made to the proxy if
        one is set, otherwise to the url host (which can be in host:port
        format). Returns the object given by the connection's
        getresponse(); callers read its status, reason and msg.
        """
        if self.proxy:
            host = self.proxy
        else:
            host = self.urlTuple[1]
        Config.debug(HURT_ME_PLENTY, "host", host)
        if self.urlConnection:
            self.closeConnection()
        self.urlConnection = self._getHTTPObject(host)
        if self.proxy:
            # a proxy needs the complete url in the request line
            path = urlparse.urlunparse(self.urlTuple)
        else:
            path = urlparse.urlunparse(('', '', self.urlTuple[2],
                self.urlTuple[3], self.urlTuple[4], ''))
        self.urlConnection.putrequest(method, path, skip_host=1)
        # the Host header names the origin server, never the proxy we
        # happen to be connected to
        self.urlConnection.putheader("Host", self.urlTuple[1])
        if self.auth:
            self.urlConnection.putheader("Authorization", self.auth)
        if self.proxyauth:
            self.urlConnection.putheader("Proxy-Authorization",
                self.proxyauth)
        if self.parentName:
            self.urlConnection.putheader("Referer", self.parentName)
        self.urlConnection.putheader("User-Agent", Config.UserAgent)
        self.urlConnection.putheader("Accept-Encoding", "gzip;q=1.0, deflate;q=0.9, identity;q=0.5")
        if self.config['cookies']:
            self.cookies = self.config.getCookies(self.urlTuple[1],
                self.urlTuple[2])
            for c in self.cookies:
                self.urlConnection.putheader("Cookie", c)
        self.urlConnection.endheaders()
        return self.urlConnection.getresponse()

    def _getHTTPObject (self, host):
        """Return a connected httplib.HTTPConnection to host.

        The connection's debug level is taken from Config.DebugLevel.
        """
        h = httplib.HTTPConnection(host)
        h.set_debuglevel(Config.DebugLevel)
        h.connect()
        return h

    def getContent (self):
        """Return the content of the url, downloading it on first use.

        The first call fetches the data with a GET request, decodes it
        if the Content-Encoding is one of _supported_encodings, and
        stores the elapsed time in self.downloadtime. Later calls return
        the stored data.
        """
        if not self.has_content:
            self.has_content = 1
            self.closeConnection()
            t = time.time()
            response = self._getHttpResponse("GET")
            self.headers = response.msg
            self.data = response.read()
            encoding = self.headers.get("Content-Encoding")
            if encoding in _supported_encodings:
                if encoding == 'deflate':
                    import zlib
                    self.data = zlib.decompress(self.data)
                else:
                    # gzip or x-gzip
                    import gzip
                    from cStringIO import StringIO
                    f = gzip.GzipFile('', 'rb', 9, StringIO(self.data))
                    self.data = f.read()
            self.downloadtime = time.time() - t
        return self.data

    def isHtml (self):
        """Return 1 if the url is valid and has parseable HTML content.

        Returns 0 if the url is not valid, has no headers, is not of
        type text/html, or uses a content encoding that cannot be
        decoded (a warning is recorded in the last case).
        """
        if not (self.valid and self.headers):
            return 0
        if self.headers.gettype()[:9]!="text/html":
            return 0
        encoding = self.headers.get("Content-Encoding")
        if encoding and encoding not in _supported_encodings and \
           encoding!='identity':
            self.setWarning(linkcheck._('Unsupported content encoding %s.')%\
                repr(encoding))
            return 0
        return 1

    def robotsTxtAllowsUrl (self):
        """Return the robots.txt verdict for fetching this url.

        The parsed robots.txt of each scheme://host is kept in the
        configuration's robots.txt cache, so it is read only when it is
        not cached yet.
        """
        roboturl = "%s://%s/robots.txt" % self.urlTuple[0:2]
        Config.debug(HURT_ME_PLENTY, "robots.txt url", roboturl)
        Config.debug(HURT_ME_PLENTY, "url", self.url)
        if not self.config.robotsTxtCache_has_key(roboturl):
            rp = robotparser.RobotFileParser()
            rp.set_url(roboturl)
            rp.read()
            self.config.robotsTxtCache_set(roboturl, rp)
        rp = self.config.robotsTxtCache_get(roboturl)
        return rp.can_fetch(Config.UserAgent, self.url)