linkchecker/linkcheck/HttpUrlData.py
2000-03-20 20:19:34 +00:00

166 lines
6.4 KiB
Python

import http11lib,urlparse,sys,time
from UrlData import UrlData
from RobotsTxt import RobotsTxt
import Config,StringUtil
class HttpUrlData(UrlData):
"Url link with http scheme"
def checkConnection(self, config):
"""
Check a URL with HTTP protocol.
Here is an excerpt from RFC 1945 with common response codes:
The first digit of the Status-Code defines the class of response. The
last two digits do not have any categorization role. There are 5
values for the first digit:
o 1xx: Informational - Not used, but reserved for future use
o 2xx: Success - The action was successfully received,
understood, and accepted.
o 3xx: Redirection - Further action must be taken in order to
complete the request
o 4xx: Client Error - The request contains bad syntax or cannot
be fulfilled
o 5xx: Server Error - The server failed to fulfill an apparently
valid request
The individual values of the numeric status codes defined for
HTTP/1.0, and an example set of corresponding Reason-Phrase's, are
presented below. The reason phrases listed here are only recommended
-- they may be replaced by local equivalents without affecting the
protocol. These codes are fully defined in Section 9.
Status-Code = "200" ; OK
| "201" ; Created
| "202" ; Accepted
| "204" ; No Content
| "301" ; Moved Permanently
| "302" ; Moved Temporarily
| "304" ; Not Modified
| "400" ; Bad Request
| "401" ; Unauthorized
| "403" ; Forbidden
| "404" ; Not Found
| "500" ; Internal Server Error
| "501" ; Not Implemented
| "502" ; Bad Gateway
| "503" ; Service Unavailable
| extension-code
"""
self.mime = None
self.auth = None
self.proxy = config["proxy"]
self.proxyport = config["proxyport"]
if config["robotstxt"] and not self.robotsTxtAllowsUrl(config):
self.setWarning("Access denied by robots.txt, checked only syntax")
return
status, statusText, self.mime = self._getHttpRequest()
Config.debug(str(status)+", "+str(statusText)+", "+str(self.mime)+"\n")
# follow redirections and set self.url to the effective url
tries = 0
redirected = self.urlName
while status in [301,302] and self.mime and tries < 5:
redirected = urlparse.urljoin(redirected, self.mime.getheader("Location"))
self.urlTuple = urlparse.urlparse(redirected)
status, statusText, self.mime = self._getHttpRequest()
Config.debug("\nRedirected\n"+str(self.mime))
tries = tries + 1
effectiveurl = urlparse.urlunparse(self.urlTuple)
if self.url != effectiveurl:
self.setWarning("Effective URL "+effectiveurl)
self.url = effectiveurl
#
# authentication
if status == 401:
import base64
_user, _password = self._getUserPassword(config)
status, statusText, self.mime = self._getHttpRequest("HEAD",
"Basic "+string.strip(base64.encodestring(_user+":"+_password)))
#
# check final result
if status >= 400:
self.setError(`status`+" "+statusText)
else:
if status == 204:
self.setWarning(statusText)
if status >= 200:
self.setValid(`status`+" "+statusText)
else:
self.setValid("OK")
def _getHttpRequest(self, method="HEAD", auth=None):
"Put request and return (status code, status text, mime object)"
if self.proxy:
host = self.proxy+":"+`self.proxyport`
else:
host = self.urlTuple[1]
if self.urlConnection:
self.closeConnection()
self.urlConnection = self._getHTTPObject(host)
if self.proxy:
path = urlparse.urlunparse(self.urlTuple)
else:
path = self.urlTuple[2]
if self.urlTuple[3]:
path = path + ";" + self.urlTuple[3]
if self.urlTuple[4]:
path = path + "?" + self.urlTuple[4]
self.urlConnection.putrequest(method, path)
self.urlConnection.putheader("Host", self.urlTuple[1])
if auth:
self.urlConnection.putheader("Authorization", auth)
self.urlConnection.putheader("User-agent", Config.UserAgent)
self.urlConnection.endheaders()
return self.urlConnection.getreply()
def _getHTTPObject(self, host):
return http11lib.HTTP(host)
def getContent(self):
self.closeConnection()
t = time.time()
self._getHttpRequest("GET")
self.urlConnection = self.urlConnection.getfile()
data = StringUtil.stripHtmlComments(self.urlConnection.read())
self.downloadtime = time.time() - t
Config.debug(Config.DebugDelim+data+Config.DebugDelim)
return data
def isHtml(self):
if not (self.valid and self.mime):
return 0
# some web servers (Zope) only know the mime-type when they have
# to render the whole page. Before that, they return
# "application/octet-stream"
if self.mime.gettype()=="application/octet-stream":
self.closeConnection()
self.mime = self._getHttpRequest("GET")[2]
if not self.mime: return 0
return self.mime.gettype()=="text/html"
def robotsTxtAllowsUrl(self, config):
try:
if config.robotsTxtCache_has_key(self.urlTuple[1]):
robotsTxt = config.robotsTxtCache_get(self.urlTuple[1])
else:
robotsTxt = RobotsTxt(self.urlTuple, Config.UserAgent)
Config.debug("DEBUG: "+str(robotsTxt)+"\n")
config.robotsTxtCache_set(self.urlTuple[1], robotsTxt)
except:
type, value = sys.exc_info()[:2]
Config.debug("Heieiei: "+str(value)+"\n")
return 1
return robotsTxt.allowance(Config.UserAgent, self.urlTuple[2])
def __str__(self):
return "HTTP link\n"+UrlData.__str__(self)
def closeConnection(self):
if self.mime:
try: self.mime.close()
except: pass
self.mime = None
UrlData.closeConnection(self)