mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-22 00:40:30 +00:00
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@82 e7d03fd6-7b0d-0410-9947-9c21f3af8025
205 lines
8.2 KiB
Python
205 lines
8.2 KiB
Python
"""
|
|
Copyright (C) 2000 Bastian Kleineidam
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
"""
|
|
import http11lib,urlparse,sys,time,re
|
|
from UrlData import UrlData
|
|
from RobotsTxt import RobotsTxt
|
|
import Config,StringUtil
|
|
|
|
class HttpUrlData(UrlData):
    "Url link with http scheme"

    # Netscape Enterprise Server answers HEAD requests for valid URLs
    # with an error status; detected via the Server response header so
    # the check can be retried with GET (see checkConnection).
    netscape_re = re.compile("Netscape-Enterprise/")

    def checkConnection(self, config):
        """
        Check a URL with HTTP protocol.

        Issues a HEAD request and interprets the status code per
        RFC 1945. The first digit of the Status-Code defines the
        class of response:
          o 1xx: Informational - not used, reserved for future use
          o 2xx: Success - 200 OK, 201 Created, 202 Accepted,
                 204 No Content
          o 3xx: Redirection - 301 Moved Permanently,
                 302 Moved Temporarily, 304 Not Modified
          o 4xx: Client Error - 400 Bad Request, 401 Unauthorized,
                 403 Forbidden, 404 Not Found
          o 5xx: Server Error - 500 Internal Server Error,
                 501 Not Implemented, 502 Bad Gateway,
                 503 Service Unavailable

        Redirections are followed (at most 5 hops per round), a 401
        answer triggers one retry with Basic authentication, and error
        answers from Netscape Enterprise servers are retried with GET
        because that server mishandles HEAD requests.
        """
        self.mime = None
        self.auth = None
        self.proxy = config["proxy"]
        self.proxyport = config["proxyport"]
        if config["robotstxt"] and not self.robotsTxtAllowsUrl(config):
            self.setWarning("Access denied by robots.txt, checked only syntax")
            return

        # first try
        status, statusText, self.mime = self._getHttpRequest()
        Config.debug(str(status)+", "+str(statusText)+", "+str(self.mime)+"\n")
        has301status = 0
        while 1:
            # follow redirections
            tries = 0
            redirected = self.urlName
            while status in [301,302] and self.mime and tries < 5:
                # bugfix: remember that ANY hop was a permanent redirect;
                # a plain assignment would let a later 302 erase it
                if status==301:
                    has301status = 1
                redirected = urlparse.urljoin(redirected, self.mime.getheader("Location"))
                self.urlTuple = urlparse.urlparse(redirected)
                status, statusText, self.mime = self._getHttpRequest()
                Config.debug("\nRedirected\n"+str(self.mime))
                tries = tries + 1

            # authentication
            if status==401:
                if not self.auth:
                    import base64,string
                    _user, _password = self._getUserPassword(config)
                    self.auth = "Basic "+\
                        string.strip(base64.encodestring(_user+":"+_password))
                    # bugfix: debug only when credentials were just fetched;
                    # on a repeated 401 _user/_password would be unbound
                    Config.debug("Authentication "+_user+"/"+_password+"\n")
                status, statusText, self.mime = self._getHttpRequest()

            # Netscape Enterprise Server returns errors with HEAD
            # request, but valid urls with GET request. Bummer!
            elif status>=400 and self.mime:
                server = self.mime.getheader("Server")
                if server and self.netscape_re.search(server):
                    Config.debug("Netscape Enterprise Server detected\n")
                    status, statusText, self.mime = self._getHttpRequest("GET")
            if status not in [301,302]: break

        # warn if the URL we ended up at differs from the original
        effectiveurl = urlparse.urlunparse(self.urlTuple)
        if self.url != effectiveurl:
            self.setWarning("Effective URL "+effectiveurl)
            self.url = effectiveurl

        if has301status:
            self.setWarning("HTTP 301 (moved permanent) encountered: "
                            "you should update this link")
        # check final result
        if status >= 400:
            self.setError(str(status)+" "+statusText)
        else:
            if status == 204:
                # no content
                self.setWarning(statusText)
            if status >= 200:
                self.setValid(str(status)+" "+statusText)
            else:
                self.setValid("OK")

    def _getHttpRequest(self, method="HEAD"):
        """Put request and return (status code, status text, mime object).

        Goes through the configured proxy if one is set; any still-open
        connection is closed first. Sends Host, optional Authorization
        and User-agent headers.
        """
        if self.proxy:
            Config.debug("Using proxy "+self.proxy)
            host = self.proxy+":"+str(self.proxyport)
        else:
            host = self.urlTuple[1]
        if self.urlConnection:
            self.closeConnection()
        self.urlConnection = self._getHTTPObject(host)
        if self.proxy:
            # a proxy requires the absolute URL as the request path;
            # urlunparse already includes params and query string
            path = urlparse.urlunparse(self.urlTuple)
        else:
            path = self.urlTuple[2]
            # re-append parameters and query string stripped by urlparse
            if self.urlTuple[3]:
                path = path + ";" + self.urlTuple[3]
            if self.urlTuple[4]:
                path = path + "?" + self.urlTuple[4]
        self.urlConnection.putrequest(method, path)
        self.urlConnection.putheader("Host", self.urlTuple[1])
        if self.auth:
            self.urlConnection.putheader("Authorization", self.auth)
        self.urlConnection.putheader("User-agent", Config.UserAgent)
        self.urlConnection.endheaders()
        return self.urlConnection.getreply()

    def _getHTTPObject(self, host):
        "Factory for the HTTP connection object (overridable for https etc.)."
        return http11lib.HTTP(host)

    def getContent(self):
        """Download (on first call) and return the page content.

        Also records the download time in self.downloadtime and
        initializes HTML comment parsing state.
        """
        if not self.data:
            self.closeConnection()
            t = time.time()
            status, statusText, self.mime = self._getHttpRequest("GET")
            self.urlConnection = self.urlConnection.getfile()
            self.data = self.urlConnection.read()
            self.downloadtime = time.time() - t
            self._init_html_comments()
            #Config.debug(Config.DebugDelim+self.data+Config.DebugDelim)
        # bugfix: the content was computed but never returned
        return self.data

    def isHtml(self):
        "Return 1 if this (valid) URL serves text/html content, else 0."
        if not (self.valid and self.mime):
            return 0
        # some web servers (Zope) only know the mime-type when they have
        # to render the whole page. Before that, they return
        # "application/octet-stream"
        if self.mime.gettype()=="application/octet-stream":
            self.closeConnection()
            self.mime = self._getHttpRequest("GET")[2]
            if not self.mime: return 0
        return self.mime.gettype()=="text/html"

    def robotsTxtAllowsUrl(self, config):
        """Return 1 if robots.txt allows checking this URL, else 0.

        Fetched robots.txt objects are cached per host in the config.
        On any error the URL is allowed (best effort), after logging
        the exception.
        """
        try:
            host = self.urlTuple[1]
            if config.robotsTxtCache_has_key(host):
                robotsTxt = config.robotsTxtCache_get(host)
            else:
                robotsTxt = RobotsTxt(self.urlTuple, Config.UserAgent)
                Config.debug("DEBUG: "+str(robotsTxt)+"\n")
                config.robotsTxtCache_set(host, robotsTxt)
        except Exception:
            # deliberate best-effort: fall back to "allowed" on failure
            # (was a bare except that also shadowed the builtin `type`)
            value = sys.exc_info()[1]
            Config.debug("Heieiei: "+str(value)+"\n")
            return 1
        return robotsTxt.allowance(Config.UserAgent, self.urlTuple[2])

    def __str__(self):
        "Readable representation, prefixed with the scheme."
        return "HTTP link\n"+UrlData.__str__(self)

    def closeConnection(self):
        "Close the mime file object (best effort), then the connection."
        if self.mime:
            try:
                self.mime.close()
            except Exception:
                pass
            self.mime = None
        UrlData.closeConnection(self)