git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@25 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2000-03-08 12:01:51 +00:00
parent 262915f0af
commit 225a49df6d
8 changed files with 562 additions and 51 deletions

View file

@ -1,3 +1,8 @@
8.3.2000 Version 1.1.1
* FastCGI modules added (no script yet)
* CGI script fixes
* supply strict/non-strict flag for each external filtering rule
7.3.2000
* support for multiple user/password pairs

439
fcgi.py Normal file
View file

@ -0,0 +1,439 @@
#!/usr/bin/env python
#------------------------------------------------------------------------
# Copyright (c) 1998 by Total Control Software
# All Rights Reserved
#------------------------------------------------------------------------
#
# Module Name: fcgi.py
#
# Description: Handles communication with the FastCGI module of the
# web server without using the FastCGI developers kit, but
# will also work in a non-FastCGI environment, (straight CGI.)
# This module was originally fetched from someplace on the
# Net (I don't remember where and I can't find it now...) and
# has been significantly modified to fix several bugs, be more
# readable, more robust at handling large CGI data and return
# document sizes, and also to fit the model that we had previously
# used for FastCGI.
#
# WARNING: If you don't know what you are doing, don't tinker with this
# module!
#
# Creation Date: 1/30/98 2:59:04PM
#
# License: This is free software. You may use this software for any
# purpose including modification/redistribution, so long as
# this header remains intact and that you do not claim any
# rights of ownership or authorship of this software. This
# software has been tested, but no warranty is expressed or
# implied.
#
#------------------------------------------------------------------------
import os, sys, string, socket, errno
from cStringIO import StringIO
import cgi
#---------------------------------------------------------------------------
# Set various FastCGI constants
# Maximum number of requests that can be handled
# Maximum number of requests that can be handled
FCGI_MAX_REQS = 1
FCGI_MAX_CONNS = 1

# Supported version of the FastCGI protocol
FCGI_VERSION_1 = 1

# Boolean: can this application multiplex connections?
FCGI_MPXS_CONNS = 0

# Record types
FCGI_BEGIN_REQUEST = 1
FCGI_ABORT_REQUEST = 2
FCGI_END_REQUEST = 3
FCGI_PARAMS = 4
FCGI_STDIN = 5
FCGI_STDOUT = 6
FCGI_STDERR = 7
FCGI_DATA = 8
FCGI_GET_VALUES = 9
FCGI_GET_VALUES_RESULT = 10
FCGI_UNKNOWN_TYPE = 11
FCGI_MAXTYPE = FCGI_UNKNOWN_TYPE

# Types of management records
ManagementTypes = [FCGI_GET_VALUES]

# Request id carried by management records
FCGI_NULL_REQUEST_ID = 0

# Masks for flags component of FCGI_BEGIN_REQUEST
FCGI_KEEP_CONN = 1

# Values for role component of FCGI_BEGIN_REQUEST
FCGI_RESPONDER = 1
FCGI_AUTHORIZER = 2
FCGI_FILTER = 3

# Values for protocolStatus component of FCGI_END_REQUEST
FCGI_REQUEST_COMPLETE = 0               # Request completed nicely
FCGI_CANT_MPX_CONN = 1                  # This app can't multiplex
FCGI_OVERLOADED = 2                     # New request rejected; too busy
FCGI_UNKNOWN_ROLE = 3                   # Role value not known


class error(Exception):
    """Exception raised by this module.

    It is raised when a connection arrives from an address that is not
    listed in FCGI_WEB_SERVER_ADDRS.

    This is a class rather than the former 'fcgi.error' string because
    string exceptions are rejected by Python 2.6 and later.  Existing
    `raise error, msg` and `except error` statements keep working.
    """
    pass
#---------------------------------------------------------------------------
# The following function is used during debugging; it isn't called
# anywhere at the moment
def _error(msg):
"Append a string to /tmp/err"
errf=open('/tmp/err', 'a+')
errf.write(msg+'\n')
errf.close()
#---------------------------------------------------------------------------
class record:
    """Class representing FastCGI records.

    Every record has the attributes version, recType, reqId and content.
    readRecord() adds type-specific attributes decoded from the content:

      FCGI_BEGIN_REQUEST            role, flags
      FCGI_UNKNOWN_TYPE             unknownType
      FCGI_GET_VALUES, FCGI_PARAMS  values (dictionary name -> value)
      FCGI_END_REQUEST              appStatus, protocolStatus

    writeRecord() builds the content from those attributes for the same
    record types and sends self.content unchanged for all other types.
    """

    def __init__(self):
        self.version = FCGI_VERSION_1
        self.recType = FCGI_UNKNOWN_TYPE
        self.reqId = FCGI_NULL_REQUEST_ID
        self.content = ""

    #----------------------------------------
    def _recvExact(self, sock, count):
        """Read exactly `count` bytes from `sock` and return them.

        recv() may return fewer bytes than requested, so keep reading
        until the amount is complete.  Raises EOFError when the peer
        closes the connection first; without that check an empty read
        would make the caller loop forever.
        """
        chunks = []
        missing = count
        while missing > 0:
            data = sock.recv(missing)
            if not data:
                raise EOFError('connection closed while reading a FastCGI record')
            chunks.append(data)
            missing = missing - len(data)
        return string.joinfields(chunks, '')

    #----------------------------------------
    def readRecord(self, sock):
        """Read one record from `sock` and decode it into this object.

        Raises EOFError if the connection is closed before the complete
        record (header, content and padding) has arrived.
        """
        # 8-byte header: version, type, request id (2 bytes),
        # content length (2 bytes), padding length, reserved byte
        s = map(ord, self._recvExact(sock, 8))
        self.version, self.recType, paddingLength = s[0], s[1], s[6]
        self.reqId, contentLength = (s[2]<<8)+s[3], (s[4]<<8)+s[5]
        self.content = self._recvExact(sock, contentLength)
        if paddingLength != 0:
            # the padding carries no information; read it and drop it
            self._recvExact(sock, paddingLength)

        # Parse the content information
        c = self.content
        if self.recType == FCGI_BEGIN_REQUEST:
            self.role = (ord(c[0])<<8) + ord(c[1])
            self.flags = ord(c[2])
        elif self.recType == FCGI_UNKNOWN_TYPE:
            self.unknownType = ord(c[0])
        elif self.recType == FCGI_GET_VALUES or self.recType == FCGI_PARAMS:
            self.values = {}
            pos = 0
            while pos < len(c):
                name, value, pos = readPair(c, pos)
                self.values[name] = value
        elif self.recType == FCGI_END_REQUEST:
            b = map(ord, c[0:4])
            self.appStatus = (b[0]<<24) + (b[1]<<16) + (b[2]<<8) + b[3]
            self.protocolStatus = ord(c[4])

    #----------------------------------------
    def writeRecord(self, sock):
        """Encode this record and send all of it over `sock`.

        Raises ValueError if the content is longer than 65535 bytes,
        the largest length the two-byte header field can express.
        """
        content = self.content
        if self.recType == FCGI_BEGIN_REQUEST:
            content = chr(self.role>>8) + chr(self.role & 255) + chr(self.flags) + 5*'\000'
        elif self.recType == FCGI_UNKNOWN_TYPE:
            content = chr(self.unknownType) + 7*'\000'
        elif self.recType == FCGI_GET_VALUES or self.recType == FCGI_PARAMS:
            pairs = []
            for name in self.values.keys():
                pairs.append(writePair(name, self.values[name]))
            content = string.joinfields(pairs, '')
        elif self.recType == FCGI_END_REQUEST:
            v = self.appStatus
            content = chr((v>>24)&255) + chr((v>>16)&255) + chr((v>>8)&255) + chr(v&255)
            content = content + chr(self.protocolStatus) + 3*'\000'

        cLen = len(content)
        if cLen > 0xFFFF:
            raise ValueError('FastCGI record content too long: %d bytes' % cLen)
        # Pad to an 8-byte boundary.  The pad is computed from the low
        # three bits only, so it stays in 0..7 for every legal length.
        padLen = (8 - (cLen & 7)) & 7

        hdr = [ self.version,
                self.recType,
                self.reqId >> 8,
                self.reqId & 255,
                cLen >> 8,
                cLen & 255,
                padLen,
                0]
        hdr = string.joinfields(map(chr, hdr), '')

        # send() may accept only part of the data, so repeat until the
        # whole record has been handed to the socket
        data = hdr + content + padLen*'\000'
        while data:
            sent = sock.send(data)
            data = data[sent:]
#---------------------------------------------------------------------------
def _readLength(s, pos):
    """Decode one FastCGI length field of `s` starting at index `pos`.

    A first byte with the high bit clear is the length itself; with the
    high bit set, its low seven bits and the next three bytes form a
    31-bit big-endian length.  Returns (length, index after the field).
    """
    length = ord(s[pos])
    pos = pos + 1
    if length & 128:
        length = (((length & 127) << 24) + (ord(s[pos]) << 16)
                  + (ord(s[pos + 1]) << 8) + ord(s[pos + 2]))
        pos = pos + 3
    return length, pos


def readPair(s, pos):
    """Decode the name-value pair that starts at index `pos` of `s`.

    Returns the tuple (name, value, index just past the pair).
    """
    nameLen, pos = _readLength(s, pos)
    valueLen, pos = _readLength(s, pos)
    nameEnd = pos + nameLen
    valueEnd = nameEnd + valueLen
    return (s[pos:nameEnd], s[nameEnd:valueEnd], valueEnd)
#---------------------------------------------------------------------------
def _encodeLength(n):
    """Encode the length `n` as a FastCGI length field.

    Lengths below 128 take one byte; larger ones take four bytes,
    big-endian, with the high bit of the first byte set.
    """
    if n < 128:
        return chr(n)
    return (chr(128 | ((n >> 24) & 255)) + chr((n >> 16) & 255)
            + chr((n >> 8) & 255) + chr(n & 255))


def writePair(name, value):
    """Encode a name-value pair: both length fields, then name, then value."""
    return _encodeLength(len(name)) + _encodeLength(len(value)) + name + value
#---------------------------------------------------------------------------
def HandleManTypes(r, conn):
    """Answer the management record `r` over the connection `conn`.

    Only FCGI_GET_VALUES is handled: `r` is turned into an
    FCGI_GET_VALUES_RESULT record holding the value of every queried
    variable this module knows, and is written to `conn`.  Queried names
    that are not known are left out of the reply.  Records of any other
    type are ignored.
    """
    if r.recType == FCGI_GET_VALUES:
        r.recType = FCGI_GET_VALUES_RESULT
        known = {'FCGI_MAX_CONNS' : FCGI_MAX_CONNS,
                 'FCGI_MAX_REQS'  : FCGI_MAX_REQS,
                 'FCGI_MPXS_CONNS': FCGI_MPXS_CONNS}
        answer = {}
        content = ""
        for name in r.values.keys():
            if known.has_key(name):
                # name-value pairs carry strings, so convert the numbers
                answer[name] = str(known[name])
                content = content + writePair(name, answer[name])
        r.values = answer
        # writeRecord() sends self.content as is for this record type, so
        # the encoded reply has to be stored there; otherwise the bytes of
        # the query would be echoed back.
        r.content = content
        r.writeRecord(conn)
#---------------------------------------------------------------------------
#---------------------------------------------------------------------------
_isFCGI = 1 # assume it is until we find out for sure
def isFCGI():
global _isFCGI
return _isFCGI
#---------------------------------------------------------------------------
_init = None
_sock = None
class FCGI:
def __init__(self):
self.haveFinished = 0
if _init == None:
_startup()
if not isFCGI():
self.haveFinished = 1
self.inp, self.out, self.err, self.env = \
sys.stdin, sys.stdout, sys.stderr, os.environ
return
if os.environ.has_key('FCGI_WEB_SERVER_ADDRS'):
good_addrs=map(string.strip,
string.split(os.environ['FCGI_WEB_SERVER_ADDRS'], ','))
else:
good_addrs=None
self.conn, addr=_sock.accept()
stdin = data = ""
self.env = {}
self.requestId=0
remaining=1
# Check if the connection is from a legal address
if good_addrs!=None and addr not in good_addrs:
raise error, 'Connection from invalid server!'
while remaining:
r=record(); r.readRecord(self.conn)
if r.recType in ManagementTypes:
HandleManTypes(r, self.conn)
elif r.reqId==0:
# Oh, poopy. It's a management record of an unknown
# type. Signal the error.
r2=record()
r2.recType=FCGI_UNKNOWN_TYPE ; r2.unknownType=r.recType
r2.writeRecord(self.conn)
continue # Charge onwards
# Ignore requests that aren't active
elif r.reqId != self.requestId and r.recType != FCGI_BEGIN_REQUEST:
continue
# If we're already doing a request, ignore further BEGIN_REQUESTs
elif r.recType == FCGI_BEGIN_REQUEST and self.requestId != 0:
continue
# Begin a new request
if r.recType == FCGI_BEGIN_REQUEST:
self.requestId = r.reqId
if r.role == FCGI_AUTHORIZER: remaining=1
elif r.role == FCGI_RESPONDER: remaining=2
elif r.role == FCGI_FILTER: remaining=3
elif r.recType == FCGI_PARAMS:
if r.content == "":
remaining=remaining-1
else:
for i in r.values.keys():
self.env[i] = r.values[i]
elif r.recType == FCGI_STDIN:
if r.content == "":
remaining=remaining-1
else:
stdin=stdin+r.content
elif r.recType==FCGI_DATA:
if r.content == "":
remaining=remaining-1
else:
data=data+r.content
# end of while remaining:
self.inp = sys.stdin = StringIO(stdin)
self.err = sys.stderr = StringIO()
self.out = sys.stdout = StringIO()
self.data = StringIO(data)
def __del__(self):
self.Finish()
def Finish(self, status=0):
if not self.haveFinished:
self.haveFinished = 1
self.err.seek(0,0)
self.out.seek(0,0)
r=record()
r.recType = FCGI_STDERR
r.reqId = self.requestId
data = self.err.read()
while data:
chunk, data = self.getNextChunk(data)
r.content = chunk
r.writeRecord(self.conn)
r.content="" ; r.writeRecord(self.conn) # Terminate stream
r.recType = FCGI_STDOUT
data = self.out.read()
while data:
chunk, data = self.getNextChunk(data)
r.content = chunk
r.writeRecord(self.conn)
r.content="" ; r.writeRecord(self.conn) # Terminate stream
r=record()
r.recType=FCGI_END_REQUEST
r.reqId=self.requestId
r.appStatus=status
r.protocolStatus=FCGI_REQUEST_COMPLETE
r.writeRecord(self.conn)
self.conn.close()
def getFieldStorage(self):
method = 'GET'
if self.env.has_key('REQUEST_METHOD'):
method = string.upper(self.env['REQUEST_METHOD'])
if method == 'GET':
return cgi.FieldStorage(environ=self.env, keep_blank_values=1)
else:
return cgi.FieldStorage(fp=self.inp, environ=self.env, keep_blank_values=1)
def getNextChunk(self, data):
chunk = data[:8192]
data = data[8192:]
return chunk, data
Accept = FCGI # alias for backward compatibility
#---------------------------------------------------------------------------
def _startup():
global _init
_init = 1
try:
s=socket.fromfd(sys.stdin.fileno(), socket.AF_INET,
socket.SOCK_STREAM)
s.getpeername()
except socket.error, (err, errmsg):
if err!=errno.ENOTCONN: # must be a non-fastCGI environment
global _isFCGI
_isFCGI = 0
return
global _sock
_sock = s
#---------------------------------------------------------------------------
def _test():
    """Serve test pages for as long as isFCGI() is true.

    A request with a numeric `size` form field is answered with that
    many '*' characters.  Every other request gets an HTML page that
    shows the request count, the process id, the form fields (when
    CONTENT_LENGTH is set) and the environment.  If anything goes wrong
    the traceback is written to the file 'traceback' in the current
    directory and the loop ends.
    """
    counter = 0
    try:
        while isFCGI():
            req = FCGI()
            counter = counter+1

            try:
                fs = req.getFieldStorage()
                size = string.atoi(fs['size'].value)
                doc = ['*' * size]
            except:
                # Deliberately broad: a missing or malformed `size` field
                # simply selects the HTML page below.
                doc = ['<HTML><HEAD><TITLE>FCGI TestApp</TITLE></HEAD>\n<BODY>\n']
                doc.append('<H2>FCGI TestApp</H2><P>')
                doc.append('<b>request count</b> = %d<br>' % counter)
                doc.append('<b>pid</b> = %s<br>' % os.getpid())
                if req.env.has_key('CONTENT_LENGTH'):
                    cl = string.atoi(req.env['CONTENT_LENGTH'])
                    doc.append('<br><b>POST data (%s):</b><br><pre>' % cl)
                    keys = fs.keys()
                    keys.sort()
                    for k in keys:
                        val = fs[k]
                        # a repeated field comes back as a list of items
                        if type(val) == type([]):
                            shown = str(val)
                        else:
                            shown = str(val.value)
                        # Names and values come from the client; escape
                        # them so they cannot inject markup into the page.
                        doc.append('    <b>%-15s :</b>  %s\n'
                                   % (cgi.escape(k), cgi.escape(shown)))
                    doc.append('</pre>')

                doc.append('<P><HR><P><pre>')
                keys = req.env.keys()
                keys.sort()
                for k in keys:
                    # the environment holds client-supplied headers as well
                    doc.append('<b>%-20s :</b>  %s\n'
                               % (cgi.escape(k), cgi.escape(str(req.env[k]))))
                doc.append('\n</pre><P><HR>\n')
                doc.append('</BODY></HTML>\n')

            doc = string.join(doc, '')
            req.out.write('Content-length: %s\r\n'
                          'Content-type: text/html\r\n'
                          'Cache-Control: no-cache\r\n'
                          '\r\n'
                          % len(doc))
            req.out.write(doc)

            req.Finish()
    except:
        import traceback
        f = open('traceback', 'w')
        try:
            traceback.print_exc( file = f )
        finally:
            # close the file so the traceback is flushed to disk
            f.close()
#        f.write('%s' % doc)


if __name__=='__main__':
    #import pdb
    #pdb.run('_test()')
    _test()

59
lc.cgi
View file

@ -5,17 +5,16 @@ import re,cgi,sys,urlparse,time,os
# configuration
sys.stderr = sys.stdout
cgi_dir = "/home/calvin/public_html/cgi-bin"
dist_dir = "/home/calvin/linkchecker-1.1.0"
lc = pylice_dir + "/pylice"
dist_dir = "/home/calvin/projects/linkchecker"
sys.path.insert(0,dist_dir)
cgi.logfile = cgi_dir + "/lc.log"
cgi.logfile = cgi_dir + "/linkchecker.log" # must be an existing file
# end configuration
def testit():
cgi.test()
sys.exit(0)
def checkform():
def checkform(form):
for key in ["level","url"]:
if not form.has_key(key) or form[key].value == "": return 0
if not re.match(r"^http://[-\w./~]+$", form["url"].value): return 0
@ -29,27 +28,22 @@ def checkform():
if not form["intern"].value=="on": return 0
return 1
def getHostName():
return urlparse.urlparse(form["url"].value)[1]
def logit():
logfile = open("/home/calvin/log/linkchecker.log","a")
logfile.write("\n"+time.strftime("%d.%m.%Y %H:%M:%S", time.localtime(time.time()))+"\n")
def logit(form):
cgi.log("\n"+time.strftime("%d.%m.%Y %H:%M:%S", time.localtime(time.time())))
for var in ["HTTP_USER_AGENT","REMOTE_ADDR","REMOTE_HOST","REMOTE_PORT"]:
if os.environ.has_key(var):
logfile.write(var+"="+os.environ[var]+"\n")
cgi.log(var+"="+os.environ[var])
for key in ["level","url","anchors","errors","intern"]:
if form.has_key(key):
logfile.write(str(form[key])+"\n")
logfile.close()
cgi.log(str(form[key]))
def printError():
print """<html><head></head>
<body text="#192c83" bgcolor="#fff7e5" link="#191c83" vlink="#191c83"
alink="#191c83" >
<body text="#192c83" bgcolor="#fff7e5" link="#191c83" vlink="#191c83"
alink="#191c83">
<blockquote>
<b>Error</b><br>
The LinkChecker Online script has encountered an error. Please ensure
@ -60,28 +54,31 @@ Errors are logged.
</body>
</html>
"""
import linkcheck
# main
print "Content-type: text/html"
print "Cache-Control: no-cache"
print
#testit()
form = cgi.FieldStorage()
if not checkform():
logit()
if not checkform(form):
logit(form)
printError()
sys.exit(0)
args=["", "-H", "-r "+form["level"].value, "-s"]
if form.has_key("anchors"):
args.append("-a")
if not form.has_key("errors"):
args.append("-v")
config = linkcheck.Config.Configuration()
config["recursionlevel"] = int(form["level"].value)
config["log"] = linkcheck.Logging.HtmlLogger()
if form.has_key("anchors"): config["anchors"] = 1
if not form.has_key("errors"): config["verbose"] = 1
if form.has_key("intern"):
args.append("--intern=^(ftp|http)://"+getHostName())
config["internlinks"].append(re.compile("^(ftp|https?)://"+getHostName()))
else:
args.append("--extern=^file:")
args.append("--intern=.+")
config["internlinks"].append(re.compile(".+"))
# avoid checking of local files
config["externlinks"].append((re.compile("^file:"), 1))
args.append(form["url"].value)
sys.argv = args
execfile(lc)
# start checking
config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(form["url"].value, 0))
linkcheck.checkUrls(config)

View file

@ -308,7 +308,13 @@ class Configuration(UserDict.UserDict):
self.data["authentication"].append((re.compile(".*"), "anonymous", "guest@"))
section = "filtering"
try: self.data["externlinks"].append(re.compile(cfgparser.get(section, "externlinks")))
try:
i=1
while 1:
tuple = string.split(cfgparser.get(section, "extern"+`i`))
if len(tuple)!=2: break
self.data["externlinks"].append((re.compile(tuple[0]),
int(tuple[1])))
except: pass
try: self.data["internlinks"].append(re.compile(cfgparser.get(section, "internlinks")))
except: pass

View file

@ -97,7 +97,7 @@ class UrlData:
return
try:
self.buildUrl()
self.extern = self._isExtern(config)
self.extern = self._getExtern(config)
except:
type, value = sys.exc_info()[:2]
self.setError(str(value))
@ -114,7 +114,7 @@ class UrlData:
# apply filter
Config.debug("DEBUG: checking filter\n")
if config["strict"] and self.extern:
if self.extern and (config["strict"] or self.extern[1]):
self.setWarning("outside of domain filter, checked only syntax")
self.logMe(config)
return
@ -177,8 +177,8 @@ class UrlData:
return
self.setWarning("anchor #"+anchor+" not found")
def _isExtern(self, config):
if len(config["externlinks"])==0 and len(config["internlinks"])==0:
def _getExtern(self, config):
if not (config["externlinks"] or config["internlinks"]):
return 0
# deny and allow external checking
Config.debug(self.url)
@ -186,17 +186,17 @@ class UrlData:
for pat in config["internlinks"]:
if pat.search(self.url):
return 0
for pat in config["externlinks"]:
for pat, strict in config["externlinks"]:
if pat.search(self.url):
return 1
return (1, strict)
else:
for pat in config["externlinks"]:
for pat, strict in config["externlinks"]:
if pat.search(self.url):
return 1
return (1, strict)
for pat in config["internlinks"]:
if pat.search(self.url):
return 0
return 1
return (1,0)
def getContent(self):
"""Precondition: urlConnection is an opened URL.

View file

@ -42,7 +42,7 @@ OPTIONS
Default is no file output.
-p pwd, --password=pwd
Try given password for HTML and FTP authorization.
Default is 'joe@'. See -u.
Default is 'guest@'. See -u.
-P host[:port], --proxy=host[:port]
Use specified proxy for HTTP requests.
Standard port is 8080. Default is to use no proxy.
@ -78,8 +78,9 @@ o If you have your system configured to automatically establish a
connection to the internet (e.g. with diald), it will connect when
checking links not pointing to your local host.
Use the -s and -i options to prevent this (see EXAMPLES).
o Javascript and https links are currently ignored
o Javascript links are currently ignored
o If your platform does not support threading, linkchecker assumes -t0
o You can supply multiple user/password pairs in a configuration file
"""
Examples = """EXAMPLES
@ -110,8 +111,9 @@ def printUsage(msg):
# Read command line arguments
try:
# Note: cut out the name of the script
options, args = getopt.getopt(sys.argv[1:], "aDe:f:hi:lP:o:p:qr:Rst:u:VvwW:",
["anchors",
options, args = getopt.getopt(sys.argv[1:],
"aDe:f:hi:lP:o:p:qr:Rst:u:VvwW:", # short options
["anchors", # long options
"config=",
"debug",
"extern=",
@ -160,7 +162,7 @@ for opt,arg in options:
linkcheck.Config.DebugFlag = 1
elif opt=="-e" or opt=="--extern":
config["externlinks"].append(re.compile(arg))
config["externlinks"].append((re.compile(arg), 0))
elif opt=="-h" or opt=="--help":
printHelp()
@ -192,11 +194,11 @@ for opt,arg in options:
config["proxy"] = arg
elif opt=="-p" or opt=="--password":
_password=arg
constructAuth=1
_password = arg
constructauth = 1
elif opt=="-q" or opt=="--quiet":
config["quiet"]=1
config["quiet"] = 1
elif opt=="-r" or opt=="--recursion-level":
if int(arg) >= 0:

View file

@ -18,9 +18,11 @@ strict=0
#proxy=www-proxy.uni-sb.de
#proxyport=3128
# each external-link rule (extern1, extern2, ...) is a regular expression
# followed by a strict flag: 1 = strict, 0 = not strict
[filtering]
externlinks=
internlinks=
# avoid checking of local files (strict rule)
#extern1=^file:.* 1
#internlinks=
allowdeny=0
# You can provide different user/password pairs for different link types.

60
sz_fcgi.py Normal file
View file

@ -0,0 +1,60 @@
# sz_fcgi.py - Multithreaded FastCGI Wrapper
__version__ = "v0.8 19/10/1998 ajung"
__doc__ = "Multithreaded FastCGI Wrapper"
import sys,thread,fcgi
class SZ_FCGI:
    """Multithreaded FastCGI wrapper.

    `func` is called in a new thread for every request as
    func(wrapper, env, fieldstorage).  It can write output with
    wrapper.pr(...) and end the request early with wrapper.finish().
    """

    # Constructor
    def __init__(self,func):
        self.func = func
        self.handles = {}       # thread id -> request handled by that thread

    # Report a problem without touching the per-request streams
    def _log(self,msg):
        # fcgi replaces sys.stderr for every request, so prefer the
        # stream the process was started with when it is available
        out = getattr(sys,'__stderr__',sys.stderr)
        try:
            out.write('sz_fcgi: '+msg+'\n')
            out.flush()
        except (IOError,ValueError):
            # logging is best effort; an unusable stream must not
            # take the server down
            pass

    # create a new thread to handle requests
    def run(self):
        try:
            while fcgi.isFCGI():
                req = fcgi.FCGI()
                thread.start_new_thread(self.handle_request,(req,0))
        except:
            # any failure ends the accept loop; say why before returning
            self._log('request loop terminated: %s: %s' % sys.exc_info()[:2])

    # Finish thread and send all data back to the FCGI parent
    def finish(self):
        req = self.handles[thread.get_ident()]
        req.Finish()
        thread.exit()

    # Call function - handled by a thread
    def handle_request(self,*args):
        req = args[0]
        ident = thread.get_ident()
        self.handles[ident] = req
        try:
            try:
                self.func(self,req.env,req.getFieldStorage())
            except SystemExit:
                # finish() leaves the handler through thread.exit()
                pass
            except:
                self._log('request handler failed: %s: %s' % sys.exc_info()[:2])
        finally:
            try:
                try:
                    # Send the response even when the handler never called
                    # finish(); Finish() does nothing the second time.
                    req.Finish()
                except:
                    self._log('could not finish request: %s: %s' % sys.exc_info()[:2])
            finally:
                # forget the request so finished requests are not kept alive
                if self.handles.has_key(ident):
                    del self.handles[ident]

    # Our own FCGI print routine
    def pr(self,*args):
        req = self.handles[thread.get_ident()]
        try:
            s=''
            for i in args: s=s+str(i)
            req.out.write(s+'\n')
            req.out.flush()
        except:
            # output is best effort, exactly as before
            pass