mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-08 14:44:46 +00:00
HTTPS support
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@17 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
c6ae3ba589
commit
d766250a00
16 changed files with 711 additions and 62 deletions
|
|
@ -1,3 +1,5 @@
|
|||
build-stamp
|
||||
sample.html
|
||||
linkchecker-out.*
|
||||
*-out.*
|
||||
*.so
|
||||
*.o
|
||||
|
|
|
|||
|
|
@ -1,10 +1,14 @@
|
|||
29.2.2000
|
||||
* GML output additions
|
||||
* HTTPS support
|
||||
|
||||
28.2.2000
|
||||
* the patched PyLR parser generator works
|
||||
* wrote a GML parser
|
||||
|
||||
25.2.2000
|
||||
* changed the name to LinkChecker. My old Java LinkChecker will
|
||||
disappear because I do not maintain it anymore.
|
||||
disappear because I do not maintain it anymore
|
||||
|
||||
21.2.2000
|
||||
* add -q, --quiet option
|
||||
|
|
|
|||
15
INSTALL
15
INSTALL
|
|
@ -11,8 +11,12 @@ Unix Users:
|
|||
1. Edit the file linkchecker.
|
||||
Adjust the argument to sys.path.append to point to the distribution
|
||||
directory.
|
||||
2. Copy linkchecker to a location in your PATH (or make a symlink).
|
||||
3. Check links happily by typing `linkchecker`.
|
||||
2. HTTPS support (optional, you need SSLeay)
|
||||
Adjust the paths at the top of the Makefile
|
||||
Type "make" to produce the SSL module
|
||||
3. Copy linkchecker to a location in your PATH (or make a symlink).
|
||||
4. Check links happily by typing `linkchecker`.
|
||||
|
||||
|
||||
Windows Users:
|
||||
1. Edit the file linkchecker.
|
||||
|
|
@ -21,8 +25,11 @@ Windows Users:
|
|||
2. Edit the file linkchecker.bat.
|
||||
a) Adjust the PYTHON variable to point to python.exe.
|
||||
b) Adjust the LINKCHECKER variable to point to the distribution directory.
|
||||
3. Add the distribution directory to your PATH.
|
||||
4. Check links happily by typing `linkchecker.bat`.
|
||||
3. HTTPS support (optional, you need SSLeay)
|
||||
Compile ssl.dll from ssl.c
|
||||
4. Add the distribution directory to your PATH.
|
||||
5. Check links happily by typing `linkchecker.bat`.
|
||||
|
||||
|
||||
You need Python >= 1.5.2
|
||||
You get Python from http://www.python.org
|
||||
|
|
|
|||
24
Makefile
24
Makefile
|
|
@ -1,4 +1,14 @@
|
|||
VERSION=0.9.0
|
||||
PY_INCLDIR = -I/usr/include/python1.5
|
||||
PY_LIBDIR = -L/usr/lib
|
||||
SSL_INCLDIR = -I/usr/include/openssl
|
||||
SSL_LIBDIR = -L/usr/lib
|
||||
|
||||
CC = gcc
|
||||
CFLAGS = -O2 -Wall
|
||||
LDFLAGS = -shared $(SSL_LIBDIR) $(PY_LIBDIR)
|
||||
CPPFLAGS = $(SSL_INCLDIR) $(PY_INCLDIR)
|
||||
|
||||
VERSION=1.1.0
|
||||
HOST=treasure.calvinsplayground.de
|
||||
#HOST=fsinfo.cs.uni-sb.de
|
||||
PACKAGE = linkchecker
|
||||
|
|
@ -9,12 +19,14 @@ ALLPACKAGES = ../$(BZ2PACKAGE) ../$(DEBPACKAGE) ../$(ZIPPACKAGE)
|
|||
.PHONY: test clean files install all
|
||||
TAR = tar
|
||||
ZIP = zip
|
||||
prefix = /usr/local
|
||||
|
||||
all:
|
||||
all: ssl.so
|
||||
|
||||
ssl.so: ssl.o
|
||||
$(CC) $(LDFLAGS) -o $@ $? -lssl -lcrypto -lpython1.5
|
||||
|
||||
clean:
|
||||
rm -f $(ALLPACKAGES) $(PACKAGE)-out.*
|
||||
rm -f ssl.{so,o} $(ALLPACKAGES) $(PACKAGE)-out.*
|
||||
|
||||
files: all
|
||||
./$(PACKAGE) -q -Wtext -Whtml -Wgml -Wsql -R -r2 -v -i "$(HOST)" http://$(HOST)/~calvin/
|
||||
|
|
@ -22,7 +34,7 @@ files: all
|
|||
install: install-dirs
|
||||
install -m644 linkcheck/*.py? $(DESTDIR)/usr/share/$(PACKAGE)/linkcheck
|
||||
install -m644 DNS/*.py? $(DESTDIR)/usr/share/$(PACKAGE)/DNS
|
||||
install -m644 *.py? $(DESTDIR)/usr/share/$(PACKAGE)
|
||||
install -m644 ssl.so *.py? $(DESTDIR)/usr/share/$(PACKAGE)
|
||||
install -m755 $(PACKAGE) $(DESTDIR)/usr/bin
|
||||
install -m644 $(PACKAGE)rc $(DESTDIR)/etc
|
||||
|
||||
|
|
@ -30,8 +42,6 @@ install-dirs:
|
|||
install -d -m755 \
|
||||
$(DESTDIR)/usr/share/$(PACKAGE)/linkcheck \
|
||||
$(DESTDIR)/usr/share/$(PACKAGE)/DNS \
|
||||
$(DESTDIR)/usr/share/$(PACKAGE)/GML \
|
||||
$(DESTDIR)/usr/share/$(PACKAGE)/PyLR \
|
||||
$(DESTDIR)/usr/bin \
|
||||
$(DESTDIR)/etc
|
||||
|
||||
|
|
|
|||
9
README
9
README
|
|
@ -6,8 +6,9 @@ Features:
|
|||
o recursive checking
|
||||
o multithreaded
|
||||
o output can be colored or normal text, HTML, SQL or a GML sitemap graph
|
||||
o HTTP, FTP, mailto:, Gopher, Telnet and local file links are supported
|
||||
Javascript and HTTPS links are currently ignored
|
||||
o HTTP/1.1, HTTPS, FTP, mailto:, Gopher, Telnet and local file links
|
||||
are supported
|
||||
Javascript links are currently ignored
|
||||
o restrict link checking to your local domain
|
||||
o HTTP proxy support
|
||||
o give username/password for HTTP and FTP authorization
|
||||
|
|
@ -23,3 +24,7 @@ robots.txt parse algorithm.
|
|||
I want to thank everybody who gave me feedback, bug reports and
|
||||
suggestions.
|
||||
|
||||
Included packages:
|
||||
httpslib from http://home.att.net/~nvsoft1/ssl_wrapper.html
|
||||
PyLR parser from http://starship.python.net/crew/scott/PyLR.html
|
||||
DNS see README.dns
|
||||
|
|
|
|||
6
TODO
6
TODO
|
|
@ -1,8 +1,6 @@
|
|||
Use leading '_' for private functions.
|
||||
|
||||
Is there a way to cleanly stop arbitrary Thread objects
|
||||
(with exit handler)? Mail me solutions!
|
||||
|
||||
Write a graph layout algorithm.
|
||||
configure script and Debian package cleanups
|
||||
|
||||
Write a little tool to produce an image of the GML output.
|
||||
SSL support
|
||||
|
|
|
|||
170
httpslib.py
Normal file
170
httpslib.py
Normal file
|
|
@ -0,0 +1,170 @@
|
|||
# @(#)httpslib.py 1.1 VMS-99/01/30 https support
|
||||
|
||||
import ssl,httplib
import mimetools,socket,string
|
||||
|
||||
HTTP_PREF = 'HTTP/'
|
||||
HTTPS_PORT = 443
|
||||
|
||||
class HTTPS(httplib.HTTP):
|
||||
|
||||
def connect (self, host, port = 0):
|
||||
"""Connect to a host on a given port.
|
||||
|
||||
Note: This method is automatically invoked by __init__,
|
||||
if a host is specified during instantiation.
|
||||
|
||||
"""
|
||||
if not port:
|
||||
i = string.find(host, ':')
|
||||
if i >= 0:
|
||||
host, port = host[:i], host[i+1:]
|
||||
try: port = string.atoi(port)
|
||||
except string.atoi_error:
|
||||
raise socket.error, "nonnumeric port"
|
||||
if not port: port = HTTPS_PORT
|
||||
self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||
if self.debuglevel > 0: print 'connect:', (host, port)
|
||||
self.sock.connect(host, port)
|
||||
self.ssl = ssl.ssl(self.sock.fileno())
|
||||
|
||||
def send (self, str):
|
||||
if self.debuglevel > 0: print 'send:', `str`
|
||||
self.ssl.write(str,len(str))
|
||||
|
||||
def makefile (self, mode='r', bufsize=-1):
|
||||
return _fileobject(self.sock,self.ssl,mode,bufsize)
|
||||
|
||||
def getreply (self):
|
||||
self.file = self.makefile('rb')
|
||||
# self.sock = None
|
||||
line = self.file.readline()
|
||||
if self.debuglevel > 0: print 'reply:',`line`
|
||||
try:
|
||||
[ver,code,msg] = string.split(line,None,2)
|
||||
except ValueError:
|
||||
try:
|
||||
[ver,code] = string.split(line,None,1)
|
||||
msg = ""
|
||||
except ValueError:
|
||||
ver = ""
|
||||
if ver[:len(HTTP_PREF)] != HTTP_PREF:
|
||||
self.headers = None
|
||||
return -1, line, self.headers
|
||||
self.headers = mimetools.Message(self.file,0)
|
||||
return string.atoi(code), string.strip(msg), self.headers
|
||||
|
||||
def close (self):
|
||||
if self.file:
|
||||
self.file.close()
|
||||
self.file = self.sock = self.ssl = None
|
||||
|
||||
class _fileobject:
|
||||
|
||||
def __init__ (self, sock, ssl, mode, bufsize):
|
||||
import string
|
||||
self._sock = sock
|
||||
self._ssl = ssl
|
||||
self._mode = mode
|
||||
if bufsize < 0:
|
||||
bufsize = 512
|
||||
self._rbufsize = max(1,bufsize)
|
||||
self._wbufsize = bufsize
|
||||
self._wbuf = self._rbuf = ""
|
||||
|
||||
def close (self):
|
||||
try:
|
||||
if self._sock:
|
||||
self.flush()
|
||||
finally:
|
||||
self._sock = None
|
||||
|
||||
def __del__ (self):
|
||||
self.close()
|
||||
|
||||
def flush (self):
|
||||
if self._wbuf:
|
||||
self._sock.write(self._wbuf,len(self._wbuf))
|
||||
self._wbuf = ""
|
||||
|
||||
def fileno (self):
|
||||
return self._sock.fileno()
|
||||
|
||||
def write (self, data):
|
||||
self._wbuf = self._wbuf + data
|
||||
if self._wbufsize == 1:
|
||||
if '\n' in data:
|
||||
self.flush()
|
||||
else:
|
||||
if len(self._wbuf) >= self._wbufsize:
|
||||
self.flush()
|
||||
|
||||
def writelines (self, lst):
|
||||
filter(self._sock.send,lst)
|
||||
self.flush()
|
||||
|
||||
def read (self, n=-1):
|
||||
if n >= 0:
|
||||
while len(self._rbuf) < n:
|
||||
new = self._ssl.read(self._rbufsize)
|
||||
if not new: break
|
||||
self._rbuf = self._rbuf + new
|
||||
data,self._rbuf = self._rbuf[:n],self._rbuf[n:]
|
||||
return data
|
||||
while 1:
|
||||
new = self._ssl.read(self._rbufsize)
|
||||
if not new: break
|
||||
self._rbuf = self._rbuf + new
|
||||
data,self._rbuf = self._rbuf,""
|
||||
return data
|
||||
|
||||
def readline (self):
|
||||
data = ""
|
||||
i = string.find(self._rbuf,'\n')
|
||||
while i < 0:
|
||||
new = self._ssl.read(self._rbufsize)
|
||||
if not new: break
|
||||
i = string.find(new,'\n')
|
||||
if i >= 0: i = i + len(self._rbuf)
|
||||
self._rbuf = self._rbuf + new
|
||||
if i < 0: i = len(self._rbuf)
|
||||
else: i = i+1
|
||||
data,self._rbuf = self._rbuf[:i],self._rbuf[i:]
|
||||
return data
|
||||
|
||||
def readlines (self):
|
||||
l = []
|
||||
while 1:
|
||||
line = self.readline()
|
||||
if not line: break
|
||||
l.append(line)
|
||||
return l
|
||||
|
||||
def _test():
|
||||
import sys
|
||||
import getopt
|
||||
opts, args = getopt.getopt(sys.argv[1:], 'd')
|
||||
dl = 0
|
||||
for o, a in opts:
|
||||
if o == '-d': dl = dl + 1
|
||||
if args[0:]: host = args[0]
|
||||
if args[1:]: selector = args[1]
|
||||
h = HTTPS()
|
||||
host = 'synergy.as.cmu.edu'
|
||||
selector = '/~geek'
|
||||
# host = 'tls.cryptsoft.com'
|
||||
# selector = '/'
|
||||
h.set_debuglevel(dl)
|
||||
h.connect(host)
|
||||
h.putrequest('GET', selector)
|
||||
h.endheaders()
|
||||
errcode, errmsg, headers = h.getreply()
|
||||
print 'errcode =', errcode
|
||||
print 'errmsg =', errmsg
|
||||
print "\tHEADERS:"
|
||||
if headers:
|
||||
for header in headers.headers: print string.strip(header)
|
||||
print "\tTEXT:"
|
||||
print h.getfile().read()
|
||||
|
||||
if __name__ == '__main__':
|
||||
_test()
|
||||
|
|
@ -11,7 +11,7 @@ Copyright = "Copyright
|
|||
HtmlCopyright = "Copyright © 2000 by "+Author
|
||||
AppInfo = App+" "+Copyright
|
||||
HtmlAppInfo = App+", "+HtmlCopyright
|
||||
Url = "http://pylice.sourceforge.net/"
|
||||
Url = "http://linkchecker.sourceforge.net/"
|
||||
Email = "calvin@users.sourceforge.net"
|
||||
Freeware = AppName+""" comes with ABSOLUTELY NO WARRANTY!
|
||||
This is free software, and you are welcome to redistribute it
|
||||
|
|
|
|||
|
|
@ -53,11 +53,11 @@ class HttpUrlData(UrlData):
|
|||
self.setWarning("Access denied by robots.txt, checked only syntax")
|
||||
return
|
||||
|
||||
status, statusText, self.mime = self.getHttpRequest()
|
||||
status, statusText, self.mime = self._getHttpRequest()
|
||||
Config.debug(str(self.mime))
|
||||
if status == 401:
|
||||
self.auth = base64.encodestring(LinkChecker.User+":"+LinkChecker.Password)
|
||||
status, statusText, self.mime = self.getHttpRequest()
|
||||
status, statusText, self.mime = self._getHttpRequest()
|
||||
if status >= 400:
|
||||
self.setError(`status`+" "+statusText)
|
||||
return
|
||||
|
|
@ -68,7 +68,7 @@ class HttpUrlData(UrlData):
|
|||
while status in [301,302] and self.mime and tries < 5:
|
||||
redirected = urlparse.urljoin(redirected, self.mime.getheader("Location"))
|
||||
self.urlTuple = urlparse.urlparse(redirected)
|
||||
status, statusText, self.mime = self.getHttpRequest()
|
||||
status, statusText, self.mime = self._getHttpRequest()
|
||||
Config.debug("\nRedirected\n"+str(self.mime))
|
||||
tries = tries + 1
|
||||
|
||||
|
|
@ -86,7 +86,7 @@ class HttpUrlData(UrlData):
|
|||
self.setValid(`status`+" "+statusText)
|
||||
|
||||
|
||||
def getHttpRequest(self, method="HEAD"):
|
||||
def _getHttpRequest(self, method="HEAD"):
|
||||
"Put request and return (status code, status text, mime object)"
|
||||
if self.proxy:
|
||||
host = self.proxy+":"+`self.proxyport`
|
||||
|
|
@ -94,7 +94,7 @@ class HttpUrlData(UrlData):
|
|||
host = self.urlTuple[1]
|
||||
if self.urlConnection:
|
||||
self.closeConnection()
|
||||
self.urlConnection = httplib.HTTP(host)
|
||||
self.urlConnection = self._getHTTPObject(host)
|
||||
if self.proxy:
|
||||
path = urlparse.urlunparse(self.urlTuple)
|
||||
else:
|
||||
|
|
@ -110,10 +110,13 @@ class HttpUrlData(UrlData):
|
|||
self.urlConnection.endheaders()
|
||||
return self.urlConnection.getreply()
|
||||
|
||||
def _getHTTPObject(self, host):
|
||||
return httplib.HTTP(host)
|
||||
|
||||
def getContent(self):
|
||||
self.closeConnection()
|
||||
t = time.time()
|
||||
self.getHttpRequest("GET")
|
||||
self._getHttpRequest("GET")
|
||||
self.urlConnection = self.urlConnection.getfile()
|
||||
data = StringUtil.stripHtmlComments(self.urlConnection.read())
|
||||
self.time = time.time() - t
|
||||
|
|
@ -129,7 +132,7 @@ class HttpUrlData(UrlData):
|
|||
if config.robotsTxtCache_has_key(self.urlTuple[1]):
|
||||
robotsTxt = config.robotsTxtCache_get(self.urlTuple[1])
|
||||
else:
|
||||
robotsTxt = RobotsTxt(self.urlTuple[1], Config.UserAgent)
|
||||
robotsTxt = RobotsTxt(self.urlTuple, Config.UserAgent)
|
||||
Config.debug("DEBUG: "+str(robotsTxt)+"\n")
|
||||
config.robotsTxtCache_set(self.urlTuple[1], robotsTxt)
|
||||
except:
|
||||
|
|
|
|||
|
|
@ -1,13 +1,30 @@
|
|||
from UrlData import UrlData
|
||||
from HttpUrlData import HttpUrlData
|
||||
_supportHttps=1
|
||||
try: import httpslib
|
||||
except: _supportHttps=0
|
||||
|
||||
class HttpsUrlData(HttpUrlData):
|
||||
"""Url link with https scheme"""
|
||||
|
||||
def __init__(self,
|
||||
urlName,
|
||||
recursionLevel,
|
||||
parentName = None,
|
||||
baseRef = None,
|
||||
line = 0, _time = 0):
|
||||
HttpUrlData.__init__(self, urlName, recursionLevel,
|
||||
parentName, baseRef, line, _time)
|
||||
|
||||
def _getHTTPObject(self, host):
|
||||
return httpslib.HTTPS(host)
|
||||
|
||||
class HttpsUrlData(UrlData):
|
||||
"Url link with https scheme"
|
||||
|
||||
def check(self, config):
|
||||
self.setWarning("Https url ignored")
|
||||
self.logMe(config)
|
||||
|
||||
if _supportHttps:
|
||||
HttpUrlData.check(self, config)
|
||||
else:
|
||||
self.setWarning("HTTPS url ignored")
|
||||
self.logMe(config)
|
||||
|
||||
def __str__(self):
|
||||
return "HTTPS link\n"+UrlData.__str__(self)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -9,5 +9,3 @@ class JavascriptUrlData(UrlData):
|
|||
|
||||
def __str__(self):
|
||||
return "Javascript link\n"+UrlData.__str__(self)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -51,7 +51,7 @@ class StandardLogger:
|
|||
self.errors=0
|
||||
self.warnings=0
|
||||
self.fd = fd
|
||||
if fd==sys.stdout:
|
||||
if fd==sys.stdout or fd==sys.stderr:
|
||||
self.willclose=0
|
||||
else:
|
||||
self.willclose=1
|
||||
|
|
@ -282,16 +282,18 @@ class ColoredLogger(StandardLogger):
|
|||
|
||||
|
||||
class GMLLogger(StandardLogger):
|
||||
|
||||
"""GML means Graph Modeling Language. Use a GML tool to see
|
||||
your sitemap graph.
|
||||
"""
|
||||
def __init__(self,fd=sys.stdout):
|
||||
StandardLogger.__init__(self,fd)
|
||||
self.nodes = []
|
||||
|
||||
def init(self):
|
||||
self.fd.write("graph [\n Creator \""+Config.AppName+\
|
||||
"\"\n comment \"you get pylice at "+Config.Url+\
|
||||
"\"\n comment \"write comments and bugs to "+Config.Email+\
|
||||
"\"\n directed 1\n")
|
||||
self.fd.write("# created by "+Config.AppInfo+" at "+_currentTime()+\
|
||||
"\n# you get "+Config.AppName+" at "+Config.Url+\
|
||||
"\n# comment \"write comments and bugs to "+Config.Email+\
|
||||
"\ngraph [\n directed 1\n")
|
||||
self.fd.flush()
|
||||
|
||||
def newUrl(self, urlData):
|
||||
|
|
@ -303,17 +305,28 @@ class GMLLogger(StandardLogger):
|
|||
nodeid = 1
|
||||
for node in self.nodes:
|
||||
if node.url and not writtenNodes.has_key(node.url):
|
||||
self.fd.write(" node [\n id "+`nodeid`+"\n label \""+
|
||||
node.url+"\"\n ]\n")
|
||||
self.fd.write(" node [\n")
|
||||
self.fd.write(" id "+`nodeid`+"\n")
|
||||
self.fd.write(' label "'+node.url+'"'+"\n")
|
||||
if node.time:
|
||||
self.fd.write(" dltime "+`node.time`+"\n")
|
||||
self.fd.write(" extern ")
|
||||
if node.extern: self.fd.write("1")
|
||||
else: self.fd.write("0")
|
||||
self.fd.write("\n ]\n")
|
||||
writtenNodes[node.url] = nodeid
|
||||
nodeid = nodeid + 1
|
||||
# write edges
|
||||
for node in self.nodes:
|
||||
if node.url and node.parentName:
|
||||
self.fd.write(" edge [\n label \""+node.urlName+\
|
||||
"\"\n source "+`writtenNodes[node.parentName]`+\
|
||||
"\n target "+`writtenNodes[node.url]`+\
|
||||
"\n ]\n")
|
||||
self.fd.write(" edge [\n")
|
||||
self.fd.write(' label "'+node.urlName+'"\n')
|
||||
self.fd.write(" source "+`writtenNodes[node.parentName]`+"\n")
|
||||
self.fd.write(" target "+`writtenNodes[node.url]`+"\n")
|
||||
self.fd.write(" valid ")
|
||||
if node.valid: self.fd.write("1")
|
||||
else: self.fd.write("0")
|
||||
self.fd.write("\n ]\n")
|
||||
# end of output
|
||||
self.fd.write("]\n")
|
||||
self.fd.flush()
|
||||
|
|
@ -321,16 +334,15 @@ class GMLLogger(StandardLogger):
|
|||
|
||||
|
||||
class SQLLogger(StandardLogger):
|
||||
""" SQL output, only tested with PostgreSQL"""
|
||||
|
||||
""" SQL output for PostgreSQL, not tested"""
|
||||
def init(self):
|
||||
self.fd.write("-- created by "+Config.AppName+" at "+_currentTime()+\
|
||||
"\n-- you get pylice at "+Config.Url+\
|
||||
self.fd.write("-- created by "+Config.AppInfo+" at "+_currentTime()+\
|
||||
"\n-- you get "+Config.AppName+" at "+Config.Url+\
|
||||
"\n-- write comments and bugs to "+Config.Email+"\n\n")
|
||||
self.fd.flush()
|
||||
|
||||
def newUrl(self, urlData):
|
||||
self.fd.write("insert into pylicedb(urlname,"+\
|
||||
self.fd.write("insert into linksdb(urlname,"+\
|
||||
"recursionlevel,"+\
|
||||
"parentname,"+\
|
||||
"baseref,"+\
|
||||
|
|
|
|||
|
|
@ -1,14 +1,19 @@
|
|||
import re,urlparse,string,httplib,urllib,sys,StringUtil,Config
|
||||
|
||||
class RobotsTxt:
|
||||
def __init__(self, base, useragent):
|
||||
def __init__(self, urltuple, useragent):
|
||||
self.entries = []
|
||||
self.disallowAll = 0
|
||||
self.allowAll = 0
|
||||
self.base = base
|
||||
self.base = urltuple[0]+"://"+urltuple[1]+"/robots.txt"
|
||||
|
||||
try:
|
||||
urlConnection = httplib.HTTP(base)
|
||||
urlConnection = None
|
||||
if urltuple[0]=="http":
|
||||
urlConnection = httplib.HTTP(urltuple[1])
|
||||
else:
|
||||
import httpslib
|
||||
urlConnection = httpslib.HTTPS(urltuple[1])
|
||||
urlConnection.putrequest("GET", "/robots.txt")
|
||||
urlConnection.putheader("User-agent", useragent)
|
||||
urlConnection.endheaders()
|
||||
|
|
|
|||
|
|
@ -33,6 +33,7 @@ class UrlData:
|
|||
self.time = _time
|
||||
self.cached = 0
|
||||
self.urlConnection = None
|
||||
self.extern = 1
|
||||
|
||||
|
||||
def setError(self, s):
|
||||
|
|
@ -94,7 +95,9 @@ class UrlData:
|
|||
self.setError("URL is null or empty")
|
||||
self.logMe(config)
|
||||
return
|
||||
try: self.buildUrl()
|
||||
try:
|
||||
self.buildUrl()
|
||||
self.extern = self._isExtern(config)
|
||||
except:
|
||||
type, value = sys.exc_info()[:2]
|
||||
self.setError(str(value))
|
||||
|
|
@ -111,7 +114,7 @@ class UrlData:
|
|||
|
||||
# apply filter
|
||||
Config.debug("DEBUG: checking filter\n")
|
||||
if config["strict"] and self.isExtern(config):
|
||||
if config["strict"] and self.extern:
|
||||
self.setWarning("outside of domain filter, checked only syntax")
|
||||
self.logMe(config)
|
||||
return
|
||||
|
|
@ -161,7 +164,7 @@ class UrlData:
|
|||
self.isHtml() and \
|
||||
not self.cached and \
|
||||
self.recursionLevel < config["recursionlevel"] and \
|
||||
not self.isExtern(config)
|
||||
not self.extern
|
||||
|
||||
def isHtml(self):
|
||||
return 0
|
||||
|
|
@ -174,7 +177,7 @@ class UrlData:
|
|||
return
|
||||
self.setWarning("anchor #"+anchor+" not found")
|
||||
|
||||
def isExtern(self, config):
|
||||
def _isExtern(self, config):
|
||||
if len(config["externlinks"])==0 and len(config["internlinks"])==0:
|
||||
return 0
|
||||
# deny and allow external checking
|
||||
|
|
|
|||
|
|
@ -5,4 +5,4 @@ set PYTHON=c:\progra~1\python\python.exe
|
|||
set LINKCHECKER=c:\progra~1\linkchecker-1.1.0
|
||||
rem === end configure ===
|
||||
|
||||
%PYTHON% %LINKCHECKER%\pylice %1 %2 %3 %4 %5 %6 %7 %8 %9
|
||||
%PYTHON% %LINKCHECKER%\linkchecker %1 %2 %3 %4 %5 %6 %7 %8 %9
|
||||
|
|
|
|||
415
ssl.c
Normal file
415
ssl.c
Normal file
|
|
@ -0,0 +1,415 @@
|
|||
/* @(#)ssl.c 1.1 VMS-99/01/30 python wrapper for SSLeay https
|
||||
*/
|
||||
|
||||
#include "Python.h"
|
||||
#if defined(WITH_THREAD) && !defined(HAVE_GETHOSTBYNAME_R) &&\
|
||||
!defined(MS_WINDOWS)
|
||||
#include "thread.h"
|
||||
#endif
|
||||
|
||||
#include <sys/types.h>
|
||||
#ifndef MS_WINDOWS
|
||||
#include <sys/socket.h>
|
||||
#else
|
||||
#include <winsock.h>
|
||||
#endif
|
||||
|
||||
#if defined(PYOS_OS2)
|
||||
#define INCL_DOS
|
||||
#define INCL_DOSERRORS
|
||||
#define INCL_NOPMAPI
|
||||
#include <os2.h>
|
||||
#endif
|
||||
|
||||
#include "ssl.h"
|
||||
#include "err.h"
|
||||
|
||||
/*
|
||||
some hacks to choose between K&R or ANSI style function
|
||||
definitions. For NT to build this as an extension module (ie, DLL)
|
||||
it must be compiled by the C++ compiler, as it takes the address of
|
||||
a static data item exported from the main Python DLL.
|
||||
*/
|
||||
#ifdef MS_WINDOWS
|
||||
#define FORCE_ANSI_FUNC_DEFS
|
||||
#endif
|
||||
|
||||
#if defined(PYOS_OS2)
|
||||
#define FORCE_ANSI_FUNC_DEFS
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_ANSI_FUNC_DEFS
|
||||
#define BUILD_FUNC_DEF_1( fnname, arg1type, arg1name ) \
|
||||
fnname( arg1type arg1name )
|
||||
|
||||
#define BUILD_FUNC_DEF_2( fnname, arg1type, arg1name, arg2type, arg2name ) \
|
||||
fnname( arg1type arg1name, arg2type arg2name )
|
||||
|
||||
#else /* !FORCE_ANSI_FN_DEFS */
|
||||
#define BUILD_FUNC_DEF_1( fnname, arg1type, arg1name ) \
|
||||
fnname( arg1name ) \
|
||||
arg1type arg1name;
|
||||
|
||||
#define BUILD_FUNC_DEF_2( fnname, arg1type, arg1name, arg2type, arg2name ) \
|
||||
fnname( arg1name, arg2name ) \
|
||||
arg1type arg1name; \
|
||||
arg2type arg2name;
|
||||
#endif /* !FORCE_ANSI_FN_DEFS */
|
||||
|
||||
/* Global variable holding the exception type for errors detected
|
||||
by this module (but not argument type or memory errors, etc.). */
|
||||
|
||||
static PyObject *PySslError;
|
||||
|
||||
typedef struct {
|
||||
PyObject_HEAD
|
||||
int sock_fd;
|
||||
PyObject *x_attr; /* attributes dictionary */
|
||||
SSL_CTX *ctx;
|
||||
SSL *ssl;
|
||||
X509 *server_cert;
|
||||
BIO *sbio;
|
||||
char server[256];
|
||||
char issuer[256];
|
||||
} PySslObject;
|
||||
|
||||
staticforward PyTypeObject SSL_Type;
|
||||
#define PySslObject_Check(v) ((v)->ob_type == &SSL_Type)
|
||||
|
||||
/*
|
||||
* raise an error according to errno, return NULL
|
||||
*/
|
||||
static PyObject *
|
||||
PySsl_errno ()
|
||||
{
|
||||
#ifdef MS_WINDOWS
|
||||
if (WSAGetLastError()) {
|
||||
PyObject *v = Py_BuildValue("(is)",WSAGetLastError(),"winsock error");
|
||||
|
||||
if (v) {
|
||||
PyErr_SetObject(PySslError,v);
|
||||
Py_DECREF(v);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
return PyErr_SetFromErrno(PySslError);
|
||||
}
|
||||
|
||||
/*
|
||||
* format SSl error string
|
||||
*/
|
||||
static int
|
||||
BUILD_FUNC_DEF_2 (PySsl_err_str, unsigned long, e, char *, buf)
|
||||
{
|
||||
unsigned long l = ERR_GET_LIB(e);
|
||||
unsigned long f = ERR_GET_FUNC(e);
|
||||
unsigned long r = ERR_GET_REASON(e);
|
||||
char* ls = (char*)ERR_lib_error_string(e);
|
||||
char* fs = (char*)ERR_func_error_string(e);
|
||||
char* rs = (char*)ERR_reason_error_string(e);
|
||||
char* bp = buf + 2; /* skip two initial blanks */
|
||||
|
||||
(void)strcpy(buf," none:"); /* initialize buffer */
|
||||
bp += (ls) ? sprintf(bp,"%s:",ls) :
|
||||
((l) ? sprintf(bp,"lib %lu:",l) : 0);
|
||||
bp += (fs) ? sprintf(bp,"%s ",fs) :
|
||||
((f) ? sprintf(bp,"func %lu:",f) : 0);
|
||||
bp += (rs) ? sprintf(bp,"%s:",rs) :
|
||||
((r) ? sprintf(bp,"reason(%lu):",r) : 0);
|
||||
*bp-- = 0; /* suppress last divider (:) */
|
||||
return (bp - buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* report SSL core errors
|
||||
*/
|
||||
static PySslObject *
|
||||
PySsl_errors ()
|
||||
{
|
||||
#define PY_SSL_ERR_MAX 256
|
||||
|
||||
unsigned long e;
|
||||
char buf[2 * PY_SSL_ERR_MAX];
|
||||
char *bf = buf;
|
||||
|
||||
while (((bf - buf) < PY_SSL_ERR_MAX) && (e = ERR_get_error()))
|
||||
bf += PySsl_err_str(e,bf);
|
||||
{
|
||||
PyObject *v = Py_BuildValue("(sss)", "ssl","core",buf+2);
|
||||
if (v != NULL) {
|
||||
PyErr_SetObject(PySslError,v);
|
||||
Py_DECREF(v);
|
||||
}
|
||||
}
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* report SSL application layer errors
|
||||
*/
|
||||
static PySslObject *
|
||||
BUILD_FUNC_DEF_2 (PySsl_app_errors, SSL *, s, int, ret)
|
||||
{
|
||||
int err = SSL_get_error(s,ret);
|
||||
char *str;
|
||||
|
||||
switch (err) {
|
||||
case SSL_ERROR_SSL:
|
||||
return (PySsl_errors());
|
||||
case SSL_ERROR_SYSCALL:
|
||||
return ((PySslObject *)PySsl_errno());
|
||||
case SSL_ERROR_ZERO_RETURN:
|
||||
str = "End of data";
|
||||
break;
|
||||
case SSL_ERROR_WANT_READ:
|
||||
str = "Want read";
|
||||
break;
|
||||
case SSL_ERROR_WANT_WRITE:
|
||||
str = "Want write";
|
||||
break;
|
||||
case SSL_ERROR_WANT_X509_LOOKUP:
|
||||
str = "Want x509 lookup";
|
||||
break;
|
||||
case SSL_ERROR_WANT_CONNECT:
|
||||
str = "Want connect";
|
||||
break;
|
||||
default:
|
||||
str = "Unknown";
|
||||
break;
|
||||
}
|
||||
{
|
||||
PyObject *v = Py_BuildValue("(sis)", "ssl",err, str);
|
||||
if (v != NULL) {
|
||||
PyErr_SetObject(PySslError,v);
|
||||
Py_DECREF(v);
|
||||
}
|
||||
}
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
/* ssl.read(len) method */
|
||||
|
||||
static PyObject *
|
||||
BUILD_FUNC_DEF_2 (PySslObj_read, PySslObject *, self, PyObject *, args)
|
||||
{
|
||||
int len, n;
|
||||
PyObject *buf;
|
||||
|
||||
if (!PyArg_ParseTuple(args,"i",&len))
|
||||
return (NULL);
|
||||
if (!(buf = PyString_FromStringAndSize((char *)0,len)))
|
||||
return (NULL);
|
||||
Py_BEGIN_ALLOW_THREADS
|
||||
|
||||
n = SSL_read(self->ssl,PyString_AsString(buf),len);
|
||||
|
||||
Py_END_ALLOW_THREADS
|
||||
|
||||
switch (SSL_get_error(self->ssl,n)) {
|
||||
case SSL_ERROR_NONE: /* good return value */
|
||||
break;
|
||||
case SSL_ERROR_ZERO_RETURN:
|
||||
case SSL_ERROR_SYSCALL:
|
||||
if (!n) /* fix SSL_ERROR_SYCSALL errno=0 case */
|
||||
break;
|
||||
/* fall thru here */
|
||||
default:
|
||||
Py_DECREF(buf);
|
||||
(void)PySsl_app_errors(self->ssl,n);
|
||||
return (NULL);
|
||||
}
|
||||
if ((n != len) && (_PyString_Resize(&buf,n) < 0))
|
||||
return (NULL);
|
||||
return (buf);
|
||||
}
|
||||
|
||||
/* ssl.write(data,len) method */
|
||||
|
||||
static PyObject *
|
||||
BUILD_FUNC_DEF_2 (PySslObj_write, PySslObject *, self, PyObject *, args)
|
||||
{
|
||||
char *buf;
|
||||
int len, n;
|
||||
if (!PyArg_ParseTuple(args, "si", &buf, &len))
|
||||
return NULL;
|
||||
|
||||
/* Note: flags are ignored */
|
||||
|
||||
Py_BEGIN_ALLOW_THREADS
|
||||
|
||||
n = SSL_write(self->ssl,buf,len);
|
||||
|
||||
Py_END_ALLOW_THREADS
|
||||
if (n < 0)
|
||||
return (PySsl_errno());
|
||||
return (PyInt_FromLong((long)n));
|
||||
}
|
||||
|
||||
/* ssl.server() method */
|
||||
|
||||
static PyObject *
|
||||
BUILD_FUNC_DEF_2 (PySslObj_server, PySslObject *, self, PyObject *, args)
|
||||
{
|
||||
if (!PyArg_NoArgs(args))
|
||||
return (NULL);
|
||||
return (PyString_FromString(self->server));
|
||||
}
|
||||
|
||||
/* ssl.issuer() method */
|
||||
|
||||
static PyObject *
|
||||
BUILD_FUNC_DEF_2 (PySslObj_issuer, PySslObject *, self, PyObject *, args)
|
||||
{
|
||||
if (!PyArg_NoArgs(args))
|
||||
return (NULL);
|
||||
return (PyString_FromString(self->issuer));
|
||||
}
|
||||
|
||||
/* SSL object methods */
|
||||
|
||||
static PyMethodDef PySslObj_methods[] = {
|
||||
{"read", (PyCFunction)PySslObj_read,1},
|
||||
{"write", (PyCFunction)PySslObj_write,1},
|
||||
{"server", (PyCFunction)PySslObj_server},
|
||||
{"issuer", (PyCFunction)PySslObj_issuer},
|
||||
{ NULL, NULL}
|
||||
};
|
||||
|
||||
static void
|
||||
BUILD_FUNC_DEF_1 (PySsl_dealloc, PySslObject *, self)
|
||||
{
|
||||
if (self->server_cert) /* possible not to have one? */
|
||||
X509_free(self->server_cert);
|
||||
SSL_CTX_free(self->ctx);
|
||||
SSL_free(self->ssl);
|
||||
Py_XDECREF(self->x_attr);
|
||||
PyMem_DEL(self);
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
BUILD_FUNC_DEF_2 (PySsl_getattr, PySslObject *, self, char *, name)
|
||||
{
|
||||
return (Py_FindMethod(PySslObj_methods,(PyObject *)self,name));
|
||||
}
|
||||
|
||||
staticforward PyTypeObject SSL_Type = {
|
||||
PyObject_HEAD_INIT(&PyType_Type)
|
||||
0, /*ob_size*/
|
||||
"SSL", /*tp_name*/
|
||||
sizeof(PySslObject), /*tp_basicsize*/
|
||||
0, /*tp_itemsize*/
|
||||
/* methods */
|
||||
(destructor)PySsl_dealloc, /*tp_dealloc*/
|
||||
0, /*tp_print*/
|
||||
(getattrfunc)PySsl_getattr, /*tp_getattr*/
|
||||
0, /*tp_setattr*/
|
||||
0, /*tp_compare*/
|
||||
0, /*tp_repr*/
|
||||
0, /*tp_as_number*/
|
||||
0, /*tp_as_sequence*/
|
||||
0, /*tp_as_mapping*/
|
||||
0, /*tp_hash*/
|
||||
};
|
||||
|
||||
/*
|
||||
* C function called for new object initialization
|
||||
* Note: SSL protocol version 2, 3, or 2+3 set at compile time
|
||||
*/
|
||||
static PySslObject *
|
||||
BUILD_FUNC_DEF_1 (newPySslObject, int, sock_fd)
|
||||
{
|
||||
PySslObject *self;
|
||||
SSL_METHOD *meth;
|
||||
int ret;
|
||||
|
||||
#if 0
|
||||
meth=SSLv3_client_method();
|
||||
meth=SSLv23_client_method();
|
||||
#endif
|
||||
|
||||
meth=SSLv2_client_method();
|
||||
|
||||
if (!(self = PyObject_NEW(PySslObject,&SSL_Type))) /* create new object */
|
||||
return (NULL);
|
||||
(void)memset(self->server,0,sizeof(self->server));
|
||||
(void)memset(self->issuer,0,sizeof(self->issuer));
|
||||
|
||||
self->x_attr = PyDict_New();
|
||||
if (!(self->ctx = SSL_CTX_new(meth))) { /* set up context */
|
||||
PyMem_DEL(self);
|
||||
return (PySsl_errors());
|
||||
}
|
||||
#if 0 /* Note: set this for v23, Netscape server */
|
||||
SSL_CTX_set_options(self->ctx,SSL_OP_ALL);
|
||||
#endif
|
||||
self->ssl = SSL_new(self->ctx); /* new ssl struct */
|
||||
if (!(ret = SSL_set_fd(self->ssl,sock_fd))) { /* set the socket for SSL */
|
||||
PyMem_DEL(self);
|
||||
return (PySsl_app_errors(self->ssl,ret));
|
||||
}
|
||||
SSL_CTX_set_verify(self->ctx,SSL_VERIFY_NONE,NULL); /* set verify lvl */
|
||||
SSL_set_connect_state(self->ssl);
|
||||
|
||||
if ((ret = SSL_connect(self->ssl)) < 0) { /* negotiate SSL connection */
|
||||
PyMem_DEL(self);
|
||||
return (PySsl_app_errors(self->ssl,ret));
|
||||
}
|
||||
self->ssl->debug = 1;
|
||||
|
||||
if ((self->server_cert = SSL_get_peer_certificate(self->ssl))) {
|
||||
X509_NAME_oneline(X509_get_subject_name(self->server_cert),
|
||||
self->server,sizeof(self->server));
|
||||
X509_NAME_oneline(X509_get_issuer_name(self->server_cert),
|
||||
self->issuer, sizeof(self->issuer));
|
||||
}
|
||||
self->x_attr = NULL;
|
||||
self->sock_fd = sock_fd;
|
||||
return (self);
|
||||
}
|
||||
|
||||
/*
|
||||
* Python function called for new object initialization
|
||||
*/
|
||||
static PyObject *
|
||||
BUILD_FUNC_DEF_2 (PySsl_ssl_new, PyObject *, self, PyObject *, args)
|
||||
{
|
||||
int sock_fd;
|
||||
if (!PyArg_ParseTuple(args, "i", &sock_fd))
|
||||
return (NULL);
|
||||
return ((PyObject *)newPySslObject(sock_fd));
|
||||
}
|
||||
|
||||
/* List of functions exported by this module. */
|
||||
|
||||
static PyMethodDef PySsl_methods[] = {
|
||||
{"ssl", (PyCFunction)PySsl_ssl_new, 1},
|
||||
{NULL, NULL} /* sentinel */
|
||||
|
||||
};
|
||||
|
||||
/*
|
||||
* Initialize this module, called when the first 'import ssl' is done
|
||||
*/
|
||||
void
|
||||
initssl ()
|
||||
{
|
||||
PyObject *m, *d;
|
||||
m = Py_InitModule("ssl", PySsl_methods);
|
||||
d = PyModule_GetDict(m);
|
||||
|
||||
SSL_load_error_strings();
|
||||
SSLeay_add_ssl_algorithms();
|
||||
|
||||
/* *** Python 1.5 ***
|
||||
if (!(PySssl_Error = PyErr_NewException("ssl.error",NULL,NULL)))
|
||||
return;
|
||||
*/
|
||||
|
||||
if (!(PySslError = PyString_FromString("ssl.error")) ||
|
||||
PyDict_SetItemString(d,"error",PySslError))
|
||||
Py_FatalError("can't define ssl.error");
|
||||
if (PyDict_SetItemString(d,"SSLType",(PyObject *)&SSL_Type))
|
||||
return;
|
||||
}
|
||||
|
||||
Loading…
Reference in a new issue