Make WSGI script more responsive by using threads.

This commit is contained in:
Bastian Kleineidam 2012-04-18 21:52:36 +02:00
parent 3d831c1adb
commit 1ef9a022ca
8 changed files with 202 additions and 145 deletions

View file

@ -4,7 +4,7 @@ include config/linkchecker-completion config/create.sql
include config/linkcheckerrc
include config/linkchecker.apache2.conf install-rpm.sh
include linkchecker.freecode
include cgi-bin/lc.cgi cgi-bin/README
include cgi-bin/lc.wsgi cgi-bin/README
include Makefile
include debian/rules
include debian/changelog

View file

@ -176,6 +176,7 @@ doccheck:
linkcheck/updater.py \
linkcheck/url.py \
linkcheck/winutil.py \
cgi-bin/lc.wsgi \
linkchecker \
linkchecker-gui \
*.py

View file

@ -1,31 +0,0 @@
#!/usr/bin/python
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2009 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import sys
import cgi
import linkcheck
import linkcheck.lc_cgi
# log errors to stdout
sys.stderr = sys.stdout
# uncomment the following lines to test your CGI values
#cgi.test()
#sys.exit(0)
linkcheck.lc_cgi.startoutput()
linkcheck.lc_cgi.checklink(form=cgi.FieldStorage())

View file

@ -16,30 +16,4 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
from cStringIO import StringIO
import cgi
import linkcheck
import linkcheck.lc_cgi
def application(environ, start_response):
    """WSGI entry point: run a link check and return the whole report.

    The form is parsed from the request body, the response is started
    with status '200 OK' and the headers supplied by
    linkcheck.lc_cgi.get_response_headers(), and everything that
    linkcheck.lc_cgi.checklink() wrote is returned as a one-element list.
    """
    # CONTENT_LENGTH may be empty or missing; an unparsable value counts as 0.
    try:
        body_length = int(environ.get('CONTENT_LENGTH', 0))
    except ValueError:
        body_length = 0
    # For POST requests the query string travels in the HTTP request body,
    # which the WSGI server exposes as the file-like 'wsgi.input' object.
    raw_body = environ['wsgi.input'].read(body_length)
    fields = cgi.parse_qs(raw_body)
    start_response('200 OK', linkcheck.lc_cgi.get_response_headers())
    report = StringIO()
    # XXX This is slow since the whole site is checked before any output
    # is shown. Instead, check in a separate thread and yield output as
    # soon as it is available.
    linkcheck.lc_cgi.checklink(form=fields, out=report, env=environ)
    return [report.getvalue()]
from linkcheck.lc_cgi import application

View file

@ -24,7 +24,7 @@ from .linkchecker_ui_main import Ui_MainWindow
from .properties import set_properties, clear_properties
from .statistics import set_statistics, clear_statistics
from .debug import LinkCheckerDebug
from .logger import GuiLogger, GuiLogHandler, StatusLogger
from .logger import SignalLogger, GuiLogHandler, StatusLogger
from .help import HelpWindow
from .options import LinkCheckerOptions
from .checker import CheckerThread
@ -226,7 +226,7 @@ class LinkCheckerMain (QtGui.QMainWindow, Ui_MainWindow):
# dictionary holding overwritten values
self.config_backup = {}
# set standard GUI configuration values
self.config.logger_add("gui", GuiLogger)
self.config.logger_add("gui", SignalLogger)
self.config["logger"] = self.config.logger_new('gui',
signal=self.log_url_signal, stats=self.log_stats_signal)
self.config["status"] = True

View file

@ -32,12 +32,12 @@ class GuiLogHandler (Handler, object):
self.signal.emit(self.format(record))
class GuiLogger (Logger):
"""Delegate log URLs to the UI tree widget."""
class SignalLogger (Logger):
"""Use Qt signals for logged URLs and statistics."""
def __init__ (self, **args):
"""Store signals for URL and statistic data."""
super(GuiLogger, self).__init__(**args)
super(SignalLogger, self).__init__(**args)
self.log_url_signal = args["signal"]
self.log_stats_signal = args["stats"]

View file

@ -15,19 +15,45 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Common functions used by the CGI and WSGI scripts.
Functions used by the WSGI script.
"""
import sys
import cgi
import os
import threading
from StringIO import StringIO
import locale
import re
import time
import urlparse
import types
from . import configuration, strformat, checker, director, i18n
from . import add_intern_pattern, get_link_pat, init_i18n
from . import url as urlutil
from PyQt4 import QtCore
from . import configuration, strformat, checker, director, \
add_intern_pattern, get_link_pat, init_i18n, url as urlutil
from .logger import Logger
def application(environ, start_response):
    """WSGI interface: start a URL check and stream its output.

    This is a generator. Once iteration begins, the form is parsed from
    the request body, the response is started with status '200 OK' and
    the headers from get_response_headers(), and every chunk produced by
    checklink() is passed on to the WSGI server as it becomes available.
    """
    # cgi.parse_qs is deprecated; urlparse.parse_qs is its replacement.
    # Imported here so the function does not rely on module-level imports.
    from urlparse import parse_qs
    # the environment variable CONTENT_LENGTH may be empty or missing
    try:
        request_body_size = int(environ.get('CONTENT_LENGTH', 0))
    except ValueError:
        request_body_size = 0
    # When the method is POST the query string will be sent
    # in the HTTP request body which is passed by the WSGI server
    # in the file like wsgi.input environment variable.
    body_stream = environ['wsgi.input']
    if request_body_size > 0:
        request_body = body_stream.read(request_body_size)
    else:
        # No usable length was given: read whatever the stream provides.
        # NOTE(review): an unbounded read may block on servers that do not
        # signal end-of-input for wsgi.input -- confirm for the deployment.
        request_body = body_stream.read()
    form = parse_qs(request_body)
    start_response('200 OK', get_response_headers())
    for output in checklink(form=form, env=environ):
        yield output
_logfile = None
@ -46,46 +72,152 @@ class LCFormError (StandardError):
def get_response_headers():
    """Get list of response headers in key-value form.

    Each entry is a (name, value) tuple. The names carry no trailing
    colon: the separator between name and value is added by whoever
    serializes the headers.
    """
    return [
        ("Content-type", "text/html"),
        ("Cache-Control", "no-cache"),
        ("Pragma", "no-cache"),
    ]
def startoutput (out=None):
    """Write the response header lines and the terminating blank line.

    Every header from get_response_headers() is written as
    "name: value" followed by CRLF. If no stream is given, the writer
    returned by i18n.get_encoded_writer() is used.
    """
    stream = i18n.get_encoded_writer() if out is None else out
    for name, content in get_response_headers():
        stream.write("%s: %s\r\n" % (name, content))
    # an empty line separates the headers from the body
    stream.write("\r\n")
def formvalue (form, key):
    """Get value with given key from a CGI or WSGI form.

    Three shapes of form entries are handled:
    - objects with a "value" attribute (CGI form fields): the attribute
      is returned;
    - lists (WSGI query dictionaries map keys to value lists): the first
      element is returned, or None if the list is empty;
    - anything else is returned unchanged.
    A missing key yields None.
    """
    field = form.get(key)
    if field is None:
        return None
    if hasattr(field, 'value'):
        # it's a CGI FormField
        return field.value
    if isinstance(field, list):
        # assume WSGI dictionary lists; an empty list counts as "no value"
        # instead of raising IndexError
        return field[0] if field else None
    return field
def checklink (out=None, form=None, env=os.environ):
"""Main cgi function, check the given links and print out the result."""
if out is None:
out = i18n.get_encoded_writer()
class ThreadsafeStringIO (StringIO):
    """Thread-safe String I/O class.

    Written chunks are collected in a list that is guarded by a lock.
    Only write(), get_data() and close() are implemented here.
    """

    def __init__(self):
        """Start with an empty chunk list, a new lock and open state."""
        # NOTE(review): the StringIO initializer is not called, so inherited
        # methods that depend on its state are presumably unusable -- confirm
        # before relying on anything besides the methods defined below.
        self.buf = []
        self.lock = threading.Lock()
        self.closed = False

    def write (self, data):
        """Append data to the buffer; raise IOError after close()."""
        with self.lock:
            if self.closed:
                raise IOError("Write on closed I/O object")
            self.buf.append(data)

    def get_data (self):
        """Return everything written since the last call and reset the
        buffer."""
        with self.lock:
            data = "".join(self.buf)
            self.buf = []
            return data

    def close (self):
        """Discard buffered data and refuse further writes."""
        with self.lock:
            self.buf = []
            self.closed = True
class SignalLogger (Logger):
    """Logger that reports URLs and statistics by emitting signals."""

    def __init__ (self, **args):
        """Initialize the base logger and keep both signals.

        The keyword arguments must contain "signal" (emitted for each
        logged URL) and "stats" (emitted at the end of output).
        """
        super(SignalLogger, self).__init__(**args)
        self.log_url_signal = args["signal"]
        self.log_stats_signal = args["stats"]

    def start_fileoutput (self):
        """Do nothing; replaces the file output handling of the base
        class."""

    def close_fileoutput (self):
        """Do nothing; replaces the file output handling of the base
        class."""

    def log_url (self, url_data):
        """Emit the given URL data on the URL signal."""
        self.log_url_signal.emit(url_data)

    def end_output (self):
        """Emit the collected statistics on the statistics signal."""
        self.log_stats_signal.emit(self.stats)
class DelegateLogger (QtCore.QObject):
    """Receiver for URL and statistic signals that hands the data on to
    another logger instance."""

    log_url_signal = QtCore.pyqtSignal(object)
    log_stats_signal = QtCore.pyqtSignal(object)

    def __init__ (self):
        """Init state and connect both signals to this instance."""
        super(DelegateLogger, self).__init__()
        # no delegate yet; one has to be set with add_logger()
        self.logger = None
        # becomes True once statistics have been delivered
        self.finished = False
        self.log_url_signal.connect(self.log_url)
        self.log_stats_signal.connect(self.log_stats)

    def add_logger (self, logger):
        """Set the logger all output is delegated to."""
        self.logger = logger

    def log_url (self, url_data):
        """Pass URL data on to the delegate logger."""
        self.logger.log_url(url_data)

    def log_stats (self, statistics):
        """Store statistics on the delegate logger, end its output and
        mark this instance as finished."""
        target = self.logger
        target.stats = statistics
        target.end_output()
        self.finished = True
def checklink (form=None, env=os.environ):
"""Validates the CGI form and checks the given links."""
if form is None:
form = {}
try:
checkform(form)
except LCFormError, why:
except LCFormError, errmsg:
logit(form, env)
print_error(out, why)
yield print_error(errmsg)
return
delegate_logger = DelegateLogger()
config = get_configuration(form, delegate_logger)
out = ThreadsafeStringIO()
html_logger = config.logger_new('html', fd=out)
delegate_logger.add_logger(html_logger)
url = strformat.stripurl(formvalue(form, "url"))
aggregate = director.get_aggregate(config)
url_data = checker.get_url_from(url, 0, aggregate)
try:
add_intern_pattern(url_data, config)
except UnicodeError, errmsg:
logit({}, env)
yield print_error(_("URL has unparsable domain name: %s") % errmsg)
return
aggregate.urlqueue.put(url_data)
html_logger.start_output()
# check in background
director.check_urls(aggregate)
while not delegate_logger.finished:
yield out.get_data()
time.sleep(2)
yield out.get_data()
out.close()
def get_configuration(form, logger):
"""Initialize a CGI configuration."""
config = configuration.Configuration()
config["recursionlevel"] = int(formvalue(form, "level"))
config["logger"] = config.logger_new('html', fd=out)
config.logger_add("signal", SignalLogger)
config["logger"] = config.logger_new('signal',
signal=logger.log_url_signal, stats=logger.log_stats_signal)
config["threads"] = 0
if "anchors" in form:
config["anchors"] = True
@ -94,20 +226,7 @@ def checklink (out=None, form=None, env=os.environ):
# avoid checking of local files or other nasty stuff
pat = "!^%s$" % urlutil.safe_url_pattern
config["externlinks"].append(get_link_pat(pat, strict=True))
# start checking
aggregate = director.get_aggregate(config)
get_url_from = checker.get_url_from
url = strformat.stripurl(formvalue(form, "url"))
url_data = get_url_from(url, 0, aggregate)
try:
add_intern_pattern(url_data, config)
except UnicodeError:
logit({}, env)
print_error(out, _("URL has unparsable domain name: %s") % \
sys.exc_info()[1])
return
aggregate.urlqueue.put(url_data)
director.check_urls(aggregate)
return config
def get_host_name (form):
@ -165,7 +284,7 @@ def logit (form, env):
_logfile.write(str(formvalue(form, key))+"\n")
def print_error (out, why):
def print_error (why):
"""Print standard error page."""
s = _("""<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
@ -180,4 +299,5 @@ Errors are logged.
</blockquote>
</body>
</html>""") % why
out.write(s.encode('iso-8859-1', 'ignore'))
return s.encode('iso-8859-1', 'ignore')

View file

@ -18,59 +18,52 @@
Test cgi form routines.
"""
import unittest
import linkcheck.lc_cgi
import wsgiref
import urllib
from StringIO import StringIO
from wsgiref.util import setup_testing_defaults
from linkcheck.lc_cgi import checkform, checklink, LCFormError, application
class Store (object):
"""
Value storing class implementing FieldStorage interface.
"""
def __init__ (self, value):
"""
Store given value.
"""
self.value = value
class TestCgi (unittest.TestCase):
"""
Test cgi routines.
"""
class TestWsgi (unittest.TestCase):
"""Test wsgi application."""
def test_form_valid_url (self):
# Check url validity.
form = {"url": Store("http://www.heise.de/"),
"level": Store("1"),
}
linkcheck.lc_cgi.checkform(form)
form = dict(url="http://www.example.com/", level="1")
checkform(form)
def test_form_empty_url (self):
# Check with empty url.
form = {"url": Store(""),
"level": Store("0"),
}
self.assertRaises(linkcheck.lc_cgi.LCFormError,
linkcheck.lc_cgi.checkform, form)
form = dict(url="", level="0")
self.assertRaises(LCFormError, checkform, form)
def test_form_default_url (self):
# Check with default url.
form = {"url": Store("http://"),
"level": Store("0"),
}
self.assertRaises(linkcheck.lc_cgi.LCFormError,
linkcheck.lc_cgi.checkform, form)
form = dict(url="http://", level="0")
self.assertRaises(LCFormError, checkform, form)
def test_form_invalid_url (self):
# Check url (in)validity.
form = {"url": Store("http://www.foo bar/"),
"level": Store("0"),
}
self.assertRaises(linkcheck.lc_cgi.LCFormError,
linkcheck.lc_cgi.checkform, form)
form = dict(url="http://www.foo bar/", level="0")
self.assertRaises(LCFormError, checkform, form)
def test_checklink (self):
form = {"url": Store("http://www.heise.de/"),
"level": Store("0"),
}
linkcheck.lc_cgi.checklink(form=form)
form = dict(url="http://www.example.com/", level="0")
checklink(form)
def test_application (self):
    """Run the WSGI application on a URL-encoded form and check the
    response status and body."""
    form = dict(url="http://www.example.com/", level="0")
    formdata = urllib.urlencode(form)
    environ = {'wsgi.input': StringIO(formdata)}
    setup_testing_defaults(environ)
    # filled in by the start_response callback below
    captured = {'status': None, 'headers': None}
    def start_response(status, headers):
        captured['status'] = status
        captured['headers'] = headers
    # consume the whole response
    body = "".join(application(environ, start_response))
    self.assertEqual(captured['status'], '200 OK')
    self.assertTrue("Generated by LinkChecker" in body)