added

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1358 e7d03fd6-7b0d-0410-9947-9c21f3af8025
2026-04-20 22:31:00 +00:00 · 2004-07-19 08:45:36 +00:00 · 2004-07-19 08:45:36 +00:00 · c635234ee6
commit c635234ee6
parent 916f96cc0d
23 changed files with 17346 additions and 0 deletions
--- a/bk/HtmlParser/.cvsignore
+++ b/bk/HtmlParser/.cvsignore
@ -0,0 +1 @@
+htmlparse.output
--- a/bk/HtmlParser/Makefile
+++ b/bk/HtmlParser/Makefile
@ -0,0 +1,24 @@
+# this parser needs flex >= 2.5.xx from http://lex.sf.net/
+# for reentrant bison parser support!
+FLEX=flex
+PYVER=2.3
+PYTHON=python$(PYVER)
+
+# these targets name actions, not files; declare them phony so that a
+# stray file called "all", "clean" or "splint" cannot make them look
+# up to date
+.PHONY: all clean splint
+
+# generate the scanner and parser C sources
+all: htmllex.c htmlparse.c
+
+%.o:	%.c
+	gcc -g -O3 -Wall -pedantic -Wstrict-prototypes -fPIC -I. -I/usr/include/$(PYTHON) -c $< -o $@
+
+htmlparse.h htmlparse.c:	htmlparse.y htmlsax.h
+	bison htmlparse.y
+
+# the scanner uses the token definitions generated by bison
+htmllex.l:	htmlparse.h
+
+htmllex.c:	htmllex.l htmlsax.h
+	$(FLEX) htmllex.l
+
+# remove all generated files
+clean:
+	rm -f htmlparse.c htmlparse.h htmllex.c *.o *.so *.pyc *.pyo *.output
+
+# static analysis of the generated scanner
+splint:
+	splint -initallelements +posixlib -I/usr/include/linux -I. -I/usr/include/$(PYTHON) htmllex.c | less
--- a/bk/HtmlParser/init.py
+++ b/bk/HtmlParser/init.py
@ -0,0 +1,115 @@
+# -*- coding: iso-8859-1 -*-
+# Copyright (C) 2000-2004  Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+"""Fast HTML parser module written in C with the following features:
+
+1. Reentrant
+   
+   As soon as any HTML string data is available, we try to feed it
+   to the HTML parser. This means that the parser has to scan possible
+   incomplete data, recognizing as much as it can. Incomplete trailing
+   data is saved for subsequent calls, or it is just flushed into the
+   output buffer with the flush() function.
+   A reset() brings the parser back to its initial state, throwing away all
+   buffered data.
+
+2. Coping with HTML syntax errors
+   
+   The parser recognizes as much as it can and passes the rest
+   of the data as TEXT tokens.
+   The scanner only passes complete recognized HTML syntax elements to
+   the parser. Invalid syntax elements are passed as TEXT. This way we do
+   not need the bison error recovery.
+   Incomplete data is rescanned the next time the parser calls yylex() or
+   when it is being flush()ed.
+   
+   The following syntax errors will be recognized correctly:
+   
+   a) missing quotes around attribute values
+   b) "</...>" end tags in script modus
+   c) missing ">" in tags
+   d) invalid tag names
+   e) invalid characters inside tags or tag attributes
+   
+   Additionally the parser has the following features:
+   
+   a) NULL bytes are changed into spaces
+   b) <!-- ... --> inside a <script> or <style> are not treated as
+      comments but as DATA
+
+3. Speed
+   
+   The FLEX code has options to generate a large but fast scanner.
+   The parser ignores forbidden or unnecessary HTML end tags.
+   The parser converts tag and attribute names to lower case for easier
+   matching.
+   The parser quotes all attribute values.
+   Python memory management interface is used.
+
+"""
+
+import re
+import htmlentitydefs
+
+
+def _resolve_ascii_entity (mo):
+    """Helper function for resolve_ascii_entities to resolve one numeric
+       character reference (&#NNN; or &#xHHH;) if it denotes a 7-bit
+       ASCII character. Else leave as is.
+       Input is a match object with a "num" group matched.
+    """
+    ent = mo.group()
+    # the pattern is matched case-insensitively, so "&#X41;" is a hex
+    # reference just like "&#x41;"
+    if ent[:3].lower() == '&#x':
+        radix = 16
+    else:
+        radix = 10
+    # convert to number
+    try:
+        num = int(mo.group("num"), radix)
+    except ValueError:
+        # hex letters in a decimal reference, e.g. "&#4a;": not an entity
+        return ent
+    # check 7-bit ASCII char range
+    if 0 <= num <= 127:
+        return chr(num)
+    # not in range
+    return ent
+
+
+def resolve_ascii_entities (s):
+    """resolve entities in 7-bit ASCII range to eliminate obfuscation"""
+    # accept hex digits so that references such as "&#x4A;" are matched;
+    # the helper rejects hex letters when the "x" prefix is missing
+    return re.sub(r'(?i)&#x?(?P<num>[0-9a-f]+);', _resolve_ascii_entity, s)
+
+
+def _resolve_html_entity (mo):
+    """Map the entity name captured in group "entity" to its replacement
+       text from htmlentitydefs; unknown names yield the matched text
+       unchanged. Helper function for resolve_html_entities.
+    """
+    name = mo.group("entity")
+    table = htmlentitydefs.entitydefs
+    if name in table:
+        return table[name]
+    return mo.group()
+
+
+def resolve_html_entities (s):
+    """Replace every named entity reference (&name;) in s that is listed
+       in htmlentitydefs and return the resulting string.
+    """
+    named_entity = re.compile(r'(?i)&(?P<entity>[a-z]+);')
+    return named_entity.sub(_resolve_html_entity, s)
+
+
+def resolve_entities (s):
+    """Resolve named HTML entities first, then numeric 7-bit ASCII
+       entities, and return the result.
+    """
+    named_resolved = resolve_html_entities(s)
+    return resolve_ascii_entities(named_resolved)
+
+
+def strip_quotes (s):
+    """Return s without its enclosing pair of matching single or double
+       quotes; strings that are not quoted that way are returned as is.
+    """
+    if len(s) < 2:
+        return s
+    for quote in ("'", '"'):
+        if s.startswith(quote) and s.endswith(quote):
+            return s[1:-1]
+    return s
+
--- a/bk/HtmlParser/htmllex.c
+++ b/bk/HtmlParser/htmllex.c
--- a/bk/HtmlParser/htmllex.l
+++ b/bk/HtmlParser/htmllex.l
--- a/bk/HtmlParser/htmllib.py
+++ b/bk/HtmlParser/htmllib.py
@ -0,0 +1,100 @@
+# -*- coding: iso-8859-1 -*-
+"""Default handler classes"""
+# Copyright (C) 2000-2004  Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+import sys
+
+
+class HtmlPrinter (object):
+    """Handler that prints the name and the arguments of every callback
+    invoked on it. Any attribute that is not defined here is treated as
+    a callback name.
+    """
+
+    def __init__ (self, fd=sys.stdout):
+        """write to given file descriptor"""
+        self.fd = fd
+
+    def _print (self, *attrs):
+        """print the remembered function name and the given attributes"""
+        print >> self.fd, self.mem, attrs
+
+    def _errorfun (self, msg, name):
+        """print msg to stderr with name prefix"""
+        print >> sys.stderr, name, msg
+
+    def error (self, msg):
+        """signal a filter/parser error"""
+        self._errorfun(msg, "error:")
+
+    def warning (self, msg):
+        """signal a filter/parser warning"""
+        self._errorfun(msg, "warning:")
+
+    def fatalError (self, msg):
+        """signal a fatal filter/parser error"""
+        self._errorfun(msg, "fatal error:")
+
+    def __getattr__ (self, name):
+        """Return a callable that prints name together with its arguments.
+
+        The name is bound to the returned callable, so callbacks that are
+        looked up first and called later still print their own name.
+        """
+        if name.startswith('__') and name.endswith('__'):
+            # special names are probed by copy, pickle and friends; do not
+            # pretend that they are parser callbacks
+            raise AttributeError(name)
+        def callback (*attrs):
+            # remember the func name for _print
+            self.mem = name
+            self._print(*attrs)
+        return callback
+
+
+class HtmlPrettyPrinter (object):
+    """Handler that writes every parsed HTML item back out as markup"""
+
+    def __init__ (self, fd=sys.stdout):
+        """remember the file object all output is written to"""
+        self.fd = fd
+
+    def comment (self, data):
+        """write data enclosed in comment delimiters"""
+        markup = "<!--%s-->" % data
+        self.fd.write(markup)
+
+    def startElement (self, tag, attrs):
+        """write a start tag with its attributes; an attribute whose
+        value is None is written as a bare name"""
+        write = self.fd.write
+        write("<%s" % tag.replace("/", ""))
+        for name, value in attrs.iteritems():
+            if value is not None:
+                write(" %s=\"%s\"" % (name, quote_attrval(value)))
+            else:
+                write(" %s" % name)
+        write(">")
+
+    def endElement (self, tag):
+        """write an end tag"""
+        markup = "</%s>" % tag
+        self.fd.write(markup)
+
+    def doctype (self, data):
+        """write a document type declaration"""
+        markup = "<!DOCTYPE%s>" % data
+        self.fd.write(markup)
+
+    def pi (self, data):
+        """write a processing instruction"""
+        markup = "<?%s?>" % data
+        self.fd.write(markup)
+
+    def cdata (self, data):
+        """write a CDATA section"""
+        markup = "<![CDATA[%s]]>" % data
+        self.fd.write(markup)
+
+    def characters (self, data):
+        """write character data unchanged"""
+        self.fd.write(data)
+
+
+def quote_attrval (val):
+    """Return val with every double quote turned into &quot; so that the
+    result can be enclosed in double quotes as an HTML attribute value.
+    """
+    return '&quot;'.join(val.split('"'))
+
--- a/bk/HtmlParser/htmlparse.c
+++ b/bk/HtmlParser/htmlparse.c
--- a/bk/HtmlParser/htmlparse.h
+++ b/bk/HtmlParser/htmlparse.h
@ -0,0 +1,72 @@
+/* A Bison parser, made by GNU Bison 1.875a.  */
+
+/* Skeleton parser for Yacc-like parsing with Bison,
+   Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place - Suite 330,
+   Boston, MA 02111-1307, USA.  */
+
+/* As a special exception, when this file is copied by Bison into a
+   Bison output file, you may use that output file without restriction.
+   This special exception was added by the Free Software Foundation
+   in version 1.24 of Bison.  */
+
+/* Tokens.  */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+   /* Put the tokens into the symbol table, so that GDB and other debuggers
+      know about them.  */
+   enum yytokentype {
+     T_WAIT = 258,
+     T_ERROR = 259,
+     T_TEXT = 260,
+     T_ELEMENT_START = 261,
+     T_ELEMENT_START_END = 262,
+     T_ELEMENT_END = 263,
+     T_SCRIPT = 264,
+     T_STYLE = 265,
+     T_PI = 266,
+     T_COMMENT = 267,
+     T_CDATA = 268,
+     T_DOCTYPE = 269
+   };
+#endif
+#define T_WAIT 258
+#define T_ERROR 259
+#define T_TEXT 260
+#define T_ELEMENT_START 261
+#define T_ELEMENT_START_END 262
+#define T_ELEMENT_END 263
+#define T_SCRIPT 264
+#define T_STYLE 265
+#define T_PI 266
+#define T_COMMENT 267
+#define T_CDATA 268
+#define T_DOCTYPE 269
+
+
+
+
+#if ! defined (YYSTYPE) && ! defined (YYSTYPE_IS_DECLARED)
+typedef int YYSTYPE;
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+# define YYSTYPE_IS_TRIVIAL 1
+#endif
+
+
+
+
+
--- a/bk/HtmlParser/htmlparse.y
+++ b/bk/HtmlParser/htmlparse.y
@ -0,0 +1,840 @@
+%{
+/* Copyright (C) 2000-2004  Bastian Kleineidam
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+/* Python module definition of a SAX html parser */
+#include "htmlsax.h"
+#include "structmember.h"
+#include <string.h>
+#include <stdio.h>
+
+/* bison type definitions */
+#define YYSTYPE PyObject*
+#define YYPARSE_PARAM scanner
+#define YYLEX_PARAM scanner
+/* extern functions found in htmllex.l */
+extern int yylex(YYSTYPE* yylvalp, void* scanner);
+extern int htmllexInit (void** scanner, UserData* data);
+extern int htmllexDebug (void** scanner, int debug);
+extern int htmllexStart (void* scanner, UserData* data, const char* s, int slen);
+extern int htmllexStop (void* scanner, UserData* data);
+extern int htmllexDestroy (void* scanner);
+extern void* yyget_extra(void*);
+extern int yyget_lineno(void*);
+#define YYERROR_VERBOSE 1
+
+/* standard error reporting, indicating an internal error:
+   writes msg to stderr with a "htmlsax:" prefix and always returns 0;
+   no Python exception is set here */
+static int yyerror (char* msg) {
+    fprintf(stderr, "htmlsax: internal parse error: %s\n", msg);
+    return 0;
+}
+
+/* parser.resolve_entities */
+static PyObject* resolve_entities;
+static PyObject* list_dict;
+
+/* macros for easier scanner state manipulation */
+
+/* true unless tag is one of the elements without an HTML end tag (area, br, ...) */
+#define NO_HTML_END_TAG(tag) !(strcmp(tag, "area")==0 || \
+    strcmp(tag, "base")==0 || \
+    strcmp(tag, "basefont")==0 || \
+    strcmp(tag, "br")==0 || \
+    strcmp(tag, "col")==0 || \
+    strcmp(tag, "frame")==0 || \
+    strcmp(tag, "hr")==0 || \
+    strcmp(tag, "img")==0 || \
+    strcmp(tag, "input")==0 || \
+    strcmp(tag, "isindex")==0 || \
+    strcmp(tag, "link")==0 || \
+    strcmp(tag, "meta")==0 || \
+    strcmp(tag, "param")==0)
+
+/* clear buffer b, returning NULL on error */
+#define CLEAR_BUF(b) \
+    b = PyMem_Resize(b, char, 1); \
+    if (b==NULL) return NULL; \
+    (b)[0] = '\0'
+
+/* clear buffer b, returning NULL and decref self on error */
+#define CLEAR_BUF_DECREF(self, b) \
+    b = PyMem_Resize(b, char, 1); \
+    if (b==NULL) { Py_DECREF(self); return NULL; } \
+    (b)[0] = '\0'
+
+#define CHECK_ERROR(ud, label) \
+    if (ud->error && PyObject_HasAttrString(ud->handler, "error")==1) { \
+	callback = PyObject_GetAttrString(ud->handler, "error"); \
+	if (!callback) { error=1; goto label; } \
+	result = PyObject_CallFunction(callback, "O", ud->error); \
+	if (!result) { error=1; goto label; } \
+    }
+
+/* generic callback macro */
+#define CALLBACK(ud, attr, format, arg, label) \
+    if (PyObject_HasAttrString(ud->handler, attr)==1) { \
+	callback = PyObject_GetAttrString(ud->handler, attr); \
+	if (callback==NULL) { error=1; goto label; } \
+	result = PyObject_CallFunction(callback, format, arg); \
+	if (result==NULL) { error=1; goto label; } \
+	Py_DECREF(callback); \
+	Py_DECREF(result); \
+        callback=result=NULL; \
+    }
+
+/* set old line and column */
+#define SET_OLD_LINECOL \
+    ud->last_lineno = ud->lineno; \
+    ud->last_column = ud->column
+
+/* parser type definition */
+typedef struct {
+    PyObject_HEAD
+    PyObject* handler;
+    UserData* userData;
+    void* scanner;
+} parser_object;
+
+staticforward PyTypeObject parser_type;
+
+/* use Pythons memory management */
+#define malloc PyMem_Malloc
+#define realloc PyMem_Realloc
+#define free PyMem_Free
+
+%}
+
+/* parser options */
+%verbose
+%debug
+%defines
+%output="htmlparse.c"
+%pure_parser
+
+/* parser tokens */
+%token T_WAIT
+%token T_ERROR
+%token T_TEXT
+%token T_ELEMENT_START
+%token T_ELEMENT_START_END
+%token T_ELEMENT_END
+%token T_SCRIPT
+%token T_STYLE
+%token T_PI
+%token T_COMMENT
+%token T_CDATA
+%token T_DOCTYPE
+
+/* the finish_ labels are for error recovery */
+%%
+
+elements: element {}
+    | elements element {}
+    ;
+
+element: T_WAIT { YYACCEPT; /* wait for more lexer input */ }
+| T_ERROR
+{
+    /* an error occurred in the scanner, the python exception must be set */
+    UserData* ud = yyget_extra(scanner);
+    PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
+    YYABORT;
+}
+| T_ELEMENT_START
+{
+    /* $1 is a PyTuple (<tag>, <attrs>)
+       <tag> is a PyString, <attrs> is a PyDict */
+    /* Calls handler.startElement(tag, attrs) if the handler has such an
+       attribute, then reports a pending ud->error via handler.error. */
+    UserData* ud = yyget_extra(scanner);
+    PyObject* callback = NULL;
+    PyObject* result = NULL;
+    /* NOTE(review): PyTuple_GET_ITEM returns borrowed references, yet tag
+       and attrs are Py_XDECREF'ed below in addition to Py_DECREF($1).
+       This is only balanced if the scanner hands over extra references;
+       confirm against htmllex.l. */
+    PyObject* tag = PyTuple_GET_ITEM($1, 0);
+    PyObject* attrs = PyTuple_GET_ITEM($1, 1);
+    int error = 0;
+    if (!tag || !attrs) { error = 1; goto finish_start; }
+    if (PyObject_HasAttrString(ud->handler, "startElement")==1) {
+	callback = PyObject_GetAttrString(ud->handler, "startElement");
+	if (!callback) { error=1; goto finish_start; }
+	result = PyObject_CallFunction(callback, "OO", tag, attrs);
+	if (!result) { error=1; goto finish_start; }
+	Py_DECREF(callback);
+        Py_DECREF(result);
+        /* reset so the cleanup below does not release them a second time */
+        callback=result=NULL;
+    }
+    /* pass ud->error (if set) to the handler's "error" callback */
+    CHECK_ERROR(ud, finish_start);
+finish_start:
+    /* common cleanup for the success and the error path */
+    Py_XDECREF(ud->error);
+    ud->error = NULL;
+    Py_XDECREF(callback);
+    Py_XDECREF(result);
+    Py_XDECREF(tag);
+    Py_XDECREF(attrs);
+    Py_DECREF($1);
+    if (error) {
+	/* store the pending Python exception in the user data and abort
+	   the parse; parser_feed restores it */
+	PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
+	YYABORT;
+    }
+    /* copy the current line and column into last_lineno/last_column */
+    SET_OLD_LINECOL;
+}
+| T_ELEMENT_START_END
+{
+    /* $1 is a PyTuple (<tag>, <attrs>)
+       <tag> is a PyString, <attrs> is a PyDict */
+    UserData* ud = yyget_extra(scanner);
+    PyObject* callback = NULL;
+    PyObject* result = NULL;
+    PyObject* tag = PyTuple_GET_ITEM($1, 0);
+    PyObject* attrs = PyTuple_GET_ITEM($1, 1);
+    int error = 0;
+    char* tagname;
+    if (!tag || !attrs) { error = 1; goto finish_start_end; }
+    if (PyObject_HasAttrString(ud->handler, "startElement")==1) {
+	callback = PyObject_GetAttrString(ud->handler, "startElement");
+	if (!callback) { error=1; goto finish_start_end; }
+	result = PyObject_CallFunction(callback, "OO", tag, attrs);
+	if (!result) { error=1; goto finish_start_end; }
+	Py_DECREF(callback);
+        Py_DECREF(result);
+        callback=result=NULL;
+    }
+    tagname = PyString_AS_STRING(tag);
+    if (PyObject_HasAttrString(ud->handler, "endElement")==1 &&
+	NO_HTML_END_TAG(tagname)) {
+	callback = PyObject_GetAttrString(ud->handler, "endElement");
+	if (callback==NULL) { error=1; goto finish_start_end; }
+	result = PyObject_CallFunction(callback, "O", tag);
+	if (result==NULL) { error=1; goto finish_start_end; }
+	Py_DECREF(callback);
+        Py_DECREF(result);
+        callback=result=NULL;
+    }
+    CHECK_ERROR(ud, finish_start_end);
+finish_start_end:
+    Py_XDECREF(ud->error);
+    ud->error = NULL;
+    Py_XDECREF(callback);
+    Py_XDECREF(result);
+    Py_XDECREF(tag);
+    Py_XDECREF(attrs);
+    Py_DECREF($1);
+    if (error) {
+	PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
+	YYABORT;
+    }
+    SET_OLD_LINECOL;
+}
+| T_ELEMENT_END
+{
+    /* $1 is a PyString */
+    UserData* ud = yyget_extra(scanner);
+    PyObject* callback = NULL;
+    PyObject* result = NULL;
+    int error = 0;
+    char* tagname = PyString_AS_STRING($1);
+    if (PyObject_HasAttrString(ud->handler, "endElement")==1 &&
+	NO_HTML_END_TAG(tagname)) {
+	callback = PyObject_GetAttrString(ud->handler, "endElement");
+	if (callback==NULL) { error=1; goto finish_end; }
+	result = PyObject_CallFunction(callback, "O", $1);
+	if (result==NULL) { error=1; goto finish_end; }
+	Py_DECREF(callback);
+	Py_DECREF(result);
+        callback=result=NULL;
+    }
+    CHECK_ERROR(ud, finish_end);
+finish_end:
+    Py_XDECREF(ud->error);
+    ud->error = NULL;
+    Py_XDECREF(callback);
+    Py_XDECREF(result);
+    Py_DECREF($1);
+    if (error) {
+	PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
+	YYABORT;
+    }
+    SET_OLD_LINECOL;
+}
+| T_COMMENT
+{
+    /* $1 is a PyString */
+    UserData* ud = yyget_extra(scanner);
+    PyObject* callback = NULL;
+    PyObject* result = NULL;
+    int error = 0;
+    CALLBACK(ud, "comment", "O", $1, finish_comment);
+    CHECK_ERROR(ud, finish_comment);
+finish_comment:
+    Py_XDECREF(ud->error);
+    ud->error = NULL;
+    Py_XDECREF(callback);
+    Py_XDECREF(result);
+    Py_DECREF($1);
+    if (error) {
+	PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
+	YYABORT;
+    }
+    SET_OLD_LINECOL;
+}
+| T_PI
+{
+    /* $1 is a PyString */
+    UserData* ud = yyget_extra(scanner);
+    PyObject* callback = NULL;
+    PyObject* result = NULL;
+    int error = 0;
+    CALLBACK(ud, "pi", "O", $1, finish_pi);
+    CHECK_ERROR(ud, finish_pi);
+finish_pi:
+    Py_XDECREF(ud->error);
+    ud->error = NULL;
+    Py_XDECREF(callback);
+    Py_XDECREF(result);
+    Py_DECREF($1);
+    if (error) {
+	PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
+	YYABORT;
+    }
+    SET_OLD_LINECOL;
+}
+| T_CDATA
+{
+    /* $1 is a PyString */
+    UserData* ud = yyget_extra(scanner);
+    PyObject* callback = NULL;
+    PyObject* result = NULL;
+    int error = 0;
+    CALLBACK(ud, "cdata", "O", $1, finish_cdata);
+    CHECK_ERROR(ud, finish_cdata);
+finish_cdata:
+    Py_XDECREF(ud->error);
+    ud->error = NULL;
+    Py_XDECREF(callback);
+    Py_XDECREF(result);
+    Py_DECREF($1);
+    if (error) {
+	PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
+	YYABORT;
+    }
+    SET_OLD_LINECOL;
+}
+| T_DOCTYPE
+{
+    /* $1 is a PyString */
+    UserData* ud = yyget_extra(scanner);
+    PyObject* callback = NULL;
+    PyObject* result = NULL;
+    int error = 0;
+    CALLBACK(ud, "doctype", "O", $1, finish_doctype);
+    CHECK_ERROR(ud, finish_doctype);
+finish_doctype:
+    Py_XDECREF(ud->error);
+    ud->error = NULL;
+    Py_XDECREF(callback);
+    Py_XDECREF(result);
+    Py_DECREF($1);
+    if (error) {
+	PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
+	YYABORT;
+    }
+    SET_OLD_LINECOL;
+}
+| T_SCRIPT
+{
+    /* $1 is a PyString */
+    UserData* ud = yyget_extra(scanner);
+    PyObject* callback = NULL;
+    PyObject* result = NULL;
+    int error = 0;
+    CALLBACK(ud, "characters", "O", $1, finish_script);
+    CALLBACK(ud, "endElement", "s", "script", finish_script);
+    CHECK_ERROR(ud, finish_script);
+finish_script:
+    Py_XDECREF(ud->error);
+    ud->error = NULL;
+    Py_XDECREF(callback);
+    Py_XDECREF(result);
+    Py_DECREF($1);
+    if (error) {
+	PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
+	YYABORT;
+    }
+    SET_OLD_LINECOL;
+}
+| T_STYLE
+{
+    /* $1 is a PyString */
+    UserData* ud = yyget_extra(scanner);
+    PyObject* callback = NULL;
+    PyObject* result = NULL;
+    int error = 0;
+    CALLBACK(ud, "characters", "O", $1, finish_style);
+    CALLBACK(ud, "endElement", "s", "style", finish_style);
+    CHECK_ERROR(ud, finish_style);
+finish_style:
+    Py_XDECREF(ud->error);
+    ud->error = NULL;
+    Py_XDECREF(callback);
+    Py_XDECREF(result);
+    Py_DECREF($1);
+    if (error) {
+	PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
+	YYABORT;
+    }
+    SET_OLD_LINECOL;
+}
+| T_TEXT
+{
+    /* $1 is a PyString */
+    /* Remember this is also called as a lexer error fallback */
+    UserData* ud = yyget_extra(scanner);
+    PyObject* callback = NULL;
+    PyObject* result = NULL;
+    int error = 0;
+    CALLBACK(ud, "characters", "O", $1, finish_characters);
+    CHECK_ERROR(ud, finish_characters);
+finish_characters:
+    Py_XDECREF(ud->error);
+    ud->error = NULL;
+    Py_XDECREF(callback);
+    Py_XDECREF(result);
+    Py_DECREF($1);
+    if (error) {
+	PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
+	YYABORT;
+    }
+    SET_OLD_LINECOL;
+}
+;
+
+%%
+
+/* disable python memory interface */
+#undef malloc
+#undef realloc
+#undef free
+
+/* create parser object: allocates the instance, its UserData record with
+   empty buf/tmp_buf strings, and the scanner. The handler starts out as
+   None. Returns a new reference, or NULL on failure. */
+static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds) {
+    parser_object* self;
+    if ((self = (parser_object*) type->tp_alloc(type, 0)) == NULL)
+    {
+        return NULL;
+    }
+    Py_INCREF(Py_None);
+    self->handler = Py_None;
+    self->scanner = NULL;
+    /* PyMem_New takes an element count, not a byte size */
+    self->userData = PyMem_New(UserData, 1);
+    if (self->userData == NULL)
+    {
+        /* parser_dealloc dereferences userData, so release by hand */
+        Py_DECREF(self->handler);
+        self->handler = NULL;
+        type->tp_free((PyObject*) self);
+        return PyErr_NoMemory();
+    }
+    /* give every pointer member a defined value before the first
+       operation that can fail, because the failure paths below run
+       parser_dealloc which frees buf and tmp_buf */
+    self->userData->handler = self->handler;
+    self->userData->buf = NULL;
+    self->userData->tmp_buf = NULL;
+    self->userData->tmp_tag = self->userData->tmp_attrname =
+        self->userData->tmp_attrval = self->userData->tmp_attrs =
+        self->userData->lexbuf = NULL;
+    self->userData->exc_type = NULL;
+    self->userData->exc_val = NULL;
+    self->userData->exc_tb = NULL;
+    self->userData->error = NULL;
+    self->userData->resolve_entities = resolve_entities;
+    self->userData->list_dict = list_dict;
+    /* reset positions */
+    self->userData->nextpos = 0;
+    self->userData->bufpos = 0;
+    self->userData->pos = 0;
+    self->userData->column = 1;
+    self->userData->last_column = 1;
+    self->userData->lineno = 1;
+    self->userData->last_lineno = 1;
+    /* both buffers start as empty NUL-terminated strings */
+    CLEAR_BUF_DECREF(self, self->userData->buf);
+    CLEAR_BUF_DECREF(self, self->userData->tmp_buf);
+    if (htmllexInit(&(self->scanner), self->userData)!=0)
+    {
+        Py_DECREF(self);
+        return NULL;
+    }
+    return (PyObject*) self;
+}
+
+
+/* initialize parser object: installs the optional "handler" argument,
+   keeping the current handler when none is given.
+   Returns 0 on success and -1 if argument parsing fails. */
+static int parser_init (parser_object* self, PyObject* args, PyObject* kwds) {
+    static char *kwlist[] = {"handler", NULL};
+    PyObject* handler = NULL;
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwlist, &handler)) {
+        return -1;
+    }
+    if (handler!=NULL) {
+        /* replace the old handler in the object and in the user data */
+        Py_DECREF(self->handler);
+        Py_INCREF(handler);
+        self->handler = handler;
+        self->userData->handler = self->handler;
+    }
+    return 0;
+}
+
+
+/* traverse all used subobjects participating in reference cycles;
+   returns 0, or the first non-zero result of visit */
+static int parser_traverse (parser_object* self, visitproc visit, void* arg) {
+    int err;
+    /* handler is NULL once parser_clear has run */
+    if (self->handler != NULL) {
+        err = visit(self->handler, arg);
+        if (err != 0) {
+            /* any non-zero result has to be passed on unchanged */
+            return err;
+        }
+    }
+    return 0;
+}
+
+
+/* clear all used subobjects participating in reference cycles;
+   always returns 0 */
+static int parser_clear (parser_object* self) {
+    /* detach the reference before dropping it, so that code triggered by
+       the decref never sees a pointer to a released object */
+    PyObject* handler = self->handler;
+    self->handler = NULL;
+    if (self->userData != NULL) {
+        self->userData->handler = NULL;
+    }
+    Py_XDECREF(handler);
+    return 0;
+}
+
+
+/* free all allocated resources of parser object */
+/* NOTE(review): if parser_type sets Py_TPFLAGS_HAVE_GC, the object should
+   be untracked (PyObject_GC_UnTrack) before its members are torn down;
+   the type flags are not visible from here. */
+static void parser_dealloc (parser_object* self) {
+    htmllexDestroy(self->scanner);
+    if (self->userData != NULL) {
+        parser_clear(self);
+        PyMem_Del(self->userData->buf);
+        PyMem_Del(self->userData->tmp_buf);
+        PyMem_Del(self->userData);
+        self->userData = NULL;
+    }
+    else {
+        /* parser_new failed before userData existed; only the handler
+           reference is left to release */
+        Py_XDECREF(self->handler);
+        self->handler = NULL;
+    }
+    self->ob_type->tp_free((PyObject*)self);
+}
+
+
+/* feed a chunk of data to the parser.
+   Takes one string argument; returns None, or NULL with an exception set
+   if the scanner cannot be started or stopped or the parse is aborted. */
+static PyObject* parser_feed (parser_object* self, PyObject* args) {
+    /* set up the parse string */
+    int slen = 0;
+    char* s = NULL;
+    if (!PyArg_ParseTuple(args, "t#", &s, &slen)) {
+        PyErr_SetString(PyExc_TypeError, "string arg required");
+        return NULL;
+    }
+    /* parse */
+    if (htmllexStart(self->scanner, self->userData, s, slen)!=0) {
+        PyErr_SetString(PyExc_MemoryError, "could not start scanner");
+        return NULL;
+    }
+    if (yyparse(self->scanner)!=0) {
+        if (self->userData->exc_type!=NULL) {
+            /* note: we give away these objects, so don't decref */
+            PyErr_Restore(self->userData->exc_type,
+                          self->userData->exc_val,
+                          self->userData->exc_tb);
+            /* forget the pointers: they are no longer ours, and a later
+               failure that stores no exception must not restore them
+               a second time */
+            self->userData->exc_type = NULL;
+            self->userData->exc_val = NULL;
+            self->userData->exc_tb = NULL;
+        }
+        htmllexStop(self->scanner, self->userData);
+        if (!PyErr_Occurred()) {
+            /* the parser failed without storing an exception (see
+               yyerror); never return NULL with no exception set */
+            PyErr_SetString(PyExc_SystemError, "htmlsax: internal parse error");
+        }
+        return NULL;
+    }
+    if (htmllexStop(self->scanner, self->userData)!=0) {
+        PyErr_SetString(PyExc_MemoryError, "could not stop scanner");
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+
+/* flush all parser buffers:
+   drops the temporary tag/attribute state, passes any text left in
+   userData->buf to the handler's "characters" callback (if it has one),
+   then destroys and re-creates the scanner.
+   Takes no arguments; returns the integer 0, or NULL on error. */
+static PyObject* parser_flush (parser_object* self, PyObject* args) {
+    /* res is never modified below, so the return value is always 0 */
+    int res = 0;
+    if (!PyArg_ParseTuple(args, "")) {
+	PyErr_SetString(PyExc_TypeError, "no args required");
+        return NULL;
+    }
+    /* reset parser variables */
+    /* NOTE(review): CLEAR_BUF returns NULL from this function when the
+       resize fails, and no Python exception is set on that path */
+    CLEAR_BUF(self->userData->tmp_buf);
+    Py_XDECREF(self->userData->tmp_tag);
+    Py_XDECREF(self->userData->tmp_attrs);
+    Py_XDECREF(self->userData->tmp_attrval);
+    Py_XDECREF(self->userData->tmp_attrname);
+    self->userData->tmp_tag = self->userData->tmp_attrs =
+	self->userData->tmp_attrval = self->userData->tmp_attrname = NULL;
+    self->userData->bufpos = 0;
+    if (strlen(self->userData->buf)) {
+        /* XXX set line, col */
+        int error = 0;
+	/* copy the buffered text before the buffer is emptied */
+	PyObject* s = PyString_FromString(self->userData->buf);
+	PyObject* callback = NULL;
+	PyObject* result = NULL;
+	/* reset buffer */
+	/* NOTE(review): if this CLEAR_BUF fails, the early return skips
+	   finish_flush and s is not released */
+	CLEAR_BUF(self->userData->buf);
+	if (s==NULL) { error=1; goto finish_flush; }
+	if (PyObject_HasAttrString(self->handler, "characters")==1) {
+	    callback = PyObject_GetAttrString(self->handler, "characters");
+	    if (callback==NULL) { error=1; goto finish_flush; }
+	    result = PyObject_CallFunction(callback, "O", s);
+	    if (result==NULL) { error=1; goto finish_flush; }
+	}
+    finish_flush:
+	/* common cleanup for the success and the error path */
+	Py_XDECREF(callback);
+	Py_XDECREF(result);
+	Py_XDECREF(s);
+	if (error==1) {
+	    return NULL;
+	}
+    }
+    /* start over with a fresh scanner */
+    if (htmllexDestroy(self->scanner)!=0) {
+        PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data");
+        return NULL;
+    }
+    self->scanner = NULL;
+    if (htmllexInit(&(self->scanner), self->userData)!=0) {
+        PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data");
+        return NULL;
+    }
+    return Py_BuildValue("i", res);
+}
+
+
+/* method lineno(): takes no arguments and returns userData->lineno,
+   the current parser line number, as a Python integer */
+static PyObject* parser_lineno (parser_object* self, PyObject* args) {
+    if (PyArg_ParseTuple(args, "")) {
+        return Py_BuildValue("i", self->userData->lineno);
+    }
+    PyErr_SetString(PyExc_TypeError, "no args required");
+    return NULL;
+}
+
+
+/* method last_lineno(): takes no arguments and returns
+   userData->last_lineno, the last parser line number, as a Python
+   integer */
+static PyObject* parser_last_lineno (parser_object* self, PyObject* args) {
+    if (PyArg_ParseTuple(args, "")) {
+        return Py_BuildValue("i", self->userData->last_lineno);
+    }
+    PyErr_SetString(PyExc_TypeError, "no args required");
+    return NULL;
+}
+
+
+/* method column(): takes no arguments and returns userData->column,
+   the current parser column number, as a Python integer */
+static PyObject* parser_column (parser_object* self, PyObject* args) {
+    if (PyArg_ParseTuple(args, "")) {
+        return Py_BuildValue("i", self->userData->column);
+    }
+    PyErr_SetString(PyExc_TypeError, "no args required");
+    return NULL;
+}
+
+
+/* method last_column(): takes no arguments and returns
+   userData->last_column, the last parser column number, as a Python
+   integer */
+static PyObject* parser_last_column (parser_object* self, PyObject* args) {
+    if (PyArg_ParseTuple(args, "")) {
+        return Py_BuildValue("i", self->userData->last_column);
+    }
+    PyErr_SetString(PyExc_TypeError, "no args required");
+    return NULL;
+}
+
+
+/* method pos(): takes no arguments and returns userData->pos, the
+   parser position in the data stream, as a Python integer */
+static PyObject* parser_pos (parser_object* self, PyObject* args) {
+    if (PyArg_ParseTuple(args, "")) {
+        return Py_BuildValue("i", self->userData->pos);
+    }
+    PyErr_SetString(PyExc_TypeError, "no args required");
+    return NULL;
+}
+
+
+/* reset the parser. This will erase all buffered data!
+   Destroys the scanner, empties buf and tmp_buf, resets all position
+   counters (positions to 0, lines and columns to 1) and creates a new
+   scanner. Takes no arguments; returns None, or NULL on error. */
+static PyObject* parser_reset (parser_object* self, PyObject* args) {
+    if (!PyArg_ParseTuple(args, "")) {
+	PyErr_SetString(PyExc_TypeError, "no args required");
+	return NULL;
+    }
+    if (htmllexDestroy(self->scanner)!=0) {
+        PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data");
+        return NULL;
+    }
+    /* reset buffer */
+    /* NOTE(review): if a CLEAR_BUF fails here the function returns while
+       self->scanner still refers to the scanner destroyed above, and no
+       Python exception is set */
+    CLEAR_BUF(self->userData->buf);
+    CLEAR_BUF(self->userData->tmp_buf);
+    self->userData->bufpos =
+        self->userData->pos =
+        self->userData->nextpos = 0;
+    self->userData->column =
+	self->userData->last_column =
+	self->userData->lineno =
+	self->userData->last_lineno = 1;
+    /* NOTE(review): unlike parser_flush, the tmp_* objects are set to NULL
+       without a Py_XDECREF; unless htmllexDestroy releases them this
+       leaks references -- confirm against htmllex.l */
+    self->userData->tmp_tag = self->userData->tmp_attrs =
+        self->userData->tmp_attrval = self->userData->tmp_attrname = NULL;
+    self->scanner = NULL;
+    if (htmllexInit(&(self->scanner), self->userData)!=0) {
+        PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data");
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+
+/* method debug(level): set the debug level, if its >0, debugging is on,
+   =0 means off. The level is stored in yydebug and passed to the
+   scanner; the value returned by htmllexDebug is handed back as a
+   Python integer. */
+static PyObject* parser_debug (parser_object* self, PyObject* args) {
+    int level;
+    int lexresult;
+    if (!PyArg_ParseTuple(args, "i", &level)) {
+        return NULL;
+    }
+    yydebug = level;
+    lexresult = htmllexDebug(&(self->scanner), level);
+    return PyInt_FromLong((long)lexresult);
+}
+
+
+/* getter of the "handler" attribute: returns a new reference to the
+   current handler object */
+static PyObject* parser_gethandler (parser_object* self, void* closure) {
+    PyObject* handler = self->handler;
+    Py_INCREF(handler);
+    return handler;
+}
+
+/* setter of the "handler" attribute: installs value as the new handler
+   in the object and in the user data. Deleting the attribute
+   (value == NULL) raises TypeError. Returns 0 on success, -1 on error. */
+static int parser_sethandler (parser_object* self, PyObject* value, void* closure) {
+    if (value != NULL) {
+        Py_DECREF(self->handler);
+        Py_INCREF(value);
+        self->handler = value;
+        self->userData->handler = self->handler;
+        return 0;
+    }
+    PyErr_SetString(PyExc_TypeError, "Cannot delete parser handler");
+    return -1;
+}
+
+/* type interface */
+
+/* Member table of the parser type: it contains only the terminating
+ * sentinel, so no struct members are exported directly. */
+static PyMemberDef parser_members[] = {
+    {NULL}  /* Sentinel */
+};
+
+/* Computed attributes of the parser type: "handler" is read through
+ * parser_gethandler and written through parser_sethandler. */
+static PyGetSetDef parser_getset[] = {
+    {"handler", (getter)parser_gethandler, (setter)parser_sethandler,
+     "handler object", NULL},
+    {NULL}  /* Sentinel */
+};
+
+/* Method table of the parser type. Every method is registered with
+ * METH_VARARGS, i.e. it receives its arguments as a tuple. */
+static PyMethodDef parser_methods[] = {
+    {"feed",  (PyCFunction)parser_feed, METH_VARARGS, "feed data to parse incremental"},
+    {"reset", (PyCFunction)parser_reset, METH_VARARGS, "reset the parser (no flushing)"},
+    {"flush", (PyCFunction)parser_flush, METH_VARARGS, "flush parser buffers"},
+    {"debug", (PyCFunction)parser_debug, METH_VARARGS, "set debug level"},
+    {"lineno",      (PyCFunction)parser_lineno,      METH_VARARGS, "get the current line number"},
+    {"last_lineno", (PyCFunction)parser_last_lineno, METH_VARARGS, "get the last line number"},
+    {"column",      (PyCFunction)parser_column,      METH_VARARGS, "get the current column"},
+    {"last_column", (PyCFunction)parser_last_column, METH_VARARGS, "get the last column"},
+    {"pos",         (PyCFunction)parser_pos,         METH_VARARGS, "get the current scanner position"},
+    {NULL} /* Sentinel */
+};
+
+
+/* Type object for bk.HtmlParser.htmlsax.parser. Slots given as 0 are not
+ * provided by this type. The flags allow subclassing (BASETYPE) and
+ * enable cyclic garbage collection (HAVE_GC), for which tp_traverse and
+ * tp_clear are supplied. */
+static PyTypeObject parser_type = {
+    PyObject_HEAD_INIT(NULL)
+    0,              /* ob_size */
+    "bk.HtmlParser.htmlsax.parser",      /* tp_name */
+    sizeof(parser_object), /* tp_basicsize */
+    0,              /* tp_itemsize */
+    /* methods */
+    (destructor)parser_dealloc, /* tp_dealloc */
+    0,              /* tp_print */
+    0,              /* tp_getattr */
+    0,              /* tp_setattr */
+    0,              /* tp_compare */
+    0,              /* tp_repr */
+    0,              /* tp_as_number */
+    0,              /* tp_as_sequence */
+    0,              /* tp_as_mapping */
+    0,              /* tp_hash */
+    0,              /* tp_call */
+    0,              /* tp_str */
+    0,              /* tp_getattro */
+    0,              /* tp_setattro */
+    0,              /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 
+      Py_TPFLAGS_HAVE_GC, /* tp_flags */
+    "HTML parser object", /* tp_doc */
+    (traverseproc)parser_traverse, /* tp_traverse */
+    (inquiry)parser_clear, /* tp_clear */
+    0,              /* tp_richcompare */
+    0,              /* tp_weaklistoffset */
+    0,              /* tp_iter */
+    0,              /* tp_iternext */
+    parser_methods, /* tp_methods */
+    parser_members, /* tp_members */
+    parser_getset,  /* tp_getset */
+    0,              /* tp_base */
+    0,              /* tp_dict */
+    0,              /* tp_descr_get */
+    0,              /* tp_descr_set */
+    0,              /* tp_dictoffset */
+    (initproc)parser_init,  /* tp_init */
+    0,              /* tp_alloc */
+    parser_new,     /* tp_new */
+};
+
+
+/* python module interface 
+     "Create a new HTML parser object with handler (which may be None).\n"
+     "\n"
+     "Used callbacks (they don't have to be defined) of a handler are:\n"
+     "comment(data): <!--data-->\n"
+     "startElement(tag, attrs): <tag {attr1:value1,attr2:value2,..}>\n"
+     "endElement(tag): </tag>\n"
+     "doctype(data): <!DOCTYPE data?>\n"
+     "pi(name, data=None): <?name data?>\n"
+     "cdata(data): <![CDATA[data]]>\n"
+     "characters(data): data\n"
+     "\n"
+     "Additionally, there are error and warning callbacks:\n"
+     "error(msg)\n"
+     "warning(msg)\n"
+     "fatalError(msg)\n"},
+
+*/
+
+/* Module-level function table: the module exports no functions, only
+ * the parser type, so the table holds just the sentinel. */
+static PyMethodDef htmlsax_methods[] = {
+    {NULL} /* Sentinel */
+};
+
+
+#ifndef PyMODINIT_FUNC	/* declarations for DLL import/export */
+#define PyMODINIT_FUNC void
+#endif
+/* Initialization of the htmlsax module.
+ * Readies the parser type, creates the module, publishes the type as
+ * "parser", and caches two Python callables used by the parser:
+ * bk.HtmlParser.resolve_entities and bk.containers.ListDict.
+ * On failure the function returns early with the Python exception set. */
+PyMODINIT_FUNC inithtmlsax (void) {
+    PyObject* m;
+    PyObject* mod;
+    if (PyType_Ready(&parser_type) < 0) {
+        return;
+    }
+    /* Py_InitModule3 returns a borrowed reference: m needs no DECREF */
+    if ((m = Py_InitModule3("htmlsax", htmlsax_methods, "SAX HTML parser routines"))==NULL) {
+        return;
+    }
+    Py_INCREF(&parser_type);
+    if (PyModule_AddObject(m, "parser", (PyObject *)&parser_type)==-1) {
+        /* init error */
+        PyErr_Print();
+    }
+    /* PyImport_ImportModule returns a NEW reference; release each module
+     * as soon as the wanted attribute has been fetched from it */
+    if ((mod = PyImport_ImportModule("bk.HtmlParser"))==NULL) {
+        return;
+    }
+    resolve_entities = PyObject_GetAttrString(mod, "resolve_entities");
+    Py_DECREF(mod);
+    if (resolve_entities==NULL) {
+        return;
+    }
+    if ((mod = PyImport_ImportModule("bk.containers"))==NULL) {
+        return;
+    }
+    list_dict = PyObject_GetAttrString(mod, "ListDict");
+    Py_DECREF(mod);
+    if (list_dict==NULL) {
+        return;
+    }
+}
--- a/bk/HtmlParser/htmlsax.h
+++ b/bk/HtmlParser/htmlsax.h
@ -0,0 +1,83 @@
+/* Copyright (C) 2000-2004  Bastian Kleineidam
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+#ifndef HTMLSAX_H
+#define HTMLSAX_H
+
+#include "Python.h"
+
+/* require Python >= 2.3 */
+#ifndef PY_VERSION_HEX
+#error please install Python >= 2.3
+#endif
+
+#if PY_VERSION_HEX < 0x02030000
+#error please install Python >= 2.3
+#endif
+
+/* this will be in Python 2.4 */
+#ifndef Py_RETURN_NONE
+#define Py_RETURN_NONE do {Py_INCREF(Py_None); return Py_None;} while (0)
+#endif
+
+/* user_data type for SAX calls: the state shared between the parser
+ * object, the scanner and the grammar actions */
+typedef struct {
+    /* the Python SAX object to issue callbacks */
+    PyObject* handler;
+    /* Buffer to store still-to-be-scanned characters. After recognizing
+     * a complete syntax element, all data up to bufpos will be removed.
+     * Before scanning you should append new data to this buffer.
+     */
+    char* buf;
+    /* current position in the buffer counting from zero */
+    unsigned int bufpos;
+    /* current position of next syntax element */
+    unsigned int nextpos;
+    /* position in the stream of data already seen, counting from zero */
+    unsigned int pos;
+    /* line counter, counting from one */
+    unsigned int lineno;
+    /* last value of line counter */
+    unsigned int last_lineno;
+    /* column counter
+     * NOTE(review): parser_reset() initializes column and last_column
+     * to 1, not 0 -- confirm the intended base */
+    unsigned int column;
+    /* last value of column counter */
+    unsigned int last_column;
+    /* input buffer of lexer, must be deleted when the parsing stops */
+    void* lexbuf;
+    /* temporary character buffer */
+    char* tmp_buf;
+    /* temporary HTML start or end tag name */
+    PyObject* tmp_tag;
+    /* temporary HTML start tag attribute name */
+    PyObject* tmp_attrname;
+    /* temporary HTML start tag attribute value */
+    PyObject* tmp_attrval;
+    /* temporary HTML start tag attribute list
+     * (presumably an instance of the list_dict class below -- TODO confirm) */
+    PyObject* tmp_attrs;
+    /* callable used to resolve entities (the module init function looks up
+     * bk.HtmlParser.resolve_entities; presumably stored here -- TODO confirm) */
+    PyObject* resolve_entities;
+    /* ordered dictionary class (the module init function looks up
+     * bk.containers.ListDict; presumably stored here -- TODO confirm) */
+    PyObject* list_dict;
+    /* stored Python exception (if error occurred in scanner) */
+    PyObject* exc_type;
+    PyObject* exc_val;
+    PyObject* exc_tb;
+    /* error string */
+    PyObject* error;
+} UserData;
+
+#endif
--- a/bk/HtmlParser/s_util.c
+++ b/bk/HtmlParser/s_util.c
@ -0,0 +1,54 @@
+/*
+ *  linux/lib/string.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+#include <string.h>
+
+#if !defined(HAVE_STRLCPY)
+/**
+ * strlcpy - Copy a %NUL terminated string into a sized buffer
+ * @dst: Where to copy the string to
+ * @src: Where to copy the string from
+ * @count: size of destination buffer
+ *
+ * Compatible with *BSD: unless the buffer size is zero, the result is
+ * always a valid NUL-terminated string that fits in the buffer, truncated
+ * if necessary. Unlike strncpy() the rest of the buffer is not padded.
+ * Returns strlen(@src), so truncation happened if the result >= @count.
+ */
+size_t strlcpy (char *dst, const char *src, size_t count)
+{
+    size_t srclen = strlen(src);
+    size_t ncopy;
+
+    if (count == 0)
+        return srclen;
+    /* leave room for the terminating NUL */
+    ncopy = (srclen < count) ? srclen : count - 1;
+    memcpy(dst, src, ncopy);
+    dst[ncopy] = '\0';
+    return srclen;
+}
+#endif /* !HAVE_STRLCPY */
+
+#if !defined(HAVE_STRLCAT)
+/**
+ * strlcat - Append a length-limited, %NUL-terminated string to another
+ * @dest: The string to be appended to
+ * @src: The string to append to it
+ * @count: The size of the destination buffer.
+ *
+ * Appends as much of @src as fits and always NUL-terminates the result,
+ * provided @dest holds a terminated string within its first @count bytes.
+ * Returns the length the concatenation would have without truncation.
+ * If @dest is not terminated within @count bytes (including @count == 0)
+ * nothing is written and @count + strlen(@src) is returned, as on *BSD.
+ */
+size_t strlcat (char *dest, const char *src, size_t count)
+{
+    size_t len = strlen(src);
+    size_t dsize = 0;
+    size_t ncopy;
+
+    /* bounded scan: never read dest beyond the caller's buffer */
+    while (dsize < count && dest[dsize] != '\0')
+        dsize++;
+    if (dsize == count) {
+        /* no room at all; subtracting here would wrap around size_t */
+        return count + len;
+    }
+    /* space left after the existing string, minus the terminating NUL */
+    ncopy = count - dsize - 1;
+    if (len < ncopy)
+        ncopy = len;
+    memcpy(dest + dsize, src, ncopy);
+    dest[dsize + ncopy] = '\0';
+    return dsize + len;
+}
+#endif /* !HAVE_STRLCAT */
--- a/bk/HtmlParser/s_util.h
+++ b/bk/HtmlParser/s_util.h
@ -0,0 +1,14 @@
+/*
+ *  linux/lib/string.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+
+/* Fallback prototype, declared only when HAVE_STRLCPY is not defined:
+ * copy src into the size-byte buffer dst. */
+#if !defined(HAVE_STRLCPY)
+size_t strlcpy(char *dst, const char *src, size_t size);
+#endif /* !HAVE_STRLCPY */
+
+/* Fallback prototype, declared only when HAVE_STRLCAT is not defined:
+ * append src to the string in the size-byte buffer dst. */
+#if !defined(HAVE_STRLCAT)
+size_t strlcat(char *dst, const char *src, size_t size);
+#endif /* !HAVE_STRLCAT */
--- a/bk/Makefile
+++ b/bk/Makefile
@ -0,0 +1,7 @@
+# path of the other bk source tree this directory is compared with
+# and updated from
+D = ../../bk-python/bk
+
+# both targets are commands, not files: run them even if a file named
+# "diff" or "update" exists in this directory
+.PHONY: diff update
+
+# show the differences between this directory and $(D)
+diff:
+	diff -BurN . $(D)
+
+# copy everything from $(D) over this directory
+update:
+	cp -r $(D)/* .
--- a/bk/init.py
+++ b/bk/init.py
@ -0,0 +1 @@
+# -*- coding: iso-8859-1 -*-
--- a/bk/ansicolor.py
+++ b/bk/ansicolor.py
@ -0,0 +1,79 @@
+# -*- coding: iso-8859-1 -*-
+"""ANSI Color definitions and functions"""
+# Copyright (C) 2000-2004  Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+import os
+
+# Escape for ANSI colors; %s is replaced by the ";"-separated numbers
+AnsiEsc="\x1b[%sm"
+
+# type numbers (text attributes), looked up by esc_ansicolor for the part
+# before the ";" in a color definition
+AnsiType = {
+    'bold':      '1',
+    'light':     '2',
+    'underline': '4',
+    'blink':     '5',
+    'invert':    '7',
+    'concealed': '8',
+}
+
+# color numbers; lower-case names map to the codes 30-37 and capitalized
+# names to 40-47 (the ANSI foreground and background ranges)
+AnsiColor = {
+    'default': '0',
+    'black':   '30',
+    'red':     '31',
+    'green':   '32',
+    'yellow':  '33',
+    'blue':    '34',
+    'purple':  '35',
+    'cyan':    '36',
+    'white':   '37',
+    'Black':   '40',
+    'Red':     '41',
+    'Green':   '42',
+    'Yellow':  '43',
+    'Blue':    '44',
+    'Purple':  '45',
+    'Cyan':    '46',
+    'White':   '47',
+
+}
+
+
+# pc speaker beep escape code
+Beep = "\007"
+
+
+def esc_ansicolor (color):
+    """convert a named color definition to an escaped ANSI color"""
+    ctype = ''
+    if ";" in color:
+        ctype, color = color.split(";", 1)
+        ctype = AnsiType.get(ctype, '')+";"
+    cnum = AnsiColor.get(color, '0')
+    return AnsiEsc % (ctype+cnum)
+
+AnsiReset = esc_ansicolor("default")
+
+
+def colorize (text, color=None):
+    """return text colorized if TERM is set"""
+    if (color is not None) and os.environ.get('TERM'):
+        color = esc_ansicolor(color)
+        return '%s%s%s' % (color, text, AnsiReset)
+    else:
+        return text
--- a/bk/containers.py
+++ b/bk/containers.py
@ -0,0 +1,218 @@
+# -*- coding: iso-8859-1 -*-
+"""special container classes"""
+# Copyright (C) 2004  Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+class SetList (list):
+    """a list that eliminates all duplicates
+    """
+
+    def append (self, x):
+        """append only if not already there"""
+        if x not in self:
+            super(SetList, self).append(x)
+
+    def extend (self, x):
+        """extend while eliminating duplicates by appending item for item"""
+        for i in x:
+            self.append(i)
+
+    def insert (self, i, x):
+        """insert only if not already there"""
+        if x not in self:
+            super(SetList, self).insert(i, x)
+
+    def __setitem__ (self, key, value):
+        """set new value, and eliminate old duplicates (if any)"""
+        oldvalues = []
+        for i in range(len(self)):
+            if self[i]==value:
+                oldvalues.append(i)
+        super(SetList, self).__setitem__(key, value)
+        # remove old duplicates (from last to first)
+        oldvalues.reverse()
+        for i in oldvalues:
+            if i!=key:
+                del self[key]
+
+
+class ListDict (dict):
+    """a dictionary whose iterators reflect the order in which elements
+       were added
+    """
+
+    def __init__ (self):
+        """initialize sorted key list"""
+        # sorted list of keys
+        self._keys = []
+
+    def __setitem__ (self, key, value):
+        """add key,value to dict, append key to sorted list"""
+        if not self.has_key(key):
+            self._keys.append(key)
+        super(ListDict, self).__setitem__(key, value)
+
+    def __delitem__ (self, key):
+        """remove key from dict"""
+        self._keys.remove(key)
+        super(ListDict, self).__delitem__(key)
+
+    def values (self):
+        """return sorted list of values"""
+        return [self[k] for k in self._keys]
+
+    def items (self):
+        """return sorted list of items"""
+        return [(k, self[k]) for k in self._keys]
+
+    def keys (self):
+        """return sorted list of keys"""
+        return self._keys[:]
+
+    def itervalues (self):
+        """return iterator over sorted values"""
+        return iter(self.values())
+
+    def iteritems (self):
+        """return iterator over sorted items"""
+        return iter(self.items())
+
+    def iterkeys (self):
+        """return iterator over sorted keys"""
+        return iter(self.keys())
+
+    def clear (self):
+        """remove all dict entires"""
+        self._keys = []
+        super(ListDict, self).clear()
+
+
+class LRU (object):
+    """
+    Implementation of a length-limited O(1) LRU queue.
+    Built for and used by PyPE:
+    http://pype.sourceforge.net
+    Copyright 2003 Josiah Carlson. (Licensed under the GPL)
+    """
+    class Node (object):
+        def __init__ (self, prev, me):
+            self.prev = prev
+            self.me = me
+            self.next = None
+
+    def __init__ (self, count, pairs=[]):
+        self.count = max(count, 1)
+        self.d = {}
+        self.first = None
+        self.last = None
+        for key, value in pairs:
+            self[key] = value
+
+    def __contains__ (self, obj):
+        return obj in self.d
+
+    def has_key (self, obj):
+        return self.d.has_key(obj)
+
+    def __getitem__ (self, obj):
+        a = self.d[obj].me
+        self[a[0]] = a[1]
+        return a[1]
+
+    def __setitem__ (self, obj, val):
+        if obj in self.d:
+            del self[obj]
+        nobj = self.Node(self.last, (obj, val))
+        if self.first is None:
+            self.first = nobj
+        if self.last:
+            self.last.next = nobj
+        self.last = nobj
+        self.d[obj] = nobj
+        if len(self.d) > self.count:
+            if self.first == self.last:
+                self.first = None
+                self.last = None
+                return
+            a = self.first
+            a.next.prev = None
+            self.first = a.next
+            a.next = None
+            del self.d[a.me[0]]
+            del a
+
+    def __delitem__ (self, obj):
+        nobj = self.d[obj]
+        if nobj.prev:
+            nobj.prev.next = nobj.next
+        else:
+            self.first = nobj.next
+        if nobj.next:
+            nobj.next.prev = nobj.prev
+        else:
+            self.last = nobj.prev
+        del self.d[obj]
+
+    def __iter__ (self):
+        cur = self.first
+        while cur != None:
+            cur2 = cur.next
+            yield cur.me[1]
+            cur = cur2
+
+    def iteritems (self):
+        cur = self.first
+        while cur != None:
+            cur2 = cur.next
+            yield cur.me
+            cur = cur2
+
+    def iterkeys (self):
+        return iter(self.d)
+
+    def itervalues (self):
+        for i,j in self.iteritems():
+            yield j
+
+    def keys (self):
+        return self.d.keys()
+
+    def setdefault (self, key, failobj=None):
+        if not self.has_key(key):
+            self[key] = failobj
+        return self[key]
+
+
+def _main ():
+    """small demonstration of the LRU class: fill a queue of four entries
+    with more items than fit and print what is left"""
+    a = LRU(4)
+    a['1'] = '1'
+    a['2'] = '2'
+    a['3'] = '3'
+    a['4'] = '4'
+    # the fifth entry exceeds the limit of four
+    a['5'] = '5'
+    for i in a.iteritems():
+        print i,
+    print
+    # reading an entry counts as a use of it
+    print a['2']
+    a['6'] = '6'
+    for i in a.iteritems():
+        print i,
+    print
+    print a.has_key('1')
+    print a.has_key('2')
+
+
--- a/bk/i18n.py
+++ b/bk/i18n.py
@ -0,0 +1,109 @@
+# -*- coding: iso-8859-1 -*-
+"""application internationalization support"""
+# Copyright (C) 2000-2004  Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+# i18n suppport
+import os
+import locale
+import gettext
+
+# default gettext function
+_ = lambda s: s
+
+# more supported languages are added in init_gettext
+supported_languages = ['en']
+default_language = None
+
+def init (domain, directory):
+    """initialize this gettext i18n module"""
+    global _, default_language
+    try:
+        _ = gettext.translation(domain, directory).gettext
+    except IOError:
+        # keep default gettext function
+        pass
+    # get supported languages
+    for lang in os.listdir(directory):
+        path = os.path.join(directory, lang)
+        if not os.path.isdir(path):
+            continue
+        if os.path.exists(os.path.join(path, 'LC_MESSAGES', '%s.mo'%domain)):
+            supported_languages.append(lang)
+    loc = get_locale()
+    if loc in supported_languages:
+        default_language = loc
+    else:
+        default_language = "en"
+
+
+def get_lang (lang):
+    """return lang if it is supported, or the default language"""
+    if lang in supported_languages:
+        return lang
+    return default_language
+
+
+def get_headers_lang (headers):
+    """return preferred supported language in given HTTP headers"""
+    if not headers.has_key('Accept-Language'):
+        return default_language
+    languages = headers['Accept-Language'].split(",")
+    # XXX sort with quality values
+    languages = [ lang.split(";")[0].strip() for lang in languages ]
+    for lang in languages:
+        if lang in supported_languages:
+            return lang
+    return default_language
+
+
+def get_locale ():
+    """return current configured locale"""
+    loc = locale.getdefaultlocale()[0]
+    if loc is None:
+        loc = 'C'
+    loc = locale.normalize(loc)
+    # split up the locale into its base components
+    pos = loc.find('@')
+    if pos >= 0:
+        loc = loc[:pos]
+    pos = loc.find('.')
+    if pos >= 0:
+        loc = loc[:pos]
+    pos = loc.find('_')
+    if pos >= 0:
+        loc = loc[:pos]
+    return loc
+
+
+# full name of each language, written in that language itself
+lang_names = {
+    'en': u'English',
+    'de': u'Deutsch',
+}
+# lang_transis[lang][curlang] is what lang_trans(lang, curlang) returns:
+# a translated full language name
+lang_transis = {
+    'de': {'en': u'German'},
+    'en': {'de': u'Englisch'},
+}
+
+def lang_name (lang):
+    """return full name of given language"""
+    return lang_names[lang]
+
+
+def lang_trans (lang, curlang):
+    """return translated full name of given language"""
+    return lang_transis[lang][curlang]
+
--- a/bk/log.py
+++ b/bk/log.py
@ -0,0 +1,114 @@
+# -*- coding: iso-8859-1 -*-
+"""logging and debug functions"""
+# Copyright (C) 2003-2004  Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+# public api
+__all__ = ["debug", "info", "warn", "error", "critical",
+           "exception", "get_log_file", "set_format", "usedmemory"]
+
+import os
+import logging
+
+
+def iswritable (fname):
+    """return True if given file is writable"""
+    if os.path.isdir(fname) or os.path.islink(fname):
+        return False
+    try:
+        if os.path.exists(fname):
+            file(fname, 'a').close()
+            return True
+        else:
+            file(fname, 'w').close()
+            os.remove(fname)
+            return True
+    except IOError:
+        pass
+    return False
+
+
+def get_log_file (name, logname, trydirs=[]):
+    """get full path name to writeable logfile"""
+    dirs = []
+    if os.name =='nt':
+        dirs.append(os.environ.get("TEMP"))
+    else:
+        dirs.append(os.path.join('/', 'var', 'log', name))
+        dirs.append(os.path.join('/', 'var', 'tmp', name))
+        dirs.append(os.path.join('/', 'tmp', name))
+    dirs.append(os.getcwd())
+    trydirs = trydirs+dirs
+    for d in trydirs:
+        fullname = os.path.join(d, logname)
+        if iswritable(fullname):
+            return fullname
+    raise IOError("Could not find writable directory for %s in %s" % (logname, str(trydirs)))
+
+
+def set_format (handler):
+    """set standard format for handler"""
+    handler.setFormatter(logging.root.handlers[0].formatter)
+    return handler
+
+
+def usedmemory ():
+    """return string with used memory"""
+    pid = os.getpid()
+    fp = file('/proc/%d/status'%pid)
+    val = 0
+    try:
+        for line in fp.readlines():
+            if line.startswith('VmRSS:'):
+                val = int(line[6:].strip().split()[0])
+    finally:
+        fp.close()
+    return val
+
+
+import gc
+gc.enable()
+# memory leak debugging
+#gc.set_debug(gc.DEBUG_LEAK)
+def debug (log, msg, *args):
+    """log a debug message"""
+    logging.getLogger(log).debug(msg, *args)
+    #logging.getLogger(log).info("Mem: %d kB"%usedmemory())
+
+
+def info (log, msg, *args):
+    """log an informational message"""
+    logging.getLogger(log).info(msg, *args)
+
+
+def warn (log, msg, *args):
+    """log a warning"""
+    logging.getLogger(log).warn(msg, *args)
+
+
+def error (log, msg, *args):
+    """Send an error message to the logger named log."""
+    logger = logging.getLogger(log)
+    logger.error(msg, *args)
+
+
+def critical (log, msg, *args):
+    """log a critical error"""
+    logging.getLogger(log).critical(msg, *args)
+
+
+def exception (log, msg, *args):
+    """log an exception"""
+    logging.getLogger(log).exception(msg, *args)
--- a/bk/mem.py
+++ b/bk/mem.py
@ -0,0 +1,55 @@
+# -*- coding: iso-8859-1 -*-
+""" Copied from the Python Cookbook recipe at
+    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/286222
+
+    To find the memory usage in a particular section of code these
+    functions are typically used as follows:
+
+    m0 = memory()
+    ...
+    m1 = memory(m0)
+"""
+
+import os
+
+# status pseudo file of this process; the pid is taken once, when the
+# module is imported
+_proc_status = '/proc/%d/status' % os.getpid()
+
+# factors to convert a value with the given unit suffix into bytes
+_scale = {'kB': 1024.0, 'mB': 1024.0*1024.0,
+          'KB': 1024.0, 'MB': 1024.0*1024.0}
+
+def _VmB (VmKey):
+    '''Parse /proc/<pid>/status file for given key.'''
+    if os.name != 'posix':
+        # not supported
+        return 0.0
+    global _proc_status, _scale
+    # get pseudo file /proc/<pid>/status
+    try:
+        t = open(_proc_status)
+        v = t.read()
+        t.close()
+    except IOError:
+        # unsupported platform (non-Linux?)
+        return 0.0
+    # get VmKey line e.g. 'VmRSS:  9999  kB\n ...'
+    i = v.index(VmKey)
+    v = v[i:].split(None, 3)  # whitespace
+    if len(v) < 3:
+        return 0.0  # invalid format?
+    # convert Vm value to bytes
+    return float(v[1]) * _scale[v[2]]
+
+
+def memory (since=0.0):
+    '''Return memory usage in bytes.'''
+    return _VmB('VmSize:') - since
+
+
+def resident (since=0.0):
+    '''Return resident memory usage in bytes.'''
+    return _VmB('VmRSS:') - since
+
+
+def stacksize (since=0.0):
+    '''Return stack size in bytes.'''
+    return _VmB('VmStk:') - since
--- a/bk/strtime.py
+++ b/bk/strtime.py
@ -0,0 +1,48 @@
+# -*- coding: iso-8859-1 -*-
+"""time to string conversion utility functions"""
+# Copyright (C) 2004  Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+import time
+import bk.i18n
+
+
+def strtime (t):
+    """return ISO 8601 formatted time"""
+    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t)) + \
+           strtimezone()
+
+
+def strduration (duration):
+    """return string formatted time duration"""
+    name = bk.i18n._("seconds")
+    if duration > 60:
+        duration = duration / 60
+        name = bk.i18n._("minutes")
+    if duration > 60:
+        duration = duration / 60
+        name = bk.i18n._("hours")
+    return " %.3f %s"%(duration, name)
+
+
+def strtimezone ():
+    """return timezone info, %z on some platforms, but not supported on all"""
+    if time.daylight:
+        zone = time.altzone
+    else:
+        zone = time.timezone
+    return "%+04d" % int(-zone/3600)
+
--- a/bk/tests/init.py
+++ b/bk/tests/init.py
@ -0,0 +1 @@
+# -*- coding: iso-8859-1 -*-
--- a/bk/tests/test_containers.py
+++ b/bk/tests/test_containers.py
@ -0,0 +1,87 @@
+# -*- coding: iso-8859-1 -*-
+"""test container routines"""
+
+import unittest
+import random
+import bk.containers
+
+
+class TestListDict (unittest.TestCase):
+    """tests for bk.containers.ListDict"""
+
+    def setUp (self):
+        """start every test with an empty ListDict"""
+        self.d = bk.containers.ListDict()
+
+    def test_insert (self):
+        """inserted keys are found in the dictionary"""
+        self.assert_(not self.d)
+        self.d[2] = 1
+        self.d[1] = 2
+        self.assert_(2 in self.d)
+        self.assert_(1 in self.d)
+
+    def test_delete (self):
+        """a deleted key is gone, other keys stay"""
+        self.assert_(not self.d)
+        self.d[2] = 1
+        self.d[1] = 2
+        del self.d[1]
+        self.assert_(2 in self.d)
+        self.assert_(1 not in self.d)
+
+    def test_update (self):
+        """assigning to an existing key replaces its value"""
+        self.assert_(not self.d)
+        self.d[2] = 1
+        self.d[1] = 2
+        self.d[1] = 1
+        self.assertEqual(self.d[1], 1)
+
+    def test_sorting (self):
+        """keys() lists the keys in insertion order"""
+        self.assert_(not self.d)
+        toinsert = random.sample(xrange(10000000), 60)
+        for x in toinsert:
+            self.d[x] = x
+        for i, k in enumerate(self.d.keys()):
+            self.assertEqual(self.d[k], toinsert[i])
+
+
+class TestSetList (unittest.TestCase):
+    """tests for bk.containers.SetList"""
+
+    def setUp (self):
+        """start every test with an empty SetList"""
+        self.l = bk.containers.SetList()
+
+    def test_append (self):
+        """appending the same value twice stores it once"""
+        self.assert_(not self.l)
+        self.l.append(1)
+        self.l.append(1)
+        self.assertEqual(len(self.l), 1)
+
+    def test_append2 (self):
+        """a duplicate is rejected even if it is not the last element"""
+        self.assert_(not self.l)
+        self.l.append(1)
+        self.l.append(2)
+        self.l.append(1)
+        self.assertEqual(len(self.l), 2)
+
+    def test_extend (self):
+        """extend() drops duplicates and keeps the order of first use"""
+        self.assert_(not self.l)
+        self.l.extend([1, 2, 1])
+        self.assertEqual(len(self.l), 2)
+        self.assertEqual(self.l[0], 1)
+        self.assertEqual(self.l[1], 2)
+
+    def test_setitem (self):
+        """assigning a value that exists elsewhere removes the duplicate"""
+        self.assert_(not self.l)
+        self.l.extend([1,2,3])
+        self.l[1] = 3
+        self.assertEqual(len(self.l), 2)
+        self.assertEqual(self.l[0], 1)
+        self.assertEqual(self.l[1], 3)
+
+
+def test_suite ():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestListDict))
+    suite.addTest(unittest.makeSuite(TestSetList))
+    return suite
+
+if __name__ == '__main__':
+    unittest.main()
+
--- a/bk/tests/test_parser.py
+++ b/bk/tests/test_parser.py
@ -0,0 +1,170 @@
+# -*- coding: iso-8859-1 -*-
+
+import bk.HtmlParser
+import bk.HtmlParser.htmlsax
+import bk.HtmlParser.htmllib
+import cStringIO as StringIO
+import unittest
+
+
+# list of tuples (<test pattern>, <expected parse output>); the expected
+# output is what HtmlPrettyPrinter has written after the pattern was
+# fed to the parser and the parser was flushed
+parsetests = [
+    # start tags
+    ("""<a  b="c" >""", """<a b="c">"""),
+    ("""<a  b='c' >""", """<a b="c">"""),
+    ("""<a  b=c" >""", """<a b="c">"""),
+    ("""<a  b=c' >""", """<a b="c'">"""),
+    ("""<a  b="c >""", """<a  b="c >"""),
+    ("""<a  b="" >""", """<a b="">"""),
+    ("""<a  b='' >""", """<a b="">"""),
+    ("""<a  b=>""", """<a b="">"""),
+    ("""<a  b= >""", """<a b="">"""),
+    ("""<a  =c>""", """<a c>"""),
+    ("""<a  =c >""", """<a c>"""),
+    ("""<a  =>""", """<a>"""),
+    ("""<a  = >""", """<a>"""),
+    ("""<a  b= "c" >""", """<a b="c">"""),
+    ("""<a  b ="c" >""", """<a b="c">"""),
+    ("""<a  b = "c" >""", """<a b="c">"""),
+    ("""<a >""", """<a>"""),
+    ("""< a>""", """<a>"""),
+    ("""< a >""", """<a>"""),
+    ("""<>""", """<>"""),
+    ("""< >""", """< >"""),
+    # reduce test
+    ("""<a  b="c"><""", """<a b="c"><"""),
+    ("""d>""", """d>"""),
+    # numbers in tag
+    ("""<h1>bla</h1>""", """<h1>bla</h1>"""),
+    # more start tags
+    ("""<a  b=c"><a b="c">""", """<a b="c"><a b="c">"""),
+    ("""<a  b="c><a b="c">""", """<a b="c><a b=" c>"""),
+    ("""<a  b=/c/></a><br>""", """<a b="/c/"></a><br>"""),
+    ("""<br/>""", """<br>"""),
+    ("""<a  b="50%"><br>""", """<a b="50%"><br>"""),
+    # comments
+    ("""<!---->""", """<!---->"""),
+    ("""<!-- a - b -->< br>""", """<!-- a - b --><br>"""),
+    ("""<!----->""", """<!----->"""),
+    ("""<!------>""", """<!------>"""),
+    ("""<!------->""", """<!------->"""),
+    ("""<!---- >""", """<!----->"""),
+    ("""<!-- -->""", """<!-- -->"""),
+    ("""<!-- -- >""", """<!-- --->"""),
+    ("""<!---- />-->""", """<!---- />-->"""),
+    # end tags
+    ("""</a>""", """</a>"""),
+    ("""</ a>""", """</a>"""),
+    ("""</ a >""", """</a>"""),
+    ("""</a >""", """</a>"""),
+    ("""< / a>""", """</a>"""),
+    ("""< /a>""", """</a>"""),
+    # missing > in end tag
+    ("""</td <td  a="b" >""", """</td><td a="b">"""),
+    # start and end tag
+    ("""<a/>""", """<a></a>"""),
+    # declaration tags
+    ("""<!DOCtype adrbook SYSTEM "adrbook.dtd">""", """<!DOCTYPE adrbook SYSTEM "adrbook.dtd">"""),
+    # misc
+    ("""<?xmL version="1.0" encoding="latin1"?>""", """<?xmL version="1.0" encoding="latin1"?>"""),
+    # javascript
+    ("""<script >\n</script>""", """<script>\n</script>"""),
+    ("""<sCrIpt lang="a">bla </a> fasel</scripT>""", """<script lang="a">bla </a> fasel</script>"""),
+    # line continuation (Dr. Fun webpage)
+    ("<img bo\\\nrder=0 >", """<img bo rder="0">"""),
+    # href with $
+    ("""<a href="123$456">""", """<a href="123$456">"""),
+    # quoting
+    ("""<a  href=/ >""", """<a href="/">"""),
+    ("""<a  href= />""", """<a href="/">"""),
+    ("""<a  href= >""", """<a href="">"""),
+    ("""<a  href="'" >""", """<a href="'">"""),
+    ("""<a  href='"' >""", """<a href="&quot;">"""),
+    ("""<a  href="bla" %]" >""", """<a href="bla">"""),
+    ("""<a  href=bla" >""", """<a href="bla">"""),
+    ("""<a onmouseover=MM_swapImage('nav1','','/images/dwnavpoint_over.gif',1);movein(this); b="c">""",
+     """<a onmouseover="MM_swapImage('nav1','','/images/dwnavpoint_over.gif',1);movein(this);" b="c">"""),
+    ("""<a onClick=location.href('/index.htm') b="c">""",
+     """<a onclick="location.href('/index.htm')" b="c">"""),
+    # entities
+    ("""<a  href="&#109;ailto:" >""", """<a href="mailto:">"""),
+    # non-ascii characters
+    ("""<Üzgür> fahr </langsamer> ¹²³¼½¬{""",
+     """<Üzgür> fahr </langsamer> ¹²³¼½¬{"""),
+]
+
+# list of tuples (<incomplete input>, <expected output after flush()>);
+# every input lacks its closing ">" and is expected to be written out
+# unchanged once the parser is flushed
+flushtests = [
+    ("<", "<"),
+    ("<a", "<a"),
+    ("<!a", "<!a"),
+    ("<?a", "<?a"),
+]
+
+
+class TestParser (unittest.TestCase):
+
+    def setUp (self):
+        # list of tuples (<test pattern>, <expected parse output>)
+        self.htmlparser = bk.HtmlParser.htmlsax.parser()
+        self.htmlparser2 = bk.HtmlParser.htmlsax.parser()
+
+    def test_parse (self):
+        for _in, _out in parsetests:
+            out = StringIO.StringIO()
+            self.htmlparser.handler = bk.HtmlParser.htmllib.HtmlPrettyPrinter(out)
+            self.htmlparser.feed(_in)
+            self.htmlparser.flush()
+            res = out.getvalue()
+            self.assertEqual(res, _out)
+            self.htmlparser.reset()
+
+    def test_feed (self):
+        for _in, _out in parsetests:
+            out = StringIO.StringIO()
+            self.htmlparser.handler = bk.HtmlParser.htmllib.HtmlPrettyPrinter(out)
+            for c in _in:
+                self.htmlparser.feed(c)
+            self.htmlparser.flush()
+            res = out.getvalue()
+            self.assertEqual(res, _out)
+            self.htmlparser.reset()
+
+    def test_interwoven (self):
+        for _in, _out in parsetests:
+            out = StringIO.StringIO()
+            out2 = StringIO.StringIO()
+            self.htmlparser.handler = bk.HtmlParser.htmllib.HtmlPrettyPrinter(out)
+            self.htmlparser2.handler = bk.HtmlParser.htmllib.HtmlPrettyPrinter(out2)
+            for c in _in:
+                self.htmlparser.feed(c)
+                self.htmlparser2.feed(c)
+            self.htmlparser.flush()
+            self.htmlparser2.flush()
+            res = out.getvalue()
+            res2 = out2.getvalue()
+            self.assertEqual(res, _out)
+            self.assertEqual(res2, _out)
+            self.htmlparser.reset()
+
+    def test_flush (self):
+        for _in, _out in flushtests:
+            out = StringIO.StringIO()
+            self.htmlparser.handler = bk.HtmlParser.htmllib.HtmlPrettyPrinter(out)
+            self.htmlparser.feed(_in)
+            self.htmlparser.flush()
+            res = out.getvalue()
+            self.assertEqual(res, _out)
+            self.htmlparser.reset()
+
+    def test_entities (self):
+        for c in "abcdefghijklmnopqrstuvwxyz":
+            self.assertEqual(bk.HtmlParser.resolve_entities("&#%d;"%ord(c)), c)
+
+
+def test_suite ():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(TestParser))
+    return suite
+
+# run the tests of this module when it is executed as a script
+if __name__ == '__main__':
+    unittest.main()
+