git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1358 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2004-07-19 08:45:36 +00:00
parent 916f96cc0d
commit c635234ee6
23 changed files with 17346 additions and 0 deletions

1
bk/HtmlParser/.cvsignore Normal file
View file

@ -0,0 +1 @@
htmlparse.output

24
bk/HtmlParser/Makefile Normal file
View file

@ -0,0 +1,24 @@
# this parser needs flex >= 2.5.xx from http://lex.sf.net/
# for reentrant bison parser support!
FLEX=flex
# Python version whose include directory is passed to the compiler and splint
PYVER=2.3
PYTHON=python$(PYVER)
# default target: (re)generate the scanner and parser C sources
all: htmllex.c htmlparse.c
# compile any C source into a position-independent object file
%.o: %.c
gcc -g -O3 -Wall -pedantic -Wstrict-prototypes -fPIC -I. -I/usr/include/$(PYTHON) -c $< -o $@
# bison produces the parser source together with its token header
htmlparse.h htmlparse.c: htmlparse.y htmlsax.h
bison htmlparse.y
# the scanner source is declared to depend on the generated token header
htmllex.l: htmlparse.h
htmllex.c: htmllex.l htmlsax.h
$(FLEX) htmllex.l
# remove generated sources, objects, byte code and bison reports
clean:
rm -f htmlparse.c htmlparse.h htmllex.c *.o *.so *.pyc *.pyo *.output
# static analysis of the generated scanner, paged through less
splint:
splint -initallelements +posixlib -I/usr/include/linux -I. -I/usr/include/$(PYTHON) htmllex.c | less

115
bk/HtmlParser/__init__.py Normal file
View file

@ -0,0 +1,115 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2004 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""Fast HTML parser module written in C with the following features:
1. Reentrant
As soon as any HTML string data is available, we try to feed it
to the HTML parser. This means that the parser has to scan possible
incomplete data, recognizing as much as it can. Incomplete trailing
data is saved for subsequent calls, or it is just flushed into the
output buffer with the flush() function.
A reset() brings the parser back to its initial state, throwing away all
buffered data.
2. Coping with HTML syntax errors
The parser recognizes as much as it can and passes the rest
of the data as TEXT tokens.
The scanner only passes complete recognized HTML syntax elements to
the parser. Invalid syntax elements are passed as TEXT. This way we do
not need the bison error recovery.
Incomplete data is rescanned the next time the parser calls yylex() or
when it is being flush()ed.
The following syntax errors will be recognized correctly:
a) missing quotes around attribute values
b) "</...>" end tags in script modus
c) missing ">" in tags
d) invalid tag names
e) invalid characters inside tags or tag attributes
Additionally the parser has the following features:
a) NULL bytes are changed into spaces
b) <!-- ... --> inside a <script> or <style> are not treated as
comments but as DATA
3. Speed
The FLEX code has options to generate a large but fast scanner.
The parser ignores forbidden or unnecessary HTML end tags.
The parser converts tag and attribute names to lower case for easier
matching.
The parser quotes all attribute values.
Python memory management interface is used.
"""
import re
import htmlentitydefs
def _resolve_ascii_entity (mo):
    """Resolve one numeric character reference if it is 7-bit ASCII.

    Helper for resolve_ascii_entities(). ``mo`` is a match object whose
    "num" group holds the digits of the reference, without the ``&#`` or
    ``&#x`` prefix. Returns the referenced character when its code point
    lies in 0..127, otherwise the matched text unchanged.
    """
    ent = mo.group()
    # The pattern is matched case-insensitively, so "&#X41;" is hexadecimal
    # just like "&#x41;".
    if ent[:3].lower() == '&#x':
        radix = 16
    else:
        radix = 10
    try:
        num = int(mo.group("num"), radix)
    except ValueError:
        # digits that do not fit the radix: leave the text untouched
        return ent
    # check 7-bit ASCII char range
    if 0 <= num <= 127:
        return chr(num)
    # not in range
    return ent


def resolve_ascii_entities (s):
    """Resolve numeric references in 7-bit ASCII range to eliminate
    obfuscation.

    Both decimal (``&#65;``) and hexadecimal (``&#x41;``, ``&#X4a;``)
    references are handled; references outside 0..127 and malformed ones
    such as ``&#4a;`` are returned unchanged.
    """
    # After "&#" either an "x" followed by hex digits, or decimal digits
    # only. The lookaheads keep the "num" group free of the prefix.
    pattern = r'(?i)&#(?:x(?=[0-9a-f]+;)|(?=[0-9]+;))(?P<num>[0-9a-f]+);'
    return re.sub(pattern, _resolve_ascii_entity, s)
def _resolve_html_entity (mo):
    """Resolve one named HTML entity; helper for resolve_html_entities().

    ``mo`` is a match object with an "entity" group holding the entity
    name. Names missing from htmlentitydefs.entitydefs are returned as
    the unchanged matched text.
    """
    return htmlentitydefs.entitydefs.get(mo.group("entity"), mo.group())


def resolve_html_entities (s):
    """Resolve named HTML entities in s and return the result.

    Entity names may contain digits after the first letter (for example
    ``&frac12;`` or ``&sup2;``).
    """
    return re.sub(r'(?i)&(?P<entity>[a-z][a-z0-9]*);', _resolve_html_entity, s)
def resolve_entities (s):
    """Resolve named HTML entities in s first, then the numeric 7-bit
    ASCII references in the intermediate result, and return it."""
    named_resolved = resolve_html_entities(s)
    return resolve_ascii_entities(named_resolved)
def strip_quotes (s):
    """Return s without its enclosing pair of single or double quotes.

    The string is returned unchanged when it is shorter than two
    characters or is not wrapped in a matching pair of quotes.
    """
    if len(s) < 2:
        return s
    for quote in ("'", '"'):
        if s.startswith(quote) and s.endswith(quote):
            return s[1:-1]
    return s

12076
bk/HtmlParser/htmllex.c Normal file

File diff suppressed because it is too large Load diff

1033
bk/HtmlParser/htmllex.l Normal file

File diff suppressed because it is too large Load diff

100
bk/HtmlParser/htmllib.py Normal file
View file

@ -0,0 +1,100 @@
# -*- coding: iso-8859-1 -*-
"""Default handler classes"""
# Copyright (C) 2000-2004 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
import sys
class HtmlPrinter (object):
    """Debugging handler that prints the name and the arguments of every
    callback invoked on it.

    Any attribute that is not defined on the class resolves to a callable
    that writes "<name> <args tuple>" followed by a newline to the
    configured file object.
    """

    def __init__ (self, fd=sys.stdout):
        """Write callback traces to the given file object."""
        self.fd = fd

    def _emit (self, name, attrs):
        """Write one trace line for callback name with the tuple attrs."""
        self.fd.write("%s %s\n" % (name, attrs))

    def _print (self, *attrs):
        """Print the arguments under the most recently looked-up name.

        Kept for backward compatibility; callbacks returned by
        __getattr__ no longer depend on this shared state.
        """
        self._emit(self.mem, attrs)

    def _errorfun (self, msg, name):
        """Print msg to stderr with name prefix."""
        sys.stderr.write("%s %s\n" % (name, msg))

    def error (self, msg):
        """Signal a filter/parser error."""
        self._errorfun(msg, "error:")

    def warning (self, msg):
        """Signal a filter/parser warning."""
        self._errorfun(msg, "warning:")

    def fatalError (self, msg):
        """Signal a fatal filter/parser error."""
        self._errorfun(msg, "fatal error:")

    def __getattr__ (self, name):
        """Return a callback that prints name together with its arguments.

        The name is bound into the returned function, so a callback that
        is stored and called later still prints its own name even if
        other attributes were looked up in between.
        """
        if name.startswith("__") and name.endswith("__"):
            # do not pretend to implement special protocols such as
            # __deepcopy__ or __getstate__
            raise AttributeError(name)
        # remembered for _print(), which older callers may still use
        self.mem = name
        def callback (*attrs):
            self._emit(name, attrs)
        return callback
class HtmlPrettyPrinter (object):
    """Handler that writes every parsed HTML construct back out as markup."""

    def __init__ (self, fd=sys.stdout):
        """Remember the file object that receives the output."""
        self.fd = fd

    def comment (self, data):
        """Write data as an HTML comment."""
        markup = "<!--%s-->" % data
        self.fd.write(markup)

    def startElement (self, tag, attrs):
        """Write a start tag with its attributes.

        Slashes are removed from the tag name. Attributes whose value is
        None are written as a bare name, all others as name="value" with
        the value passed through quote_attrval().
        """
        write = self.fd.write
        write("<%s" % tag.replace("/", ""))
        for name, value in attrs.iteritems():
            if value is not None:
                write(" %s=\"%s\"" % (name, quote_attrval(value)))
            else:
                write(" %s" % name)
        write(">")

    def endElement (self, tag):
        """Write an end tag."""
        markup = "</%s>" % tag
        self.fd.write(markup)

    def doctype (self, data):
        """Write a document type declaration."""
        markup = "<!DOCTYPE%s>" % data
        self.fd.write(markup)

    def pi (self, data):
        """Write a processing instruction."""
        markup = "<?%s?>" % data
        self.fd.write(markup)

    def cdata (self, data):
        """Write a CDATA section."""
        markup = "<![CDATA[%s]]>" % data
        self.fd.write(markup)

    def characters (self, data):
        """Write character data unchanged."""
        self.fd.write(data)
def quote_attrval (val):
    """Escape every double quote in val as &quot; so that the value can
    be wrapped in double quotes inside an HTML tag."""
    pieces = val.split('"')
    return '&quot;'.join(pieces)

2045
bk/HtmlParser/htmlparse.c Normal file

File diff suppressed because it is too large Load diff

72
bk/HtmlParser/htmlparse.h Normal file
View file

@ -0,0 +1,72 @@
/* A Bison parser, made by GNU Bison 1.875a. */
/* Skeleton parser for Yacc-like parsing with Bison,
Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* As a special exception, when this file is copied by Bison into a
Bison output file, you may use that output file without restriction.
This special exception was added by the Free Software Foundation
in version 1.24 of Bison. */
/* Tokens. */
/* Token numbers of the scanner/parser interface. This header is Bison
   output (see the banner at the top of the file); the names correspond
   to the %token declarations in htmlparse.y, so change the grammar and
   regenerate instead of editing the values here. */
#ifndef YYTOKENTYPE
# define YYTOKENTYPE
/* Put the tokens into the symbol table, so that GDB and other debuggers
know about them. */
enum yytokentype {
T_WAIT = 258,
T_ERROR = 259,
T_TEXT = 260,
T_ELEMENT_START = 261,
T_ELEMENT_START_END = 262,
T_ELEMENT_END = 263,
T_SCRIPT = 264,
T_STYLE = 265,
T_PI = 266,
T_COMMENT = 267,
T_CDATA = 268,
T_DOCTYPE = 269
};
#endif
/* The same token numbers again as preprocessor macros. */
#define T_WAIT 258
#define T_ERROR 259
#define T_TEXT 260
#define T_ELEMENT_START 261
#define T_ELEMENT_START_END 262
#define T_ELEMENT_END 263
#define T_SCRIPT 264
#define T_STYLE 265
#define T_PI 266
#define T_COMMENT 267
#define T_CDATA 268
#define T_DOCTYPE 269
/* Fallback semantic value type, used only when the including file has
   not defined YYSTYPE itself (htmlparse.y defines it as PyObject*). */
#if ! defined (YYSTYPE) && ! defined (YYSTYPE_IS_DECLARED)
typedef int YYSTYPE;
# define yystype YYSTYPE /* obsolescent; will be withdrawn */
# define YYSTYPE_IS_DECLARED 1
# define YYSTYPE_IS_TRIVIAL 1
#endif

840
bk/HtmlParser/htmlparse.y Normal file
View file

@ -0,0 +1,840 @@
%{
/* Copyright (C) 2000-2004 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/* Python module definition of a SAX html parser */
#include "htmlsax.h"
#include "structmember.h"
#include <string.h>
#include <stdio.h>
/* bison type definitions */
#define YYSTYPE PyObject*
#define YYPARSE_PARAM scanner
#define YYLEX_PARAM scanner
/* extern functions found in htmllex.l */
extern int yylex(YYSTYPE* yylvalp, void* scanner);
extern int htmllexInit (void** scanner, UserData* data);
extern int htmllexDebug (void** scanner, int debug);
extern int htmllexStart (void* scanner, UserData* data, const char* s, int slen);
extern int htmllexStop (void* scanner, UserData* data);
extern int htmllexDestroy (void* scanner);
extern void* yyget_extra(void*);
extern int yyget_lineno(void*);
#define YYERROR_VERBOSE 1
/* Bison error hook: report the parser's message on stderr. Reaching this
   function indicates an internal error; the return value is always 0. */
static int yyerror (char* msg) {
    (void) fprintf(stderr, "htmlsax: internal parse error: %s\n", msg);
    return 0;
}
/* parser.resolve_entities */
static PyObject* resolve_entities;
static PyObject* list_dict;
/* macros for easier scanner state manipulation */
/* Evaluates to true when tag is NOT one of the elements listed below
   (area, base, basefont, br, col, frame, hr, img, input, isindex, link,
   meta, param). Despite its name, a true result is the case in which the
   grammar actions DO issue the endElement callback; for the listed
   elements the end tag is suppressed.
   tag must be a NUL-terminated C string and is evaluated several times. */
#define NO_HTML_END_TAG(tag) !(strcmp(tag, "area")==0 || \
strcmp(tag, "base")==0 || \
strcmp(tag, "basefont")==0 || \
strcmp(tag, "br")==0 || \
strcmp(tag, "col")==0 || \
strcmp(tag, "frame")==0 || \
strcmp(tag, "hr")==0 || \
strcmp(tag, "img")==0 || \
strcmp(tag, "input")==0 || \
strcmp(tag, "isindex")==0 || \
strcmp(tag, "link")==0 || \
strcmp(tag, "meta")==0 || \
strcmp(tag, "param")==0)
/* clear buffer b, returning NULL on error
   Resizes b to a single byte and stores an empty string in it.
   Caveats:
   - expands to several statements, so it must not be used as the body
     of an if/else without braces;
   - the result of PyMem_Resize is assigned to b directly, so on failure
     b becomes NULL;
   - the enclosing function returns NULL without any Python exception
     being set by this macro. */
#define CLEAR_BUF(b) \
b = PyMem_Resize(b, char, 1); \
if (b==NULL) return NULL; \
(b)[0] = '\0'
/* clear buffer b, returning NULL and decref self on error
   Same as CLEAR_BUF (including its caveats), but additionally releases
   one reference to self before returning. */
#define CLEAR_BUF_DECREF(self, b) \
b = PyMem_Resize(b, char, 1); \
if (b==NULL) { Py_DECREF(self); return NULL; } \
(b)[0] = '\0'
/* If ud->error is set and the handler has an "error" attribute, call
   handler.error(ud->error). Requires the local variables callback,
   result and error in the enclosing scope; on failure sets error=1 and
   jumps to label. On success callback and result are left set, so the
   code at label is expected to release them (the grammar actions do this
   with Py_XDECREF directly after the macro). */
#define CHECK_ERROR(ud, label) \
if (ud->error && PyObject_HasAttrString(ud->handler, "error")==1) { \
callback = PyObject_GetAttrString(ud->handler, "error"); \
if (!callback) { error=1; goto label; } \
result = PyObject_CallFunction(callback, "O", ud->error); \
if (!result) { error=1; goto label; } \
}
/* generic callback macro
   Calls handler.<attr>(arg) with the given PyObject_CallFunction format
   if the handler defines attr. Requires the local variables callback,
   result and error; on failure sets error=1 and jumps to label with
   callback/result still holding whatever was obtained so far. On success
   both are released and reset to NULL. */
#define CALLBACK(ud, attr, format, arg, label) \
if (PyObject_HasAttrString(ud->handler, attr)==1) { \
callback = PyObject_GetAttrString(ud->handler, attr); \
if (callback==NULL) { error=1; goto label; } \
result = PyObject_CallFunction(callback, format, arg); \
if (result==NULL) { error=1; goto label; } \
Py_DECREF(callback); \
Py_DECREF(result); \
callback=result=NULL; \
}
/* set old line and column
   Copies the current line/column counters of the local variable ud into
   its last_lineno/last_column fields. */
#define SET_OLD_LINECOL \
ud->last_lineno = ud->lineno; \
ud->last_column = ud->column
/* parser type definition */
typedef struct {
PyObject_HEAD
/* object whose methods receive the parser callbacks; Py_None until a
   handler is set */
PyObject* handler;
/* state handed to the htmllex* scanner functions and to the grammar
   actions; owned by the parser object */
UserData* userData;
/* opaque scanner handle filled in by htmllexInit() */
void* scanner;
} parser_object;
staticforward PyTypeObject parser_type;
/* use Pythons memory management */
#define malloc PyMem_Malloc
#define realloc PyMem_Realloc
#define free PyMem_Free
%}
/* parser options */
%verbose
%debug
%defines
%output="htmlparse.c"
%pure_parser
/* parser tokens */
%token T_WAIT
%token T_ERROR
%token T_TEXT
%token T_ELEMENT_START
%token T_ELEMENT_START_END
%token T_ELEMENT_END
%token T_SCRIPT
%token T_STYLE
%token T_PI
%token T_COMMENT
%token T_CDATA
%token T_DOCTYPE
/* the finish_ labels are for error recovery */
%%
/* start symbol: a non-empty sequence of elements; all work is done in
   the actions of the individual element alternatives */
elements: element {}
| elements element {}
;
/* T_WAIT ends the current yyparse() run successfully (YYACCEPT);
   T_ERROR aborts it after saving the pending Python exception. */
element: T_WAIT { YYACCEPT; /* wait for more lexer input */ }
| T_ERROR
{
/* an error occured in the scanner, the python exception must be set */
UserData* ud = yyget_extra(scanner);
/* stash the exception in the user data; parser_feed() restores it
   after yyparse() has returned */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
| T_ELEMENT_START
{
/* Start tag: call handler.startElement(tag, attrs) if the handler
   defines it, then report a pending scanner error via CHECK_ERROR. */
/* $1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyString, <attrs> is a PyDict */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
/* NOTE(review): PyTuple_GET_ITEM returns borrowed references, yet tag
   and attrs are DECREF'd below in addition to $1. This is only balanced
   if the scanner hands over extra references to both objects - confirm
   against htmllex.l. */
PyObject* tag = PyTuple_GET_ITEM($1, 0);
PyObject* attrs = PyTuple_GET_ITEM($1, 1);
int error = 0;
if (!tag || !attrs) { error = 1; goto finish_start; }
if (PyObject_HasAttrString(ud->handler, "startElement")==1) {
callback = PyObject_GetAttrString(ud->handler, "startElement");
if (!callback) { error=1; goto finish_start; }
result = PyObject_CallFunction(callback, "OO", tag, attrs);
if (!result) { error=1; goto finish_start; }
Py_DECREF(callback);
Py_DECREF(result);
callback=result=NULL;
}
CHECK_ERROR(ud, finish_start);
/* common exit: release everything, then abort the parse on error */
finish_start:
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(result);
Py_XDECREF(tag);
Py_XDECREF(attrs);
Py_DECREF($1);
if (error) {
/* keep the exception for parser_feed() */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
SET_OLD_LINECOL;
}
| T_ELEMENT_START_END
{
/* Combined start/end tag: call handler.startElement(tag, attrs) and,
   unless the tag is one of those listed in NO_HTML_END_TAG,
   handler.endElement(tag); each only if the handler defines it. */
/* $1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyString, <attrs> is a PyDict */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
/* NOTE(review): borrowed references that are DECREF'd below - see the
   scanner (htmllex.l) for the matching reference handling; confirm. */
PyObject* tag = PyTuple_GET_ITEM($1, 0);
PyObject* attrs = PyTuple_GET_ITEM($1, 1);
int error = 0;
char* tagname;
if (!tag || !attrs) { error = 1; goto finish_start_end; }
if (PyObject_HasAttrString(ud->handler, "startElement")==1) {
callback = PyObject_GetAttrString(ud->handler, "startElement");
if (!callback) { error=1; goto finish_start_end; }
result = PyObject_CallFunction(callback, "OO", tag, attrs);
if (!result) { error=1; goto finish_start_end; }
Py_DECREF(callback);
Py_DECREF(result);
callback=result=NULL;
}
tagname = PyString_AS_STRING(tag);
/* NO_HTML_END_TAG is true for tags that are NOT in its list */
if (PyObject_HasAttrString(ud->handler, "endElement")==1 &&
NO_HTML_END_TAG(tagname)) {
callback = PyObject_GetAttrString(ud->handler, "endElement");
if (callback==NULL) { error=1; goto finish_start_end; }
result = PyObject_CallFunction(callback, "O", tag);
if (result==NULL) { error=1; goto finish_start_end; }
Py_DECREF(callback);
Py_DECREF(result);
callback=result=NULL;
}
CHECK_ERROR(ud, finish_start_end);
/* common exit: release everything, then abort the parse on error */
finish_start_end:
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(result);
Py_XDECREF(tag);
Py_XDECREF(attrs);
Py_DECREF($1);
if (error) {
/* keep the exception for parser_feed() */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
SET_OLD_LINECOL;
}
| T_ELEMENT_END
{
/* End tag: call handler.endElement(tag) if the handler defines it and
   the tag is not one of those listed in NO_HTML_END_TAG. */
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
char* tagname = PyString_AS_STRING($1);
if (PyObject_HasAttrString(ud->handler, "endElement")==1 &&
NO_HTML_END_TAG(tagname)) {
callback = PyObject_GetAttrString(ud->handler, "endElement");
if (callback==NULL) { error=1; goto finish_end; }
result = PyObject_CallFunction(callback, "O", $1);
if (result==NULL) { error=1; goto finish_end; }
Py_DECREF(callback);
Py_DECREF(result);
callback=result=NULL;
}
CHECK_ERROR(ud, finish_end);
/* common exit: release everything, then abort the parse on error */
finish_end:
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(result);
Py_DECREF($1);
if (error) {
/* keep the exception for parser_feed() */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
SET_OLD_LINECOL;
}
| T_COMMENT
{
/* Comment: call handler.comment(data) if the handler defines it. */
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
CALLBACK(ud, "comment", "O", $1, finish_comment);
CHECK_ERROR(ud, finish_comment);
/* common exit: release everything, then abort the parse on error */
finish_comment:
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(result);
Py_DECREF($1);
if (error) {
/* keep the exception for parser_feed() */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
SET_OLD_LINECOL;
}
| T_PI
{
/* Processing instruction: call handler.pi(data) with the single string
   argument if the handler defines it. */
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
CALLBACK(ud, "pi", "O", $1, finish_pi);
CHECK_ERROR(ud, finish_pi);
/* common exit: release everything, then abort the parse on error */
finish_pi:
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(result);
Py_DECREF($1);
if (error) {
/* keep the exception for parser_feed() */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
SET_OLD_LINECOL;
}
| T_CDATA
{
/* CDATA section: call handler.cdata(data) if the handler defines it. */
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
CALLBACK(ud, "cdata", "O", $1, finish_cdata);
CHECK_ERROR(ud, finish_cdata);
/* common exit: release everything, then abort the parse on error */
finish_cdata:
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(result);
Py_DECREF($1);
if (error) {
/* keep the exception for parser_feed() */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
SET_OLD_LINECOL;
}
| T_DOCTYPE
{
/* Document type declaration: call handler.doctype(data) if the handler
   defines it. */
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
CALLBACK(ud, "doctype", "O", $1, finish_doctype);
CHECK_ERROR(ud, finish_doctype);
/* common exit: release everything, then abort the parse on error */
finish_doctype:
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(result);
Py_DECREF($1);
if (error) {
/* keep the exception for parser_feed() */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
SET_OLD_LINECOL;
}
| T_SCRIPT
{
/* Script content: pass the data to handler.characters() and then issue
   handler.endElement("script"); each only if the handler defines it. */
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
CALLBACK(ud, "characters", "O", $1, finish_script);
CALLBACK(ud, "endElement", "s", "script", finish_script);
CHECK_ERROR(ud, finish_script);
/* common exit: release everything, then abort the parse on error */
finish_script:
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(result);
Py_DECREF($1);
if (error) {
/* keep the exception for parser_feed() */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
SET_OLD_LINECOL;
}
| T_STYLE
{
/* Style content: pass the data to handler.characters() and then issue
   handler.endElement("style"); each only if the handler defines it. */
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
CALLBACK(ud, "characters", "O", $1, finish_style);
CALLBACK(ud, "endElement", "s", "style", finish_style);
CHECK_ERROR(ud, finish_style);
/* common exit: release everything, then abort the parse on error */
finish_style:
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(result);
Py_DECREF($1);
if (error) {
/* keep the exception for parser_feed() */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
SET_OLD_LINECOL;
}
| T_TEXT
{
/* Text: call handler.characters(data) if the handler defines it. */
/* $1 is a PyString */
/* Remember this is also called as a lexer error fallback */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
CALLBACK(ud, "characters", "O", $1, finish_characters);
CHECK_ERROR(ud, finish_characters);
/* common exit: release everything, then abort the parse on error */
finish_characters:
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(result);
Py_DECREF($1);
if (error) {
/* keep the exception for parser_feed() */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
SET_OLD_LINECOL;
}
;
%%
/* disable python memory interface */
#undef malloc
#undef realloc
#undef free
/* create parser object
 *
 * Allocates the parser instance together with its UserData record, two
 * empty NUL-terminated buffers and the scanner. The handler starts out as
 * Py_None. Returns a new reference, or NULL with an exception set.
 */
static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds) {
    parser_object* self;
    UserData* ud;

    /* Allocate the user data before the object itself: parser_dealloc()
       dereferences self->userData, so self must never be released while
       that pointer is still NULL. Exactly one record is needed. */
    ud = PyMem_New(UserData, 1);
    if (ud == NULL) {
        return PyErr_NoMemory();
    }
    self = (parser_object*) type->tp_alloc(type, 0);
    if (self == NULL) {
        PyMem_Del(ud);
        return NULL;
    }
    Py_INCREF(Py_None);
    self->handler = Py_None;
    self->userData = ud;
    self->scanner = NULL;

    /* Initialise every field before the first step that can fail, so the
       deallocator never sees uninitialised pointers. */
    ud->handler = self->handler;
    ud->buf = NULL;
    ud->tmp_buf = NULL;
    ud->nextpos = 0;
    ud->bufpos = 0;
    ud->pos = 0;
    ud->column = 1;
    ud->last_column = 1;
    ud->lineno = 1;
    ud->last_lineno = 1;
    ud->tmp_tag = ud->tmp_attrname =
        ud->tmp_attrval = ud->tmp_attrs =
        ud->lexbuf = NULL;
    ud->resolve_entities = resolve_entities;
    ud->list_dict = list_dict;
    ud->exc_type = NULL;
    ud->exc_val = NULL;
    ud->exc_tb = NULL;
    ud->error = NULL;

    /* both buffers start out as empty strings */
    ud->buf = PyMem_New(char, 1);
    if (ud->buf == NULL) {
        Py_DECREF(self);
        return PyErr_NoMemory();
    }
    ud->buf[0] = '\0';
    ud->tmp_buf = PyMem_New(char, 1);
    if (ud->tmp_buf == NULL) {
        Py_DECREF(self);
        return PyErr_NoMemory();
    }
    ud->tmp_buf[0] = '\0';

    if (htmllexInit(&(self->scanner), ud)!=0) {
        Py_DECREF(self);
        PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data");
        return NULL;
    }
    return (PyObject*) self;
}
/* initialize parser object
 *
 * Accepts an optional "handler" argument. Without it the current handler
 * is kept. Returns 0 on success and -1 with an exception set on error.
 */
static int parser_init (parser_object* self, PyObject* args, PyObject* kwds) {
    PyObject* handler = NULL;
    PyObject* old;
    static char *kwlist[] = {"handler", NULL};
    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwlist, &handler)) {
        return -1;
    }
    if (handler==NULL) {
        return 0;
    }
    /* Install the new handler before dropping the old one: releasing the
       old handler may run arbitrary code, which must not find a dangling
       pointer in self->handler. */
    Py_INCREF(handler);
    old = self->handler;
    self->handler = handler;
    self->userData->handler = handler;
    Py_XDECREF(old);
    return 0;
}
/* traverse all used subobjects participating in reference cycles
 *
 * The handler is the only object reference visited. It is NULL once
 * parser_clear() has run, in which case there is nothing to visit. Any
 * non-zero result of visit is passed on to the caller unchanged.
 */
static int parser_traverse (parser_object* self, visitproc visit, void* arg) {
    if (self->handler != NULL) {
        int err = visit(self->handler, arg);
        if (err != 0) {
            return err;
        }
    }
    return 0;
}
/* clear all used subobjects participating in reference cycles
 *
 * Drops the reference to the handler. Both copies of the pointer are
 * reset before the reference is released, because releasing it may run
 * arbitrary code. Always returns 0.
 */
static int parser_clear (parser_object* self) {
    PyObject* old = self->handler;
    self->handler = NULL;
    if (self->userData != NULL) {
        self->userData->handler = NULL;
    }
    Py_XDECREF(old);
    return 0;
}
/* free all allocated resources of parser object
 *
 * Destroys the scanner, releases the handler, frees both buffers and the
 * user data record, and finally the object itself.
 */
static void parser_dealloc (parser_object* self) {
    PyObject* handler;
    /* the type is GC-enabled: stop tracking before fields become invalid */
    PyObject_GC_UnTrack((PyObject*)self);
    if (self->scanner != NULL) {
        htmllexDestroy(self->scanner);
        self->scanner = NULL;
    }
    /* release the handler; reset the pointers first */
    handler = self->handler;
    self->handler = NULL;
    if (self->userData != NULL) {
        self->userData->handler = NULL;
    }
    Py_XDECREF(handler);
    if (self->userData != NULL) {
        PyMem_Del(self->userData->buf);
        PyMem_Del(self->userData->tmp_buf);
        PyMem_Del(self->userData);
        self->userData = NULL;
    }
    self->ob_type->tp_free((PyObject*)self);
}
/* feed a chunk of data to the parser
 *
 * Takes one string argument, hands it to the scanner and runs the parser
 * over it. Returns None on success. When parsing is aborted, the
 * exception saved by the grammar action is restored and NULL is returned.
 */
static PyObject* parser_feed (parser_object* self, PyObject* args) {
    /* set up the parse string */
    UserData* ud = self->userData;
    int slen = 0;
    char* s = NULL;
    if (!PyArg_ParseTuple(args, "t#", &s, &slen)) {
        PyErr_SetString(PyExc_TypeError, "string arg required");
        return NULL;
    }
    /* parse */
    if (htmllexStart(self->scanner, ud, s, slen)!=0) {
        PyErr_SetString(PyExc_MemoryError, "could not start scanner");
        return NULL;
    }
    if (yyparse(self->scanner)!=0) {
        if (ud->exc_type!=NULL) {
            /* note: we give away these objects, so don't decref */
            PyErr_Restore(ud->exc_type, ud->exc_val, ud->exc_tb);
            /* the references are gone now; forget the pointers so that a
               later failure cannot restore them a second time */
            ud->exc_type = NULL;
            ud->exc_val = NULL;
            ud->exc_tb = NULL;
        }
        htmllexStop(self->scanner, ud);
        if (!PyErr_Occurred()) {
            /* the parser failed without a Python exception (it reported
               through yyerror); never return NULL without one */
            PyErr_SetString(PyExc_RuntimeError, "internal parse error");
        }
        return NULL;
    }
    if (htmllexStop(self->scanner, ud)!=0) {
        PyErr_SetString(PyExc_MemoryError, "could not stop scanner");
        return NULL;
    }
    Py_RETURN_NONE;
}
/* flush all parser buffers
 *
 * Discards the temporary tag/attribute state, passes any still buffered
 * data to handler.characters() (if the handler defines it) and recreates
 * the scanner. Returns the integer 0 on success, NULL on error.
 */
static PyObject* parser_flush (parser_object* self, PyObject* args) {
    UserData* ud = self->userData;
    char* shrunk;
    int res = 0;
    if (!PyArg_ParseTuple(args, "")) {
        PyErr_SetString(PyExc_TypeError, "no args required");
        return NULL;
    }
    /* reset parser variables; the old buffer pointer is kept if the
       resize fails */
    shrunk = (char*) PyMem_Realloc(ud->tmp_buf, 1);
    if (shrunk == NULL) {
        return PyErr_NoMemory();
    }
    ud->tmp_buf = shrunk;
    ud->tmp_buf[0] = '\0';
    Py_XDECREF(ud->tmp_tag);
    Py_XDECREF(ud->tmp_attrs);
    Py_XDECREF(ud->tmp_attrval);
    Py_XDECREF(ud->tmp_attrname);
    ud->tmp_tag = ud->tmp_attrs =
        ud->tmp_attrval = ud->tmp_attrname = NULL;
    ud->bufpos = 0;
    if (strlen(ud->buf)) {
        /* XXX set line, col */
        int error = 0;
        PyObject* s = PyString_FromString(ud->buf);
        PyObject* callback = NULL;
        PyObject* result = NULL;
        /* reset buffer; the pending text now lives in s */
        shrunk = (char*) PyMem_Realloc(ud->buf, 1);
        if (shrunk == NULL) {
            /* do not leak the string created above */
            Py_XDECREF(s);
            return PyErr_NoMemory();
        }
        ud->buf = shrunk;
        ud->buf[0] = '\0';
        if (s==NULL) { error=1; goto finish_flush; }
        if (PyObject_HasAttrString(self->handler, "characters")==1) {
            callback = PyObject_GetAttrString(self->handler, "characters");
            if (callback==NULL) { error=1; goto finish_flush; }
            result = PyObject_CallFunction(callback, "O", s);
            if (result==NULL) { error=1; goto finish_flush; }
        }
    finish_flush:
        Py_XDECREF(callback);
        Py_XDECREF(result);
        Py_XDECREF(s);
        if (error==1) {
            return NULL;
        }
    }
    /* start over with a fresh scanner */
    if (htmllexDestroy(self->scanner)!=0) {
        PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data");
        return NULL;
    }
    self->scanner = NULL;
    if (htmllexInit(&(self->scanner), ud)!=0) {
        PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data");
        return NULL;
    }
    return Py_BuildValue("i", res);
}
/* lineno(): return the scanner's current line counter as a Python int;
   the method accepts no arguments */
static PyObject* parser_lineno (parser_object* self, PyObject* args) {
    if (PyArg_ParseTuple(args, "")) {
        return Py_BuildValue("i", self->userData->lineno);
    }
    PyErr_SetString(PyExc_TypeError, "no args required");
    return NULL;
}
/* last_lineno(): return the line counter value saved after the previous
   syntax element as a Python int; the method accepts no arguments */
static PyObject* parser_last_lineno (parser_object* self, PyObject* args) {
    if (PyArg_ParseTuple(args, "")) {
        return Py_BuildValue("i", self->userData->last_lineno);
    }
    PyErr_SetString(PyExc_TypeError, "no args required");
    return NULL;
}
/* column(): return the scanner's current column counter as a Python int;
   the method accepts no arguments */
static PyObject* parser_column (parser_object* self, PyObject* args) {
    if (PyArg_ParseTuple(args, "")) {
        return Py_BuildValue("i", self->userData->column);
    }
    PyErr_SetString(PyExc_TypeError, "no args required");
    return NULL;
}
/* last_column(): return the column counter value saved after the previous
   syntax element as a Python int; the method accepts no arguments */
static PyObject* parser_last_column (parser_object* self, PyObject* args) {
    if (PyArg_ParseTuple(args, "")) {
        return Py_BuildValue("i", self->userData->last_column);
    }
    PyErr_SetString(PyExc_TypeError, "no args required");
    return NULL;
}
/* pos(): return the position in the data stream as a Python int; the
   method accepts no arguments */
static PyObject* parser_pos (parser_object* self, PyObject* args) {
    if (PyArg_ParseTuple(args, "")) {
        return Py_BuildValue("i", self->userData->pos);
    }
    PyErr_SetString(PyExc_TypeError, "no args required");
    return NULL;
}
/* reset the parser. This will erase all buffered data!
 *
 * Releases the temporary tag/attribute objects, recreates the scanner,
 * empties both buffers and resets the position, line and column counters
 * to their initial values. Returns None on success, NULL on error.
 */
static PyObject* parser_reset (parser_object* self, PyObject* args) {
    UserData* ud = self->userData;
    char* shrunk;
    if (!PyArg_ParseTuple(args, "")) {
        PyErr_SetString(PyExc_TypeError, "no args required");
        return NULL;
    }
    /* Release the temporary objects before they are forgotten, in the
       same order as parser_flush() does: ahead of destroying the scanner. */
    Py_XDECREF(ud->tmp_tag);
    Py_XDECREF(ud->tmp_attrs);
    Py_XDECREF(ud->tmp_attrval);
    Py_XDECREF(ud->tmp_attrname);
    ud->tmp_tag = ud->tmp_attrs =
        ud->tmp_attrval = ud->tmp_attrname = NULL;
    if (htmllexDestroy(self->scanner)!=0) {
        PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data");
        return NULL;
    }
    /* the scanner is gone; never keep a dangling pointer around, even if
       one of the following steps fails */
    self->scanner = NULL;
    /* reset buffer; the old pointers are kept if a resize fails */
    shrunk = (char*) PyMem_Realloc(ud->buf, 1);
    if (shrunk == NULL) {
        return PyErr_NoMemory();
    }
    ud->buf = shrunk;
    ud->buf[0] = '\0';
    shrunk = (char*) PyMem_Realloc(ud->tmp_buf, 1);
    if (shrunk == NULL) {
        return PyErr_NoMemory();
    }
    ud->tmp_buf = shrunk;
    ud->tmp_buf[0] = '\0';
    ud->bufpos =
        ud->pos =
        ud->nextpos = 0;
    ud->column =
        ud->last_column =
        ud->lineno =
        ud->last_lineno = 1;
    if (htmllexInit(&(self->scanner), ud)!=0) {
        PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data");
        return NULL;
    }
    Py_RETURN_NONE;
}
/* debug(level): store level in the parser's yydebug flag, pass it on to
   htmllexDebug() and return that function's result as a Python int.
   A level >0 switches debugging on, 0 switches it off. */
static PyObject* parser_debug (parser_object* self, PyObject* args) {
    int level;
    int lexer_result;
    if (!PyArg_ParseTuple(args, "i", &level)) {
        return NULL;
    }
    yydebug = level;
    lexer_result = htmllexDebug(&(self->scanner), level);
    return PyInt_FromLong((long)lexer_result);
}
/* getter of the "handler" attribute: returns a new reference to the
   current handler object */
static PyObject* parser_gethandler (parser_object* self, void* closure) {
    PyObject* current = self->handler;
    Py_INCREF(current);
    return current;
}
/* setter of the "handler" attribute: installs value as the new handler.
   Deleting the attribute is refused with a TypeError. Returns 0 on
   success and -1 on error. */
static int parser_sethandler (parser_object* self, PyObject* value, void* closure) {
    PyObject* old;
    if (value == NULL) {
        PyErr_SetString(PyExc_TypeError, "Cannot delete parser handler");
        return -1;
    }
    /* Install the new handler before dropping the old one: releasing the
       old handler may run arbitrary code, which must not find a dangling
       pointer in self->handler. */
    Py_INCREF(value);
    old = self->handler;
    self->handler = value;
    self->userData->handler = value;
    Py_XDECREF(old);
    return 0;
}
/* type interface */
/* no plain struct members are exported */
static PyMemberDef parser_members[] = {
{NULL} /* Sentinel */
};
/* computed attributes: "handler" is read and written through
   parser_gethandler()/parser_sethandler() */
static PyGetSetDef parser_getset[] = {
{"handler", (getter)parser_gethandler, (setter)parser_sethandler,
"handler object", NULL},
{NULL} /* Sentinel */
};
/* methods of parser objects; all of them use the METH_VARARGS calling
   convention, the strings are the method docstrings */
static PyMethodDef parser_methods[] = {
{"feed", (PyCFunction)parser_feed, METH_VARARGS, "feed data to parse incremental"},
{"reset", (PyCFunction)parser_reset, METH_VARARGS, "reset the parser (no flushing)"},
{"flush", (PyCFunction)parser_flush, METH_VARARGS, "flush parser buffers"},
{"debug", (PyCFunction)parser_debug, METH_VARARGS, "set debug level"},
{"lineno", (PyCFunction)parser_lineno, METH_VARARGS, "get the current line number"},
{"last_lineno", (PyCFunction)parser_last_lineno, METH_VARARGS, "get the last line number"},
{"column", (PyCFunction)parser_column, METH_VARARGS, "get the current column"},
{"last_column", (PyCFunction)parser_last_column, METH_VARARGS, "get the last column"},
{"pos", (PyCFunction)parser_pos, METH_VARARGS, "get the current scanner position"},
{NULL} /* Sentinel */
};
/* Type object of the parser class: a subclassable, garbage-collected
   type (see tp_flags) whose instances are created by parser_new() and
   configured by parser_init(). */
static PyTypeObject parser_type = {
PyObject_HEAD_INIT(NULL)
0, /* ob_size */
"bk.HtmlParser.htmlsax.parser", /* tp_name */
sizeof(parser_object), /* tp_basicsize */
0, /* tp_itemsize */
/* methods */
(destructor)parser_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_compare */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Py_TPFLAGS_HAVE_GC, /* tp_flags */
"HTML parser object", /* tp_doc */
(traverseproc)parser_traverse, /* tp_traverse */
(inquiry)parser_clear, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
parser_methods, /* tp_methods */
parser_members, /* tp_members */
parser_getset, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)parser_init, /* tp_init */
0, /* tp_alloc */
parser_new, /* tp_new */
};
/* python module interface
"Create a new HTML parser object with handler (which may be None).\n"
"\n"
"Used callbacks (they don't have to be defined) of a handler are:\n"
"comment(data): <!--data-->\n"
"startElement(tag, attrs): <tag {attr1:value1,attr2:value2,..}>\n"
"endElement(tag): </tag>\n"
"doctype(data): <!DOCTYPE data?>\n"
"pi(data): <?data?>\n"
"cdata(data): <![CDATA[data]]>\n"
"characters(data): data\n"
"\n"
"Additionally, there are error and warning callbacks:\n"
"error(msg)\n"
"warning(msg)\n"
"fatalError(msg)\n"},
*/
/* the module defines no module-level functions; its only export, the
   parser type, is added in inithtmlsax() */
static PyMethodDef htmlsax_methods[] = {
{NULL} /* Sentinel */
};
#ifndef PyMODINIT_FUNC /* declarations for DLL import/export */
#define PyMODINIT_FUNC void
#endif
/* initialization of the htmlsax module
 *
 * Registers the parser type as "parser" and looks up the two Python
 * helpers used by the parser: bk.HtmlParser.resolve_entities and
 * bk.containers.ListDict. On failure the function returns early and
 * leaves the Python exception set for the import machinery.
 */
PyMODINIT_FUNC inithtmlsax (void) {
    PyObject* m;
    PyObject* helper;
    if (PyType_Ready(&parser_type) < 0) {
        return;
    }
    if ((m = Py_InitModule3("htmlsax", htmlsax_methods, "SAX HTML parser routines"))==NULL) {
        return;
    }
    Py_INCREF(&parser_type);
    if (PyModule_AddObject(m, "parser", (PyObject *)&parser_type)==-1) {
        /* init error */
        PyErr_Print();
    }
    /* PyImport_ImportModule returns a new reference; it is only needed
       for the attribute lookup and released right afterwards */
    if ((helper = PyImport_ImportModule("bk.HtmlParser"))==NULL) {
        return;
    }
    resolve_entities = PyObject_GetAttrString(helper, "resolve_entities");
    Py_DECREF(helper);
    if (resolve_entities==NULL) {
        return;
    }
    if ((helper = PyImport_ImportModule("bk.containers"))==NULL) {
        return;
    }
    list_dict = PyObject_GetAttrString(helper, "ListDict");
    Py_DECREF(helper);
    if (list_dict==NULL) {
        return;
    }
}

83
bk/HtmlParser/htmlsax.h Normal file
View file

@ -0,0 +1,83 @@
/* Copyright (C) 2000-2004 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef HTMLSAX_H
#define HTMLSAX_H
#include "Python.h"
/* require Python >= 2.3 */
#ifndef PY_VERSION_HEX
#error please install Python >= 2.3
#endif
#if PY_VERSION_HEX < 0x02030000
#error please install Python >= 2.3
#endif
/* this will be in Python 2.4 */
#ifndef Py_RETURN_NONE
#define Py_RETURN_NONE do {Py_INCREF(Py_None); return Py_None;} while (0)
#endif
/* user_data type for SAX calls: the per-parser state that is handed
 * around while scanning, so no global state is kept in this struct */
typedef struct {
/* the Python SAX object to issue callbacks */
PyObject* handler;
/* Buffer to store still-to-be-scanned characters. After recognizing
 * a complete syntax element, all data up to bufpos will be removed.
 * Before scanning you should append new data to this buffer.
 */
char* buf;
/* current position in the buffer counting from zero */
unsigned int bufpos;
/* current position of next syntax element */
unsigned int nextpos;
/* position in the stream of data already seen, counting from zero */
unsigned int pos;
/* line counter, counting from one */
unsigned int lineno;
/* last value of line counter */
unsigned int last_lineno;
/* column counter, counting from zero */
unsigned int column;
/* last value of column counter */
unsigned int last_column;
/* input buffer of lexer, must be deleted when the parsing stops */
void* lexbuf;
/* temporary character buffer */
char* tmp_buf;
/* temporary HTML start or end tag name */
PyObject* tmp_tag;
/* temporary HTML start tag attribute name */
PyObject* tmp_attrname;
/* temporary HTML start tag attribute value */
PyObject* tmp_attrval;
/* temporary HTML start tag attribute container
 * NOTE(review): presumably an instance of the class stored in
 * list_dict below -- confirm against the grammar actions */
PyObject* tmp_attrs;
/* callable used to resolve HTML entities
 * NOTE(review): presumably bk.HtmlParser.resolve_entities as looked
 * up by the htmlsax module init -- confirm where it is assigned */
PyObject* resolve_entities;
/* class used for attribute containers
 * NOTE(review): presumably bk.containers.ListDict as looked up by
 * the htmlsax module init -- confirm where it is assigned */
PyObject* list_dict;
/* stored Python exception (if error occurred in scanner) */
PyObject* exc_type;
PyObject* exc_val;
PyObject* exc_tb;
/* error string */
PyObject* error;
} UserData;
#endif

54
bk/HtmlParser/s_util.c Normal file
View file

@ -0,0 +1,54 @@
/*
* linux/lib/string.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
#include <string.h>
#if !defined(HAVE_STRLCPY)
/**
 * strlcpy - Copy a %NUL terminated string into a sized buffer
 * @dst: Where to copy the string to
 * @src: Where to copy the string from
 * @count: size of destination buffer
 *
 * At most @count-1 characters are copied and the result is always
 * %NUL terminated, unless @count is zero, in which case @dst is not
 * touched at all. Unlike strncpy() the rest of the buffer is not
 * padded. Returns strlen(@src), so a return value >= @count tells
 * the caller that the copy was truncated.
 */
size_t strlcpy (char *dst, const char *src, size_t count)
{
    const size_t srclen = strlen(src);
    size_t ncopy;

    if (count == 0) {
        return srclen;
    }
    /* leave one byte of room for the terminator */
    ncopy = (srclen < count) ? srclen : count - 1;
    memcpy(dst, src, ncopy);
    dst[ncopy] = '\0';
    return srclen;
}
#endif /* !HAVE_STRLCPY */
#if !defined(HAVE_STRLCAT)
/**
 * strlcat - Append a length-limited, %NUL-terminated string to another
 * @dest: The string to be appended to
 * @src: The string to append to it
 * @count: The size of the destination buffer.
 *
 * Appends as much of @src as fits and always %NUL terminates the
 * result when there is room for it. If @dest holds no %NUL within
 * its first @count bytes (this includes @count == 0) nothing is
 * written. Returns the length of the string it tried to create:
 * min(@count, strlen(@dest)) + strlen(@src); a value >= @count
 * means the result was truncated.
 */
size_t strlcat (char *dest, const char *src, size_t count)
{
    size_t dsize = 0;
    size_t len = strlen(src);
    size_t res;

    /* bounded scan: never look at more than count bytes of dest */
    while (dsize < count && dest[dsize] != '\0') {
        dsize++;
    }
    res = dsize + len;
    if (dsize == count) {
        /* no room left, not even for a terminator; without this check
         * "count - dsize" below would wrap around and the copy would
         * run past the end of the buffer */
        return res;
    }
    dest += dsize;
    count -= dsize;
    if (len >= count)
        len = count-1;
    memcpy(dest, src, len);
    dest[len] = 0;
    return res;
}
#endif /* !HAVE_STRLCAT */

14
bk/HtmlParser/s_util.h Normal file
View file

@ -0,0 +1,14 @@
/*
* linux/lib/string.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
/* fallback prototype, declared only when HAVE_STRLCPY is not defined;
 * copies src into a buffer of the given size */
#if !defined(HAVE_STRLCPY)
size_t strlcpy(char *dst, const char *src, size_t size);
#endif /* !HAVE_STRLCPY */
/* fallback prototype, declared only when HAVE_STRLCAT is not defined;
 * appends src to the string in a buffer of the given size */
#if !defined(HAVE_STRLCAT)
size_t strlcat(char *dst, const char *src, size_t size);
#endif /* !HAVE_STRLCAT */

7
bk/Makefile Normal file
View file

@ -0,0 +1,7 @@
# directory this copy is compared against and copied from
D = ../../bk-python/bk
# show differences between this directory and $(D)
diff:
diff -BurN . $(D)
# overwrite the files here with the contents of $(D)
update:
cp -r $(D)/* .

1
bk/__init__.py Normal file
View file

@ -0,0 +1 @@
# -*- coding: iso-8859-1 -*-

79
bk/ansicolor.py Normal file
View file

@ -0,0 +1,79 @@
# -*- coding: iso-8859-1 -*-
"""ANSI Color definitions and functions"""
# Copyright (C) 2000-2004 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
import os
# Template of an ANSI escape sequence; %s takes the numeric codes
AnsiEsc = "\x1b[%sm"

# text attribute numbers by name
AnsiType = dict([
    ('bold', '1'),
    ('light', '2'),
    ('underline', '4'),
    ('blink', '5'),
    ('invert', '7'),
    ('concealed', '8'),
])

# color numbers by name: lowercase names carry the 3x codes,
# capitalized names the 4x codes
AnsiColor = dict([
    ('default', '0'),
    ('black', '30'),
    ('red', '31'),
    ('green', '32'),
    ('yellow', '33'),
    ('blue', '34'),
    ('purple', '35'),
    ('cyan', '36'),
    ('white', '37'),
    ('Black', '40'),
    ('Red', '41'),
    ('Green', '42'),
    ('Yellow', '43'),
    ('Blue', '44'),
    ('Purple', '45'),
    ('Cyan', '46'),
    ('White', '47'),
])

# pc speaker beep escape code
Beep = "\007"
def esc_ansicolor (color):
    """build the ANSI escape sequence for a color definition of the
    form "name" or "type;name"; unknown color names map to '0',
    unknown type names to an empty type code"""
    prefix = ''
    if ";" in color:
        type_name, color = color.split(";", 1)
        prefix = AnsiType.get(type_name, '') + ";"
    return AnsiEsc % (prefix + AnsiColor.get(color, '0'))
# escape sequence that switches back to the default color
AnsiReset = esc_ansicolor("default")


def colorize (text, color=None):
    """wrap text in the escape sequence for color followed by a reset;
    text is returned unchanged when no color is given or the TERM
    environment variable is unset or empty"""
    if color is None or not os.environ.get('TERM'):
        return text
    return '%s%s%s' % (esc_ansicolor(color), text, AnsiReset)

218
bk/containers.py Normal file
View file

@ -0,0 +1,218 @@
# -*- coding: iso-8859-1 -*-
"""special container classes"""
# Copyright (C) 2004 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
class SetList (list):
    """a list that eliminates all duplicates

    Only append, extend, insert and item assignment by index keep the
    list free of duplicates; other list operations (slices, +=, ...)
    are inherited unchanged.
    """

    def append (self, x):
        """append only if not already there"""
        if x not in self:
            super(SetList, self).append(x)

    def extend (self, x):
        """extend while eliminating duplicates by appending item for item"""
        for i in x:
            self.append(i)

    def insert (self, i, x):
        """insert only if not already there"""
        if x not in self:
            super(SetList, self).insert(i, x)

    def __setitem__ (self, key, value):
        """set new value, and eliminate old duplicates (if any)

        The item stored at position key is kept, every other item
        equal to value is removed. Raises IndexError like list does
        for an out-of-range key; nothing is removed in that case.
        """
        if isinstance(key, slice):
            # slice assignment is passed through without de-duplication
            super(SetList, self).__setitem__(key, value)
            return
        # normalize negative indices so that the position just written
        # is recognized in the loop below
        pos = key
        if pos < 0:
            pos += len(self)
        super(SetList, self).__setitem__(key, value)
        # remove old duplicates from last to first, so the positions
        # still to be visited do not shift
        for i in range(len(self) - 1, -1, -1):
            if i != pos and self[i] == value:
                del self[i]
class ListDict (dict):
    """a dictionary whose iterators reflect the order in which elements
    were added

    Only item assignment, item deletion and clear() maintain the
    insertion order; other inherited mutators (update, setdefault,
    pop, ...) do not record new keys.
    """

    def __init__ (self):
        """initialize sorted key list"""
        super(ListDict, self).__init__()
        # keys in the order they were first added
        self._keys = []

    def __setitem__ (self, key, value):
        """add key,value to dict, append key to sorted list"""
        if key not in self:
            self._keys.append(key)
        super(ListDict, self).__setitem__(key, value)

    def __delitem__ (self, key):
        """remove key from dict; raises KeyError if key is missing"""
        # delete from the dict first, so a missing key raises the
        # KeyError callers expect from a dict (list.remove would raise
        # ValueError) and leaves the key list untouched
        super(ListDict, self).__delitem__(key)
        self._keys.remove(key)

    def __iter__ (self):
        """return iterator over sorted keys"""
        return self.iterkeys()

    def values (self):
        """return sorted list of values"""
        return [self[k] for k in self._keys]

    def items (self):
        """return sorted list of items"""
        return [(k, self[k]) for k in self._keys]

    def keys (self):
        """return sorted list of keys"""
        return self._keys[:]

    def itervalues (self):
        """return iterator over sorted values"""
        return iter(self.values())

    def iteritems (self):
        """return iterator over sorted items"""
        return iter(self.items())

    def iterkeys (self):
        """return iterator over sorted keys"""
        return iter(self.keys())

    def clear (self):
        """remove all dict entries"""
        self._keys = []
        super(ListDict, self).clear()
class LRU (object):
    """
    Implementation of a length-limited O(1) LRU queue.
    Built for and used by PyPE:
    http://pype.sourceforge.net
    Copyright 2003 Josiah Carlson. (Licensed under the GPL)

    Entries live in a doubly linked list running from the least
    recently used (first) to the most recently used (last) entry;
    self.d maps each key to its list node. Reading or writing a key
    moves it to the end, and adding a key beyond the size limit drops
    the first entry.
    """

    class Node (object):
        """linked list node; me is the (key, value) pair"""

        def __init__ (self, prev, me):
            self.prev = prev
            self.me = me
            self.next = None

    def __init__ (self, count, pairs=()):
        """store at most count entries (at least one), optionally
        filled from an iterable of (key, value) pairs"""
        self.count = max(count, 1)
        self.d = {}
        self.first = None
        self.last = None
        for key, value in pairs:
            self[key] = value

    def __contains__ (self, obj):
        return obj in self.d

    def has_key (self, obj):
        return obj in self.d

    def __getitem__ (self, obj):
        """return the value for obj and mark it as most recently used;
        raises KeyError if obj is missing"""
        a = self.d[obj].me
        # re-inserting moves the entry to the end of the list
        self[a[0]] = a[1]
        return a[1]

    def __setitem__ (self, obj, val):
        """store val under obj as most recently used entry, dropping
        the least recently used one if the limit is exceeded"""
        if obj in self.d:
            del self[obj]
        nobj = self.Node(self.last, (obj, val))
        if self.first is None:
            self.first = nobj
        if self.last is not None:
            self.last.next = nobj
        self.last = nobj
        self.d[obj] = nobj
        if len(self.d) > self.count:
            # count is at least 1, so two or more nodes are linked here
            # and the first one always has a successor
            oldest = self.first
            oldest.next.prev = None
            self.first = oldest.next
            oldest.next = None
            del self.d[oldest.me[0]]

    def __delitem__ (self, obj):
        """unlink and forget obj; raises KeyError if obj is missing"""
        nobj = self.d[obj]
        if nobj.prev is not None:
            nobj.prev.next = nobj.next
        else:
            self.first = nobj.next
        if nobj.next is not None:
            nobj.next.prev = nobj.prev
        else:
            self.last = nobj.prev
        del self.d[obj]

    def __iter__ (self):
        """iterate over the values, least recently used first"""
        cur = self.first
        while cur is not None:
            # fetch the successor before yielding, the consumer may
            # modify the list while we are suspended
            cur2 = cur.next
            yield cur.me[1]
            cur = cur2

    def iteritems (self):
        """iterate over (key, value) pairs, least recently used first"""
        cur = self.first
        while cur is not None:
            cur2 = cur.next
            yield cur.me
            cur = cur2

    def iterkeys (self):
        """iterate over the keys in the order of the underlying dict"""
        return iter(self.d)

    def itervalues (self):
        """iterate over the values, least recently used first"""
        for i, j in self.iteritems():
            yield j

    def keys (self):
        """return the keys in the order of the underlying dict"""
        return self.d.keys()

    def setdefault (self, key, failobj=None):
        """return the value for key, storing failobj first if key is
        missing; the entry becomes the most recently used one"""
        if key not in self.d:
            self[key] = failobj
        return self[key]
def _main ():
a = LRU(4)
a['1'] = '1'
a['2'] = '2'
a['3'] = '3'
a['4'] = '4'
a['5'] = '5'
for i in a.iteritems():
print i,
print
print a['2']
a['6'] = '6'
for i in a.iteritems():
print i,
print
print a.has_key('1')
print a.has_key('2')

109
bk/i18n.py Normal file
View file

@ -0,0 +1,109 @@
# -*- coding: iso-8859-1 -*-
"""application internationalization support"""
# Copyright (C) 2000-2004 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# i18n support
import os
import locale
import gettext
# default gettext function: returns its argument unchanged; replaced
# by init() when a translation catalog is found
_ = lambda s: s
# more supported languages are added in init()
supported_languages = ['en']
# language used as fallback by the lookup functions; set in init()
default_language = None
def init (domain, directory):
    """initialize this gettext i18n module

    Installs the gettext function of the translation found for domain
    below directory (the identity default is kept if there is none),
    registers every language that has a <lang>/LC_MESSAGES/<domain>.mo
    catalog in directory and derives default_language from the
    current locale, falling back to "en".
    """
    global _, default_language
    try:
        _ = gettext.translation(domain, directory).gettext
    except IOError:
        # keep default gettext function
        pass
    # get supported languages
    try:
        entries = os.listdir(directory)
    except OSError:
        # a missing or unreadable directory is treated like a missing
        # catalog above: only the built-in language stays available
        entries = []
    for lang in entries:
        if lang in supported_languages:
            # already registered, e.g. by an earlier init() call
            continue
        path = os.path.join(directory, lang)
        if not os.path.isdir(path):
            continue
        if os.path.exists(os.path.join(path, 'LC_MESSAGES', '%s.mo'%domain)):
            supported_languages.append(lang)
    loc = get_locale()
    if loc in supported_languages:
        default_language = loc
    else:
        default_language = "en"
def get_lang (lang):
    """map an unsupported language to the default language, pass a
    supported one through unchanged"""
    if lang not in supported_languages:
        return default_language
    return lang
def get_headers_lang (headers):
    """return preferred supported language in given HTTP headers

    The Accept-Language value is split into language ranges which are
    tried in order of descending quality value (the "q" parameter,
    1.0 if absent or malformed); ranges of equal quality keep their
    header order and ranges with a quality of zero are ignored.
    Returns default_language if the header is missing or none of the
    listed languages is supported.
    """
    try:
        accept = headers['Accept-Language']
    except KeyError:
        accept = None
    if accept is None:
        # some header containers return None instead of raising
        return default_language
    ranked = []
    for pos, entry in enumerate(accept.split(",")):
        fields = entry.split(";")
        quality = 1.0
        for param in fields[1:]:
            if "=" not in param:
                continue
            name, value = param.split("=", 1)
            if name.strip().lower() == "q":
                try:
                    quality = float(value.strip())
                except ValueError:
                    # malformed quality value: keep the default
                    pass
        if quality > 0:
            # negated quality: a plain ascending sort puts the best
            # range first, pos keeps header order among equals
            ranked.append((-quality, pos, fields[0].strip()))
    ranked.sort()
    for _quality, _pos, lang in ranked:
        if lang in supported_languages:
            return lang
    return default_language
def get_locale ():
    """return the language part of the default locale; 'C' is used
    when no default locale is configured"""
    loc = locale.getdefaultlocale()[0]
    if loc is None:
        loc = 'C'
    loc = locale.normalize(loc)
    # cut off everything from the first '@', then from the first '.',
    # then from the first '_', leaving only the leading component
    for separator in ('@', '.', '_'):
        cut = loc.find(separator)
        if cut >= 0:
            loc = loc[:cut]
    return loc
# name of each language, written in that language
lang_names = dict([
    ('en', u'English'),
    ('de', u'Deutsch'),
])

# lang_transis[lang][curlang] is the name of lang written in curlang
lang_transis = dict([
    ('de', dict([('en', u'German')])),
    ('en', dict([('de', u'Englisch')])),
])


def lang_name (lang):
    """look up the full name of lang; raises KeyError for a language
    without an entry"""
    names = lang_names
    return names[lang]


def lang_trans (lang, curlang):
    """look up the full name of lang translated into curlang; raises
    KeyError if there is no such translation"""
    translations = lang_transis[lang]
    return translations[curlang]

114
bk/log.py Normal file
View file

@ -0,0 +1,114 @@
# -*- coding: iso-8859-1 -*-
"""logging and debug functions"""
# Copyright (C) 2003-2004 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# public api
__all__ = ["debug", "info", "warn", "error", "critical",
"exception", "get_log_file", "set_format", "usedmemory"]
import os
import logging
def iswritable (fname):
    """return True if given file is writable

    Directories and symbolic links are never considered writable. An
    existing file is probed by opening it for appending; a missing
    file is created and removed again. Any I/O or OS error during the
    probe yields False.
    """
    if os.path.isdir(fname) or os.path.islink(fname):
        return False
    try:
        if os.path.exists(fname):
            open(fname, 'a').close()
        else:
            # probe by creating the file, then clean up again
            open(fname, 'w').close()
            os.remove(fname)
        return True
    except (IOError, OSError):
        return False
def get_log_file (name, logname, trydirs=[]):
    """get full path name to writeable logfile

    The directories in trydirs are tried first, then platform
    defaults (the TEMP directory on nt, otherwise /var/log/<name>,
    /var/tmp/<name> and /tmp/<name>) and finally the current working
    directory. Returns the first path for which iswritable() is true;
    raises IOError if there is none. trydirs is not modified.
    """
    dirs = []
    if os.name =='nt':
        tempdir = os.environ.get("TEMP")
        # TEMP may be unset; None would make os.path.join fail below
        if tempdir:
            dirs.append(tempdir)
    else:
        dirs.append(os.path.join('/', 'var', 'log', name))
        dirs.append(os.path.join('/', 'var', 'tmp', name))
        dirs.append(os.path.join('/', 'tmp', name))
    dirs.append(os.getcwd())
    trydirs = list(trydirs)+dirs
    for d in trydirs:
        fullname = os.path.join(d, logname)
        if iswritable(fullname):
            return fullname
    raise IOError("Could not find writable directory for %s in %s" % (logname, str(trydirs)))
def set_format (handler):
    """give handler the formatter of the first root logger handler
    and return handler; raises IndexError if the root logger has no
    handlers"""
    primary = logging.root.handlers[0]
    handler.setFormatter(primary.formatter)
    return handler
def usedmemory ():
    """return the number found on the VmRSS line of
    /proc/<pid>/status as int; 0 if the file cannot be opened or has
    no such line"""
    try:
        fp = open('/proc/%d/status' % os.getpid())
    except IOError:
        # no readable /proc status file for this process
        return 0
    val = 0
    try:
        for line in fp:
            if line.startswith('VmRSS:'):
                val = int(line[6:].strip().split()[0])
                break
    finally:
        fp.close()
    return val
import gc
gc.enable()
# memory leak debugging
#gc.set_debug(gc.DEBUG_LEAK)
def debug (log, msg, *args):
    """log a debug message to the logger named log"""
    logging.getLogger(log).debug(msg, *args)
    #logging.getLogger(log).info("Mem: %d kB"%usedmemory())


def info (log, msg, *args):
    """log an informational message to the logger named log"""
    logging.getLogger(log).info(msg, *args)


def warn (log, msg, *args):
    """log a warning to the logger named log"""
    # Logger.warn is only a deprecated alias of Logger.warning
    logging.getLogger(log).warning(msg, *args)


def error (log, msg, *args):
    """log an error to the logger named log"""
    logging.getLogger(log).error(msg, *args)


def critical (log, msg, *args):
    """log a critical error to the logger named log"""
    logging.getLogger(log).critical(msg, *args)


def exception (log, msg, *args):
    """log an error together with the exception currently being
    handled; meant to be called from an except block"""
    logging.getLogger(log).exception(msg, *args)

55
bk/mem.py Normal file
View file

@ -0,0 +1,55 @@
# -*- coding: iso-8859-1 -*-
""" Copied from the Python Cookbook recipe at
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/286222
To find the memory usage in a particular section of code these
functions are typically used as follows:
m0 = memory()
...
m1 = memory(m0)
"""
import os
# path of the status pseudo file of this process; the pid is taken
# once, when the module is imported
_proc_status = '/proc/%d/status' % os.getpid()
# factors to convert a value with the given unit suffix to bytes
_scale = {'kB': 1024.0, 'mB': 1024.0*1024.0,
'KB': 1024.0, 'MB': 1024.0*1024.0}
def _VmB (VmKey):
    '''Parse /proc/<pid>/status file for given key.

    Returns the value of the line starting at VmKey converted to
    bytes, or 0.0 if the platform is not posix, the status file
    cannot be read, the key is missing or the line cannot be parsed.
    '''
    if os.name != 'posix':
        # not supported
        return 0.0
    # get pseudo file /proc/<pid>/status
    try:
        t = open(_proc_status)
        try:
            v = t.read()
        finally:
            t.close()
    except IOError:
        # unsupported platform (non-Linux?)
        return 0.0
    # get VmKey line e.g. 'VmRSS: 9999 kB\n ...'
    i = v.find(VmKey)
    if i < 0:
        # key not present in the status file
        return 0.0
    v = v[i:].split(None, 3) # whitespace
    if len(v) < 3:
        return 0.0 # invalid format?
    # convert Vm value to bytes
    try:
        return float(v[1]) * _scale[v[2]]
    except (ValueError, KeyError):
        # non-numeric value or unknown unit
        return 0.0
def memory (since=0.0):
    '''Return the VmSize value in bytes, minus since.'''
    current = _VmB('VmSize:')
    return current - since


def resident (since=0.0):
    '''Return the VmRSS value in bytes, minus since.'''
    current = _VmB('VmRSS:')
    return current - since


def stacksize (since=0.0):
    '''Return the VmStk value in bytes, minus since.'''
    current = _VmB('VmStk:')
    return current - since

48
bk/strtime.py Normal file
View file

@ -0,0 +1,48 @@
# -*- coding: iso-8859-1 -*-
"""time to string conversion utility functions"""
# Copyright (C) 2004 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
import time
import bk.i18n
def strtime (t):
    """format the local time of timestamp t as "YYYY-MM-DD HH:MM:SS"
    followed by the timezone info of strtimezone()"""
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t))
    return stamp + strtimezone()
def strduration (duration):
    """return string formatted time duration

    duration is given in seconds and is converted to minutes or hours
    once it exceeds 60 of the smaller unit; the result has the form
    " <value with three decimals> <translated unit name>".
    """
    name = bk.i18n._("seconds")
    if duration > 60:
        # divide by a float: integer durations must not be truncated
        duration = duration / 60.0
        name = bk.i18n._("minutes")
    if duration > 60:
        duration = duration / 60.0
        name = bk.i18n._("hours")
    return " %.3f %s"%(duration, name)
def strtimezone ():
    """return timezone info, %z on some platforms, but not supported on all

    The result is the offset of the current local time from UTC in
    the form "+hhmm" or "-hhmm".
    """
    # time.daylight only says that a DST zone is defined; whether DST
    # is in effect right now is told by tm_isdst
    if time.daylight and time.localtime().tm_isdst > 0:
        offset = -time.altzone
    else:
        offset = -time.timezone
    sign = "+"
    if offset < 0:
        sign = "-"
        offset = -offset
    hours, rest = divmod(offset, 3600)
    return "%s%02d%02d" % (sign, hours, rest // 60)

1
bk/tests/__init__.py Normal file
View file

@ -0,0 +1 @@
# -*- coding: iso-8859-1 -*-

View file

@ -0,0 +1,87 @@
# -*- coding: iso-8859-1 -*-
"""test container routines"""
import unittest
import random
import bk.containers
class TestListDict (unittest.TestCase):
    """tests for bk.containers.ListDict"""

    def setUp (self):
        """every test starts with a fresh ListDict"""
        self.d = bk.containers.ListDict()

    def test_insert (self):
        """inserted keys are contained"""
        self.assert_(not self.d)
        for key, value in ((2, 1), (1, 2)):
            self.d[key] = value
        for key in (2, 1):
            self.assert_(key in self.d)

    def test_delete (self):
        """a deleted key is gone, the other key stays"""
        self.assert_(not self.d)
        for key, value in ((2, 1), (1, 2)):
            self.d[key] = value
        del self.d[1]
        self.assert_(2 in self.d)
        self.assert_(1 not in self.d)

    def test_update (self):
        """assigning to an existing key replaces its value"""
        self.assert_(not self.d)
        for key, value in ((2, 1), (1, 2), (1, 1)):
            self.d[key] = value
        self.assertEqual(self.d[1], 1)

    def test_sorting (self):
        """keys() returns the keys in insertion order"""
        self.assert_(not self.d)
        toinsert = random.sample(xrange(10000000), 60)
        for x in toinsert:
            self.d[x] = x
        keys = self.d.keys()
        for pos in range(len(keys)):
            self.assertEqual(self.d[keys[pos]], toinsert[pos])
class TestSetList (unittest.TestCase):
    """tests for bk.containers.SetList"""

    def setUp (self):
        """every test starts with a fresh SetList"""
        self.l = bk.containers.SetList()

    def test_append (self):
        """appending the same item twice stores it once"""
        self.assert_(not self.l)
        for item in (1, 1):
            self.l.append(item)
        self.assertEqual(len(self.l), 1)

    def test_append2 (self):
        """a duplicate is rejected even if not appended last"""
        self.assert_(not self.l)
        for item in (1, 2, 1):
            self.l.append(item)
        self.assertEqual(len(self.l), 2)

    def test_extend (self):
        """extend drops duplicates and keeps first occurrences"""
        self.assert_(not self.l)
        self.l.extend([1, 2, 1])
        self.assertEqual(len(self.l), 2)
        for pos, expected in ((0, 1), (1, 2)):
            self.assertEqual(self.l[pos], expected)

    def test_setitem (self):
        """item assignment removes the old duplicate"""
        self.assert_(not self.l)
        self.l.extend([1,2,3])
        self.l[1] = 3
        self.assertEqual(len(self.l), 2)
        for pos, expected in ((0, 1), (1, 3)):
            self.assertEqual(self.l[pos], expected)
def test_suite ():
    """build a suite holding the ListDict and SetList tests"""
    suite = unittest.TestSuite()
    for case in (TestListDict, TestSetList):
        suite.addTest(unittest.makeSuite(case))
    return suite
if __name__ == '__main__':
unittest.main()

170
bk/tests/test_parser.py Normal file
View file

@ -0,0 +1,170 @@
# -*- coding: iso-8859-1 -*-
import bk.HtmlParser
import bk.HtmlParser.htmlsax
import bk.HtmlParser.htmllib
import cStringIO as StringIO
import unittest
parsetests = [
# start tags
("""<a b="c" >""", """<a b="c">"""),
("""<a b='c' >""", """<a b="c">"""),
("""<a b=c" >""", """<a b="c">"""),
("""<a b=c' >""", """<a b="c'">"""),
("""<a b="c >""", """<a b="c >"""),
("""<a b="" >""", """<a b="">"""),
("""<a b='' >""", """<a b="">"""),
("""<a b=>""", """<a b="">"""),
("""<a b= >""", """<a b="">"""),
("""<a =c>""", """<a c>"""),
("""<a =c >""", """<a c>"""),
("""<a =>""", """<a>"""),
("""<a = >""", """<a>"""),
("""<a b= "c" >""", """<a b="c">"""),
("""<a b ="c" >""", """<a b="c">"""),
("""<a b = "c" >""", """<a b="c">"""),
("""<a >""", """<a>"""),
("""< a>""", """<a>"""),
("""< a >""", """<a>"""),
("""<>""", """<>"""),
("""< >""", """< >"""),
# reduce test
("""<a b="c"><""", """<a b="c"><"""),
("""d>""", """d>"""),
# numbers in tag
("""<h1>bla</h1>""", """<h1>bla</h1>"""),
# more start tags
("""<a b=c"><a b="c">""", """<a b="c"><a b="c">"""),
("""<a b="c><a b="c">""", """<a b="c><a b=" c>"""),
("""<a b=/c/></a><br>""", """<a b="/c/"></a><br>"""),
("""<br/>""", """<br>"""),
("""<a b="50%"><br>""", """<a b="50%"><br>"""),
# comments
("""<!---->""", """<!---->"""),
("""<!-- a - b -->< br>""", """<!-- a - b --><br>"""),
("""<!----->""", """<!----->"""),
("""<!------>""", """<!------>"""),
("""<!------->""", """<!------->"""),
("""<!---- >""", """<!----->"""),
("""<!-- -->""", """<!-- -->"""),
("""<!-- -- >""", """<!-- --->"""),
("""<!---- />-->""", """<!---- />-->"""),
# end tags
("""</a>""", """</a>"""),
("""</ a>""", """</a>"""),
("""</ a >""", """</a>"""),
("""</a >""", """</a>"""),
("""< / a>""", """</a>"""),
("""< /a>""", """</a>"""),
# missing > in end tag
("""</td <td a="b" >""", """</td><td a="b">"""),
# start and end tag
("""<a/>""", """<a></a>"""),
# declaration tags
("""<!DOCtype adrbook SYSTEM "adrbook.dtd">""", """<!DOCTYPE adrbook SYSTEM "adrbook.dtd">"""),
# misc
("""<?xmL version="1.0" encoding="latin1"?>""", """<?xmL version="1.0" encoding="latin1"?>"""),
# javascript
("""<script >\n</script>""", """<script>\n</script>"""),
("""<sCrIpt lang="a">bla </a> fasel</scripT>""", """<script lang="a">bla </a> fasel</script>"""),
# line continuation (Dr. Fun webpage)
("<img bo\\\nrder=0 >", """<img bo rder="0">"""),
# href with $
("""<a href="123$456">""", """<a href="123$456">"""),
# quoting
("""<a href=/ >""", """<a href="/">"""),
("""<a href= />""", """<a href="/">"""),
("""<a href= >""", """<a href="">"""),
("""<a href="'" >""", """<a href="'">"""),
("""<a href='"' >""", """<a href="&quot;">"""),
("""<a href="bla" %]" >""", """<a href="bla">"""),
("""<a href=bla" >""", """<a href="bla">"""),
("""<a onmouseover=MM_swapImage('nav1','','/images/dwnavpoint_over.gif',1);movein(this); b="c">""",
"""<a onmouseover="MM_swapImage('nav1','','/images/dwnavpoint_over.gif',1);movein(this);" b="c">"""),
("""<a onClick=location.href('/index.htm') b="c">""",
"""<a onclick="location.href('/index.htm')" b="c">"""),
# entities
("""<a href="&#109;ailto:" >""", """<a href="mailto:">"""),
# non-ascii characters
("""<Üzgür> fahr </langsamer> ¹²³¼½¬{""",
"""<Üzgür> fahr </langsamer> ¹²³¼½¬{"""),
]
# list of tuples (<test pattern>, <expected output after flush>);
# all patterns are incomplete syntax elements
flushtests = [
("<", "<"),
("<a", "<a"),
("<!a", "<!a"),
("<?a", "<?a"),
]
class TestParser (unittest.TestCase):
    """feed the test patterns to the HTML parser and compare what the
    HtmlPrettyPrinter handler wrote with the expected output"""

    def setUp (self):
        """create two independent parser objects"""
        self.htmlparser = bk.HtmlParser.htmlsax.parser()
        self.htmlparser2 = bk.HtmlParser.htmlsax.parser()

    def _attach_printer (self, parser):
        """give parser a fresh pretty printing handler and return the
        buffer the handler writes to"""
        out = StringIO.StringIO()
        parser.handler = bk.HtmlParser.htmllib.HtmlPrettyPrinter(out)
        return out

    def test_parse (self):
        """feed each pattern in one piece"""
        for _in, _out in parsetests:
            out = self._attach_printer(self.htmlparser)
            self.htmlparser.feed(_in)
            self.htmlparser.flush()
            self.assertEqual(out.getvalue(), _out)
            self.htmlparser.reset()

    def test_feed (self):
        """feed each pattern character by character"""
        for _in, _out in parsetests:
            out = self._attach_printer(self.htmlparser)
            for c in _in:
                self.htmlparser.feed(c)
            self.htmlparser.flush()
            self.assertEqual(out.getvalue(), _out)
            self.htmlparser.reset()

    def test_interwoven (self):
        """feed two parsers alternately, character by character"""
        for _in, _out in parsetests:
            out = self._attach_printer(self.htmlparser)
            out2 = self._attach_printer(self.htmlparser2)
            for c in _in:
                self.htmlparser.feed(c)
                self.htmlparser2.feed(c)
            self.htmlparser.flush()
            self.htmlparser2.flush()
            self.assertEqual(out.getvalue(), _out)
            self.assertEqual(out2.getvalue(), _out)
            # reset both parsers, so neither of them carries state
            # over into the next pattern
            self.htmlparser.reset()
            self.htmlparser2.reset()

    def test_flush (self):
        """incomplete input is written out by flush()"""
        for _in, _out in flushtests:
            out = self._attach_printer(self.htmlparser)
            self.htmlparser.feed(_in)
            self.htmlparser.flush()
            self.assertEqual(out.getvalue(), _out)
            self.htmlparser.reset()

    def test_entities (self):
        """numeric character references of a-z resolve to the letters"""
        for c in "abcdefghijklmnopqrstuvwxyz":
            self.assertEqual(bk.HtmlParser.resolve_entities("&#%d;"%ord(c)), c)
def test_suite ():
    """build a suite holding the parser tests"""
    parser_tests = unittest.makeSuite(TestParser)
    suite = unittest.TestSuite()
    suite.addTest(parser_tests)
    return suite
if __name__ == '__main__':
unittest.main()