mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-20 22:31:00 +00:00
added
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1358 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
916f96cc0d
commit
c635234ee6
23 changed files with 17346 additions and 0 deletions
1
bk/HtmlParser/.cvsignore
Normal file
1
bk/HtmlParser/.cvsignore
Normal file
|
|
@ -0,0 +1 @@
|
|||
htmlparse.output
|
||||
24
bk/HtmlParser/Makefile
Normal file
24
bk/HtmlParser/Makefile
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
# Build rules for the generated scanner/parser sources of HtmlParser.
#
# this parser needs flex >= 2.5.xx from http://lex.sf.net/
# for reentrant bison parser support!
FLEX=flex
PYVER=2.3
PYTHON=python$(PYVER)

# These targets do not create a file of the same name; declaring them
# phony keeps them working even if such a file appears in the directory.
.PHONY: all clean splint

# Generate the C sources from the flex and bison inputs.
all: htmllex.c htmlparse.c

# Compile any C file to a position-independent object.
%.o: %.c
	gcc -g -O3 -Wall -pedantic -Wstrict-prototypes -fPIC -I. -I/usr/include/$(PYTHON) -c $< -o $@

# bison writes both the parser source and the token header.
htmlparse.h htmlparse.c: htmlparse.y htmlsax.h
	bison htmlparse.y

# The scanner uses the token definitions from the generated header.
htmllex.l: htmlparse.h

htmllex.c: htmllex.l htmlsax.h
	$(FLEX) htmllex.l

# Remove all generated files.
clean:
	rm -f htmlparse.c htmlparse.h htmllex.c *.o *.so *.pyc *.pyo *.output

# Run the splint static checker on the generated scanner.
splint:
	splint -initallelements +posixlib -I/usr/include/linux -I. -I/usr/include/$(PYTHON) htmllex.c | less
|
||||
115
bk/HtmlParser/__init__.py
Normal file
115
bk/HtmlParser/__init__.py
Normal file
|
|
@ -0,0 +1,115 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2000-2004 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
"""Fast HTML parser module written in C with the following features:
|
||||
|
||||
1. Reentrant
|
||||
|
||||
As soon as any HTML string data is available, we try to feed it
|
||||
to the HTML parser. This means that the parser has to scan possibly
|
||||
incomplete data, recognizing as much as it can. Incomplete trailing
|
||||
data is saved for subsequent calls, or it is just flushed into the
|
||||
output buffer with the flush() function.
|
||||
A reset() brings the parser back to its initial state, throwing away all
|
||||
buffered data.
|
||||
|
||||
2. Coping with HTML syntax errors
|
||||
|
||||
The parser recognizes as much as it can and passes the rest
|
||||
of the data as TEXT tokens.
|
||||
The scanner only passes complete recognized HTML syntax elements to
|
||||
the parser. Invalid syntax elements are passed as TEXT. This way we do
|
||||
not need the bison error recovery.
|
||||
Incomplete data is rescanned the next time the parser calls yylex() or
|
||||
when it is being flush()ed.
|
||||
|
||||
The following syntax errors will be recognized correctly:
|
||||
|
||||
a) missing quotes around attribute values
|
||||
b) "</...>" end tags in script mode
|
||||
c) missing ">" in tags
|
||||
d) invalid tag names
|
||||
e) invalid characters inside tags or tag attributes
|
||||
|
||||
Additionally the parser has the following features:
|
||||
|
||||
a) NULL bytes are changed into spaces
|
||||
b) <!-- ... --> inside a <script> or <style> are not treated as
|
||||
comments but as DATA
|
||||
|
||||
3. Speed
|
||||
|
||||
The FLEX code has options to generate a large but fast scanner.
|
||||
The parser ignores forbidden or unnecessary HTML end tags.
|
||||
The parser converts tag and attribute names to lower case for easier
|
||||
matching.
|
||||
The parser quotes all attribute values.
|
||||
Python memory management interface is used.
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
import htmlentitydefs
|
||||
|
||||
|
||||
def _resolve_ascii_entity (mo):
|
||||
"""Helper function for resolve_entities to resolve one &#XXX;
|
||||
entity if it is an ASCII character. Else leave as is.
|
||||
Input is a match object with a "num" group matched.
|
||||
"""
|
||||
# convert to number
|
||||
ent = mo.group()
|
||||
num = mo.group("num")
|
||||
if ent.startswith('&#x'):
|
||||
radix = 16
|
||||
else:
|
||||
radix = 10
|
||||
num = int(num, radix)
|
||||
# check 7-bit ASCII char range
|
||||
if 0<=num<=127:
|
||||
return chr(num)
|
||||
# not in range
|
||||
return ent
|
||||
|
||||
|
||||
def resolve_ascii_entities (s):
|
||||
"""resolve entities in 7-bit ASCII range to eliminate obfuscation"""
|
||||
return re.sub(r'(?i)&#x?(?P<num>\d+);', _resolve_ascii_entity, s)
|
||||
|
||||
|
||||
def _resolve_html_entity (mo):
|
||||
"""resolve html entity, helper function for resolve_html_entities"""
|
||||
return htmlentitydefs.entitydefs.get(mo.group("entity"), mo.group())
|
||||
|
||||
|
||||
def resolve_html_entities (s):
    """Replace the named HTML entities in s and return the result.

    Names that _resolve_html_entity does not know are left as they are.
    """
    entity_ro = re.compile(r'(?i)&(?P<entity>[a-z]+);')
    return entity_ro.sub(_resolve_html_entity, s)
|
||||
|
||||
|
||||
def resolve_entities (s):
    """Resolve named HTML entities first, then numeric 7-bit ASCII
    entities, in s and return the result."""
    named_resolved = resolve_html_entities(s)
    return resolve_ascii_entities(named_resolved)
|
||||
|
||||
|
||||
def strip_quotes (s):
    """Return s without one enclosing pair of single or double quotes.

    Strings shorter than two characters, or whose first and last
    character are not the same quote character, are returned unchanged.
    """
    if len(s) < 2:
        return s
    for quote in ("'", '"'):
        if s.startswith(quote) and s.endswith(quote):
            return s[1:-1]
    return s
|
||||
|
||||
12076
bk/HtmlParser/htmllex.c
Normal file
12076
bk/HtmlParser/htmllex.c
Normal file
File diff suppressed because it is too large
Load diff
1033
bk/HtmlParser/htmllex.l
Normal file
1033
bk/HtmlParser/htmllex.l
Normal file
File diff suppressed because it is too large
Load diff
100
bk/HtmlParser/htmllib.py
Normal file
100
bk/HtmlParser/htmllib.py
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""Default handler classes"""
|
||||
# Copyright (C) 2000-2004 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
import sys
|
||||
|
||||
|
||||
class HtmlPrinter (object):
    """Handler that prints the name and arguments of every callback.

    Any attribute that is not defined on the class is treated as a
    callback name: looking it up yields a function that writes
    "<name> <args tuple>" followed by a newline to the output file.
    """

    def __init__ (self, fd=sys.stdout):
        """write to given file object"""
        self.fd = fd

    def _print_call (self, name, attrs):
        """write one line with the callback name and its argument tuple"""
        self.fd.write("%s %s\n" % (name, attrs))

    def _print (self, *attrs):
        """print function attributes under the most recently looked up
        callback name"""
        self._print_call(self.mem, attrs)

    def _errorfun (self, msg, name):
        """print msg to stderr with name prefix"""
        sys.stderr.write("%s %s\n" % (name, msg))

    def error (self, msg):
        """signal a filter/parser error"""
        self._errorfun(msg, "error:")

    def warning (self, msg):
        """signal a filter/parser warning"""
        self._errorfun(msg, "warning:")

    def fatalError (self, msg):
        """signal a fatal filter/parser error"""
        self._errorfun(msg, "fatal error:")

    def __getattr__ (self, name):
        """Return a printer bound to the looked up callback name.

        Binding the name in a closure keeps a callback that was looked
        up earlier from printing the name of a later lookup. Special
        (double underscore) names raise AttributeError so that protocol
        probes do not mistake this object for something it is not.
        """
        if name.startswith("__") and name.endswith("__"):
            raise AttributeError(name)
        # still remember the name for users of _print()
        self.mem = name
        def handler (*attrs):
            self._print_call(name, attrs)
        return handler
|
||||
|
||||
|
||||
class HtmlPrettyPrinter (object):
    """Handler that writes the parsed HTML data back out as markup."""

    def __init__ (self, fd=sys.stdout):
        """remember the file object all output is written to"""
        self.fd = fd

    def comment (self, data):
        """write data as an HTML comment"""
        markup = "<!--%s-->" % data
        self.fd.write(markup)

    def startElement (self, tag, attrs):
        """write a start tag; any "/" is removed from the tag name and
        attributes whose value is None are written without a value"""
        emit = self.fd.write
        emit("<%s"%tag.replace("/", ""))
        for name, value in attrs.iteritems():
            if value is not None:
                emit(" %s=\"%s\"" % (name, quote_attrval(value)))
            else:
                emit(" %s"%name)
        emit(">")

    def endElement (self, tag):
        """write an end tag"""
        markup = "</%s>" % tag
        self.fd.write(markup)

    def doctype (self, data):
        """write a document type declaration"""
        markup = "<!DOCTYPE%s>" % data
        self.fd.write(markup)

    def pi (self, data):
        """write a processing instruction"""
        markup = "<?%s?>" % data
        self.fd.write(markup)

    def cdata (self, data):
        """write a CDATA section"""
        markup = "<![CDATA[%s]]>"%data
        self.fd.write(markup)

    def characters (self, data):
        """write character data unchanged"""
        emit = self.fd.write
        emit(data)
|
||||
|
||||
|
||||
def quote_attrval (val):
    """Quote a HTML attribute value to be able to wrap it in double quotes.

    Every double quote character is replaced with the &quot; entity so
    the value cannot terminate the surrounding attribute quotes.
    """
    return val.replace('"', '&quot;')
|
||||
|
||||
2045
bk/HtmlParser/htmlparse.c
Normal file
2045
bk/HtmlParser/htmlparse.c
Normal file
File diff suppressed because it is too large
Load diff
72
bk/HtmlParser/htmlparse.h
Normal file
72
bk/HtmlParser/htmlparse.h
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
/* A Bison parser, made by GNU Bison 1.875a. */
|
||||
|
||||
/* Skeleton parser for Yacc-like parsing with Bison,
|
||||
Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2, or (at your option)
|
||||
any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
Boston, MA 02111-1307, USA. */
|
||||
|
||||
/* As a special exception, when this file is copied by Bison into a
|
||||
Bison output file, you may use that output file without restriction.
|
||||
This special exception was added by the Free Software Foundation
|
||||
in version 1.24 of Bison. */
|
||||
|
||||
/* Tokens. */
|
||||
#ifndef YYTOKENTYPE
|
||||
# define YYTOKENTYPE
|
||||
/* Put the tokens into the symbol table, so that GDB and other debuggers
|
||||
know about them. */
|
||||
enum yytokentype {
|
||||
T_WAIT = 258,
|
||||
T_ERROR = 259,
|
||||
T_TEXT = 260,
|
||||
T_ELEMENT_START = 261,
|
||||
T_ELEMENT_START_END = 262,
|
||||
T_ELEMENT_END = 263,
|
||||
T_SCRIPT = 264,
|
||||
T_STYLE = 265,
|
||||
T_PI = 266,
|
||||
T_COMMENT = 267,
|
||||
T_CDATA = 268,
|
||||
T_DOCTYPE = 269
|
||||
};
|
||||
#endif
|
||||
#define T_WAIT 258
|
||||
#define T_ERROR 259
|
||||
#define T_TEXT 260
|
||||
#define T_ELEMENT_START 261
|
||||
#define T_ELEMENT_START_END 262
|
||||
#define T_ELEMENT_END 263
|
||||
#define T_SCRIPT 264
|
||||
#define T_STYLE 265
|
||||
#define T_PI 266
|
||||
#define T_COMMENT 267
|
||||
#define T_CDATA 268
|
||||
#define T_DOCTYPE 269
|
||||
|
||||
|
||||
|
||||
|
||||
#if ! defined (YYSTYPE) && ! defined (YYSTYPE_IS_DECLARED)
|
||||
typedef int YYSTYPE;
|
||||
# define yystype YYSTYPE /* obsolescent; will be withdrawn */
|
||||
# define YYSTYPE_IS_DECLARED 1
|
||||
# define YYSTYPE_IS_TRIVIAL 1
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
840
bk/HtmlParser/htmlparse.y
Normal file
840
bk/HtmlParser/htmlparse.y
Normal file
|
|
@ -0,0 +1,840 @@
|
|||
%{
|
||||
/* Copyright (C) 2000-2004 Bastian Kleineidam
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*/
|
||||
/* Python module definition of a SAX html parser */
|
||||
#include "htmlsax.h"
|
||||
#include "structmember.h"
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/* bison type definitions */
|
||||
#define YYSTYPE PyObject*
|
||||
#define YYPARSE_PARAM scanner
|
||||
#define YYLEX_PARAM scanner
|
||||
/* extern functions found in htmllex.l */
|
||||
extern int yylex(YYSTYPE* yylvalp, void* scanner);
|
||||
extern int htmllexInit (void** scanner, UserData* data);
|
||||
extern int htmllexDebug (void** scanner, int debug);
|
||||
extern int htmllexStart (void* scanner, UserData* data, const char* s, int slen);
|
||||
extern int htmllexStop (void* scanner, UserData* data);
|
||||
extern int htmllexDestroy (void* scanner);
|
||||
extern void* yyget_extra(void*);
|
||||
extern int yyget_lineno(void*);
|
||||
#define YYERROR_VERBOSE 1
|
||||
|
||||
/* standard error reporting, indicating an internal error.
   Writes msg to stderr with a "htmlsax:" prefix and always returns 0.
   The message is only read, so the parameter is const-qualified; this
   also accepts string literals without a qualifier-discarding conversion. */
static int yyerror (const char* msg) {
    fprintf(stderr, "htmlsax: internal parse error: %s\n", msg);
    return 0;
}
|
||||
|
||||
/* parser.resolve_entities */
|
||||
static PyObject* resolve_entities;
|
||||
static PyObject* list_dict;
|
||||
|
||||
/* macros for easier scanner state manipulation */
|
||||
|
||||
/* Evaluates to true when tag is NOT one of the elements listed below,
   i.e. when an endElement callback is to be issued for it. Note that,
   despite the macro name, the result is false for the listed elements.
   tag must be a NUL-terminated C string; it is evaluated several times,
   so do not pass an expression with side effects. */
#define NO_HTML_END_TAG(tag) (!(strcmp((tag), "area")==0 || \
    strcmp((tag), "base")==0 || \
    strcmp((tag), "basefont")==0 || \
    strcmp((tag), "br")==0 || \
    strcmp((tag), "col")==0 || \
    strcmp((tag), "frame")==0 || \
    strcmp((tag), "hr")==0 || \
    strcmp((tag), "img")==0 || \
    strcmp((tag), "input")==0 || \
    strcmp((tag), "isindex")==0 || \
    strcmp((tag), "link")==0 || \
    strcmp((tag), "meta")==0 || \
    strcmp((tag), "param")==0))
|
||||
|
||||
/* clear buffer b: shrink it to one byte holding the empty string.
   On allocation failure b keeps its old (still valid) memory, a
   MemoryError is set and the enclosing function returns NULL.
   The resize goes through a temporary because assigning the result of a
   failed reallocation directly to b would lose the old block. */
#define CLEAR_BUF(b) \
    do { \
        char* clear_buf_new_ = (char*) PyMem_Realloc((b), 1); \
        if (clear_buf_new_==NULL) return PyErr_NoMemory(); \
        (b) = clear_buf_new_; \
        (b)[0] = '\0'; \
    } while (0)
|
||||
|
||||
/* clear buffer b: shrink it to one byte holding the empty string.
   On allocation failure b keeps its old (still valid) memory, self is
   decref'ed, a MemoryError is set and the enclosing function returns
   NULL. The resize goes through a temporary because assigning the result
   of a failed reallocation directly to b would lose the old block. */
#define CLEAR_BUF_DECREF(self, b) \
    do { \
        char* clear_buf_new_ = (char*) PyMem_Realloc((b), 1); \
        if (clear_buf_new_==NULL) { Py_DECREF(self); return PyErr_NoMemory(); } \
        (b) = clear_buf_new_; \
        (b)[0] = '\0'; \
    } while (0)
|
||||
|
||||
/* If ud->error is set and the handler has an "error" attribute, call
   handler.error(ud->error). On failure set error=1 and jump to label.
   Uses the local variables callback, result and error of the enclosing
   action; callback and result are NOT released here, the code after
   label is expected to decref them.
   Wrapped in do/while so that the macro behaves as a single statement. */
#define CHECK_ERROR(ud, label) \
    do { \
        if ((ud)->error && PyObject_HasAttrString((ud)->handler, "error")==1) { \
            callback = PyObject_GetAttrString((ud)->handler, "error"); \
            if (!callback) { error=1; goto label; } \
            result = PyObject_CallFunction(callback, "O", (ud)->error); \
            if (!result) { error=1; goto label; } \
        } \
    } while (0)
|
||||
|
||||
/* generic callback macro: if the handler has an attribute named attr,
   call it with the single argument arg converted per format. On failure
   set error=1 and jump to label; on success release callback and result
   and reset both to NULL.
   Uses the local variables callback, result and error of the enclosing
   action. Wrapped in do/while so that the macro behaves as a single
   statement. */
#define CALLBACK(ud, attr, format, arg, label) \
    do { \
        if (PyObject_HasAttrString((ud)->handler, attr)==1) { \
            callback = PyObject_GetAttrString((ud)->handler, attr); \
            if (callback==NULL) { error=1; goto label; } \
            result = PyObject_CallFunction(callback, format, arg); \
            if (result==NULL) { error=1; goto label; } \
            Py_DECREF(callback); \
            Py_DECREF(result); \
            callback=result=NULL; \
        } \
    } while (0)
|
||||
|
||||
/* set old line and column: copy the current position of the local
   variable ud into its last_lineno/last_column fields.
   Wrapped in do/while so both assignments stay together when the macro
   is used as the body of an unbraced if or loop. */
#define SET_OLD_LINECOL \
    do { \
        ud->last_lineno = ud->lineno; \
        ud->last_column = ud->column; \
    } while (0)
|
||||
|
||||
/* parser type definition: instance layout of the Python parser object */
typedef struct {
    PyObject_HEAD
    /* object whose methods are called back; Py_None until one is given */
    PyObject* handler;
    /* parser state that is passed to the scanner functions */
    UserData* userData;
    /* opaque scanner handle filled in by htmllexInit() */
    void* scanner;
} parser_object;
|
||||
|
||||
staticforward PyTypeObject parser_type;
|
||||
|
||||
/* use Pythons memory management */
|
||||
#define malloc PyMem_Malloc
|
||||
#define realloc PyMem_Realloc
|
||||
#define free PyMem_Free
|
||||
|
||||
%}
|
||||
|
||||
/* parser options */
|
||||
%verbose
|
||||
%debug
|
||||
%defines
|
||||
%output="htmlparse.c"
|
||||
%pure_parser
|
||||
|
||||
/* parser tokens */
|
||||
%token T_WAIT
|
||||
%token T_ERROR
|
||||
%token T_TEXT
|
||||
%token T_ELEMENT_START
|
||||
%token T_ELEMENT_START_END
|
||||
%token T_ELEMENT_END
|
||||
%token T_SCRIPT
|
||||
%token T_STYLE
|
||||
%token T_PI
|
||||
%token T_COMMENT
|
||||
%token T_CDATA
|
||||
%token T_DOCTYPE
|
||||
|
||||
/* the finish_ labels are for error recovery */
|
||||
%%
|
||||
|
||||
/* a document is a non-empty sequence of elements */
elements: element {}
    | elements element {}
    ;

/* Every alternative handles one scanner token. The actions call the
   matching handler method (if the handler has one), then report a pending
   ud->error via CHECK_ERROR. At the finish_ label all temporary objects
   are released; on error the pending Python exception is stored in the
   user data and parsing is aborted. */
element: T_WAIT { YYACCEPT; /* wait for more lexer input */ }
    | T_ERROR
    {
    /* an error occurred in the scanner, the python exception must be set */
    UserData* ud = yyget_extra(scanner);
    /* keep the exception in the user data */
    PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
    YYABORT;
    }
    | T_ELEMENT_START
    {
    /* $1 is a PyTuple (<tag>, <attrs>)
       <tag> is a PyString, <attrs> is a PyDict */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
    /* NOTE(review): PyTuple_GET_ITEM yields borrowed references, yet tag
       and attrs are decref'ed below in addition to $1 -- presumably the
       scanner hands over extra references; verify against htmllex.l */
    PyObject* tag = PyTuple_GET_ITEM($1, 0);
    PyObject* attrs = PyTuple_GET_ITEM($1, 1);
    int error = 0;
    if (!tag || !attrs) { error = 1; goto finish_start; }
    /* handler.startElement(tag, attrs) */
    if (PyObject_HasAttrString(ud->handler, "startElement")==1) {
        callback = PyObject_GetAttrString(ud->handler, "startElement");
        if (!callback) { error=1; goto finish_start; }
        result = PyObject_CallFunction(callback, "OO", tag, attrs);
        if (!result) { error=1; goto finish_start; }
        Py_DECREF(callback);
        Py_DECREF(result);
        callback=result=NULL;
    }
    CHECK_ERROR(ud, finish_start);
finish_start:
    Py_XDECREF(ud->error);
    ud->error = NULL;
    Py_XDECREF(callback);
    Py_XDECREF(result);
    Py_XDECREF(tag);
    Py_XDECREF(attrs);
    Py_DECREF($1);
    if (error) {
        PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
        YYABORT;
    }
    SET_OLD_LINECOL;
    }
    | T_ELEMENT_START_END
    {
    /* $1 is a PyTuple (<tag>, <attrs>)
       <tag> is a PyString, <attrs> is a PyDict */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
    /* NOTE(review): same reference handling as in T_ELEMENT_START */
    PyObject* tag = PyTuple_GET_ITEM($1, 0);
    PyObject* attrs = PyTuple_GET_ITEM($1, 1);
    int error = 0;
    char* tagname;
    if (!tag || !attrs) { error = 1; goto finish_start_end; }
    /* handler.startElement(tag, attrs) */
    if (PyObject_HasAttrString(ud->handler, "startElement")==1) {
        callback = PyObject_GetAttrString(ud->handler, "startElement");
        if (!callback) { error=1; goto finish_start_end; }
        result = PyObject_CallFunction(callback, "OO", tag, attrs);
        if (!result) { error=1; goto finish_start_end; }
        Py_DECREF(callback);
        Py_DECREF(result);
        callback=result=NULL;
    }
    tagname = PyString_AS_STRING(tag);
    /* handler.endElement(tag), skipped for the tags that
       NO_HTML_END_TAG excludes */
    if (PyObject_HasAttrString(ud->handler, "endElement")==1 &&
        NO_HTML_END_TAG(tagname)) {
        callback = PyObject_GetAttrString(ud->handler, "endElement");
        if (callback==NULL) { error=1; goto finish_start_end; }
        result = PyObject_CallFunction(callback, "O", tag);
        if (result==NULL) { error=1; goto finish_start_end; }
        Py_DECREF(callback);
        Py_DECREF(result);
        callback=result=NULL;
    }
    CHECK_ERROR(ud, finish_start_end);
finish_start_end:
    Py_XDECREF(ud->error);
    ud->error = NULL;
    Py_XDECREF(callback);
    Py_XDECREF(result);
    Py_XDECREF(tag);
    Py_XDECREF(attrs);
    Py_DECREF($1);
    if (error) {
        PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
        YYABORT;
    }
    SET_OLD_LINECOL;
    }
    | T_ELEMENT_END
    {
    /* $1 is a PyString */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
    int error = 0;
    char* tagname = PyString_AS_STRING($1);
    /* handler.endElement(tag), skipped for the tags that
       NO_HTML_END_TAG excludes */
    if (PyObject_HasAttrString(ud->handler, "endElement")==1 &&
        NO_HTML_END_TAG(tagname)) {
        callback = PyObject_GetAttrString(ud->handler, "endElement");
        if (callback==NULL) { error=1; goto finish_end; }
        result = PyObject_CallFunction(callback, "O", $1);
        if (result==NULL) { error=1; goto finish_end; }
        Py_DECREF(callback);
        Py_DECREF(result);
        callback=result=NULL;
    }
    CHECK_ERROR(ud, finish_end);
finish_end:
    Py_XDECREF(ud->error);
    ud->error = NULL;
    Py_XDECREF(callback);
    Py_XDECREF(result);
    Py_DECREF($1);
    if (error) {
        PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
        YYABORT;
    }
    SET_OLD_LINECOL;
    }
    | T_COMMENT
    {
    /* $1 is a PyString */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
    int error = 0;
    /* handler.comment(data) */
    CALLBACK(ud, "comment", "O", $1, finish_comment);
    CHECK_ERROR(ud, finish_comment);
finish_comment:
    Py_XDECREF(ud->error);
    ud->error = NULL;
    Py_XDECREF(callback);
    Py_XDECREF(result);
    Py_DECREF($1);
    if (error) {
        PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
        YYABORT;
    }
    SET_OLD_LINECOL;
    }
    | T_PI
    {
    /* $1 is a PyString */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
    int error = 0;
    /* handler.pi(data) */
    CALLBACK(ud, "pi", "O", $1, finish_pi);
    CHECK_ERROR(ud, finish_pi);
finish_pi:
    Py_XDECREF(ud->error);
    ud->error = NULL;
    Py_XDECREF(callback);
    Py_XDECREF(result);
    Py_DECREF($1);
    if (error) {
        PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
        YYABORT;
    }
    SET_OLD_LINECOL;
    }
    | T_CDATA
    {
    /* $1 is a PyString */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
    int error = 0;
    /* handler.cdata(data) */
    CALLBACK(ud, "cdata", "O", $1, finish_cdata);
    CHECK_ERROR(ud, finish_cdata);
finish_cdata:
    Py_XDECREF(ud->error);
    ud->error = NULL;
    Py_XDECREF(callback);
    Py_XDECREF(result);
    Py_DECREF($1);
    if (error) {
        PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
        YYABORT;
    }
    SET_OLD_LINECOL;
    }
    | T_DOCTYPE
    {
    /* $1 is a PyString */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
    int error = 0;
    /* handler.doctype(data) */
    CALLBACK(ud, "doctype", "O", $1, finish_doctype);
    CHECK_ERROR(ud, finish_doctype);
finish_doctype:
    Py_XDECREF(ud->error);
    ud->error = NULL;
    Py_XDECREF(callback);
    Py_XDECREF(result);
    Py_DECREF($1);
    if (error) {
        PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
        YYABORT;
    }
    SET_OLD_LINECOL;
    }
    | T_SCRIPT
    {
    /* $1 is a PyString */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
    int error = 0;
    /* handler.characters(data) followed by handler.endElement("script") */
    CALLBACK(ud, "characters", "O", $1, finish_script);
    CALLBACK(ud, "endElement", "s", "script", finish_script);
    CHECK_ERROR(ud, finish_script);
finish_script:
    Py_XDECREF(ud->error);
    ud->error = NULL;
    Py_XDECREF(callback);
    Py_XDECREF(result);
    Py_DECREF($1);
    if (error) {
        PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
        YYABORT;
    }
    SET_OLD_LINECOL;
    }
    | T_STYLE
    {
    /* $1 is a PyString */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
    int error = 0;
    /* handler.characters(data) followed by handler.endElement("style") */
    CALLBACK(ud, "characters", "O", $1, finish_style);
    CALLBACK(ud, "endElement", "s", "style", finish_style);
    CHECK_ERROR(ud, finish_style);
finish_style:
    Py_XDECREF(ud->error);
    ud->error = NULL;
    Py_XDECREF(callback);
    Py_XDECREF(result);
    Py_DECREF($1);
    if (error) {
        PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
        YYABORT;
    }
    SET_OLD_LINECOL;
    }
    | T_TEXT
    {
    /* $1 is a PyString */
    /* Remember this is also called as a lexer error fallback */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
    int error = 0;
    /* handler.characters(data) */
    CALLBACK(ud, "characters", "O", $1, finish_characters);
    CHECK_ERROR(ud, finish_characters);
finish_characters:
    Py_XDECREF(ud->error);
    ud->error = NULL;
    Py_XDECREF(callback);
    Py_XDECREF(result);
    Py_DECREF($1);
    if (error) {
        PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
        YYABORT;
    }
    SET_OLD_LINECOL;
    }
    ;
|
||||
|
||||
%%
|
||||
|
||||
/* disable python memory interface */
|
||||
#undef malloc
|
||||
#undef realloc
|
||||
#undef free
|
||||
|
||||
/* create parser object */
|
||||
static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds) {
|
||||
parser_object* self;
|
||||
if ((self = (parser_object*) type->tp_alloc(type, 0)) == NULL)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
Py_INCREF(Py_None);
|
||||
self->handler = Py_None;
|
||||
/* reset userData */
|
||||
self->userData = PyMem_New(UserData, sizeof(UserData));
|
||||
if (self->userData == NULL)
|
||||
{
|
||||
Py_DECREF(self);
|
||||
return NULL;
|
||||
}
|
||||
self->userData->handler = self->handler;
|
||||
self->userData->buf = NULL;
|
||||
CLEAR_BUF_DECREF(self, self->userData->buf);
|
||||
self->userData->nextpos = 0;
|
||||
self->userData->bufpos = 0;
|
||||
self->userData->pos = 0;
|
||||
self->userData->column = 1;
|
||||
self->userData->last_column = 1;
|
||||
self->userData->lineno = 1;
|
||||
self->userData->last_lineno = 1;
|
||||
self->userData->tmp_buf = NULL;
|
||||
CLEAR_BUF_DECREF(self, self->userData->tmp_buf);
|
||||
self->userData->tmp_tag = self->userData->tmp_attrname =
|
||||
self->userData->tmp_attrval = self->userData->tmp_attrs =
|
||||
self->userData->lexbuf = NULL;
|
||||
self->userData->resolve_entities = resolve_entities;
|
||||
self->userData->list_dict = list_dict;
|
||||
self->userData->exc_type = NULL;
|
||||
self->userData->exc_val = NULL;
|
||||
self->userData->exc_tb = NULL;
|
||||
self->userData->error = NULL;
|
||||
self->scanner = NULL;
|
||||
if (htmllexInit(&(self->scanner), self->userData)!=0)
|
||||
{
|
||||
Py_DECREF(self);
|
||||
return NULL;
|
||||
}
|
||||
return (PyObject*) self;
|
||||
}
|
||||
|
||||
|
||||
/* initialize parser object */
|
||||
static int parser_init (parser_object* self, PyObject* args, PyObject* kwds) {
|
||||
PyObject* handler = NULL;
|
||||
static char *kwlist[] = {"handler", NULL};
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwlist, &handler)) {
|
||||
return -1;
|
||||
}
|
||||
if (handler==NULL) {
|
||||
return 0;
|
||||
}
|
||||
Py_DECREF(self->handler);
|
||||
Py_INCREF(handler);
|
||||
self->handler = handler;
|
||||
self->userData->handler = self->handler;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* traverse all used subobjects participating in reference cycles */
|
||||
static int parser_traverse (parser_object* self, visitproc visit, void* arg) {
|
||||
if (visit(self->handler, arg) < 0) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* clear all used subobjects participating in reference cycles */
|
||||
static int parser_clear (parser_object* self) {
|
||||
Py_XDECREF(self->handler);
|
||||
self->handler = NULL;
|
||||
self->userData->handler = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* free all allocated resources of parser object */
|
||||
static void parser_dealloc (parser_object* self) {
|
||||
htmllexDestroy(self->scanner);
|
||||
parser_clear(self);
|
||||
PyMem_Del(self->userData->buf);
|
||||
PyMem_Del(self->userData->tmp_buf);
|
||||
PyMem_Del(self->userData);
|
||||
self->ob_type->tp_free((PyObject*)self);
|
||||
}
|
||||
|
||||
|
||||
/* feed a chunk of data to the parser */
|
||||
static PyObject* parser_feed (parser_object* self, PyObject* args) {
|
||||
/* set up the parse string */
|
||||
int slen = 0;
|
||||
char* s = NULL;
|
||||
if (!PyArg_ParseTuple(args, "t#", &s, &slen)) {
|
||||
PyErr_SetString(PyExc_TypeError, "string arg required");
|
||||
return NULL;
|
||||
}
|
||||
/* parse */
|
||||
if (htmllexStart(self->scanner, self->userData, s, slen)!=0) {
|
||||
PyErr_SetString(PyExc_MemoryError, "could not start scanner");
|
||||
return NULL;
|
||||
}
|
||||
if (yyparse(self->scanner)!=0) {
|
||||
if (self->userData->exc_type!=NULL) {
|
||||
/* note: we give away these objects, so don't decref */
|
||||
PyErr_Restore(self->userData->exc_type,
|
||||
self->userData->exc_val,
|
||||
self->userData->exc_tb);
|
||||
}
|
||||
htmllexStop(self->scanner, self->userData);
|
||||
return NULL;
|
||||
}
|
||||
if (htmllexStop(self->scanner, self->userData)!=0) {
|
||||
PyErr_SetString(PyExc_MemoryError, "could not stop scanner");
|
||||
return NULL;
|
||||
}
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
|
||||
|
||||
/* flush all parser buffers */
|
||||
static PyObject* parser_flush (parser_object* self, PyObject* args) {
|
||||
int res = 0;
|
||||
if (!PyArg_ParseTuple(args, "")) {
|
||||
PyErr_SetString(PyExc_TypeError, "no args required");
|
||||
return NULL;
|
||||
}
|
||||
/* reset parser variables */
|
||||
CLEAR_BUF(self->userData->tmp_buf);
|
||||
Py_XDECREF(self->userData->tmp_tag);
|
||||
Py_XDECREF(self->userData->tmp_attrs);
|
||||
Py_XDECREF(self->userData->tmp_attrval);
|
||||
Py_XDECREF(self->userData->tmp_attrname);
|
||||
self->userData->tmp_tag = self->userData->tmp_attrs =
|
||||
self->userData->tmp_attrval = self->userData->tmp_attrname = NULL;
|
||||
self->userData->bufpos = 0;
|
||||
if (strlen(self->userData->buf)) {
|
||||
/* XXX set line, col */
|
||||
int error = 0;
|
||||
PyObject* s = PyString_FromString(self->userData->buf);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
/* reset buffer */
|
||||
CLEAR_BUF(self->userData->buf);
|
||||
if (s==NULL) { error=1; goto finish_flush; }
|
||||
if (PyObject_HasAttrString(self->handler, "characters")==1) {
|
||||
callback = PyObject_GetAttrString(self->handler, "characters");
|
||||
if (callback==NULL) { error=1; goto finish_flush; }
|
||||
result = PyObject_CallFunction(callback, "O", s);
|
||||
if (result==NULL) { error=1; goto finish_flush; }
|
||||
}
|
||||
finish_flush:
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(result);
|
||||
Py_XDECREF(s);
|
||||
if (error==1) {
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
if (htmllexDestroy(self->scanner)!=0) {
|
||||
PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data");
|
||||
return NULL;
|
||||
}
|
||||
self->scanner = NULL;
|
||||
if (htmllexInit(&(self->scanner), self->userData)!=0) {
|
||||
PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data");
|
||||
return NULL;
|
||||
}
|
||||
return Py_BuildValue("i", res);
|
||||
}
|
||||
|
||||
|
||||
/* return the current parser line number */
|
||||
static PyObject* parser_lineno (parser_object* self, PyObject* args) {
|
||||
if (!PyArg_ParseTuple(args, "")) {
|
||||
PyErr_SetString(PyExc_TypeError, "no args required");
|
||||
return NULL;
|
||||
}
|
||||
return Py_BuildValue("i", self->userData->lineno);
|
||||
}
|
||||
|
||||
|
||||
/* return the last parser line number */
|
||||
static PyObject* parser_last_lineno (parser_object* self, PyObject* args) {
|
||||
if (!PyArg_ParseTuple(args, "")) {
|
||||
PyErr_SetString(PyExc_TypeError, "no args required");
|
||||
return NULL;
|
||||
}
|
||||
return Py_BuildValue("i", self->userData->last_lineno);
|
||||
}
|
||||
|
||||
|
||||
/* return the current parser column number */
|
||||
static PyObject* parser_column (parser_object* self, PyObject* args) {
|
||||
if (!PyArg_ParseTuple(args, "")) {
|
||||
PyErr_SetString(PyExc_TypeError, "no args required");
|
||||
return NULL;
|
||||
}
|
||||
return Py_BuildValue("i", self->userData->column);
|
||||
}
|
||||
|
||||
|
||||
/* return the last parser column number */
|
||||
static PyObject* parser_last_column (parser_object* self, PyObject* args) {
|
||||
if (!PyArg_ParseTuple(args, "")) {
|
||||
PyErr_SetString(PyExc_TypeError, "no args required");
|
||||
return NULL;
|
||||
}
|
||||
return Py_BuildValue("i", self->userData->last_column);
|
||||
}
|
||||
|
||||
|
||||
/* return the parser position in data stream */
|
||||
static PyObject* parser_pos (parser_object* self, PyObject* args) {
|
||||
if (!PyArg_ParseTuple(args, "")) {
|
||||
PyErr_SetString(PyExc_TypeError, "no args required");
|
||||
return NULL;
|
||||
}
|
||||
return Py_BuildValue("i", self->userData->pos);
|
||||
}
|
||||
|
||||
|
||||
/* reset the parser. This will erase all buffered data! */
|
||||
static PyObject* parser_reset (parser_object* self, PyObject* args) {
|
||||
if (!PyArg_ParseTuple(args, "")) {
|
||||
PyErr_SetString(PyExc_TypeError, "no args required");
|
||||
return NULL;
|
||||
}
|
||||
if (htmllexDestroy(self->scanner)!=0) {
|
||||
PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data");
|
||||
return NULL;
|
||||
}
|
||||
/* reset buffer */
|
||||
CLEAR_BUF(self->userData->buf);
|
||||
CLEAR_BUF(self->userData->tmp_buf);
|
||||
self->userData->bufpos =
|
||||
self->userData->pos =
|
||||
self->userData->nextpos = 0;
|
||||
self->userData->column =
|
||||
self->userData->last_column =
|
||||
self->userData->lineno =
|
||||
self->userData->last_lineno = 1;
|
||||
self->userData->tmp_tag = self->userData->tmp_attrs =
|
||||
self->userData->tmp_attrval = self->userData->tmp_attrname = NULL;
|
||||
self->scanner = NULL;
|
||||
if (htmllexInit(&(self->scanner), self->userData)!=0) {
|
||||
PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data");
|
||||
return NULL;
|
||||
}
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
|
||||
|
||||
/* set the debug level, if its >0, debugging is on, =0 means off */
|
||||
static PyObject* parser_debug (parser_object* self, PyObject* args) {
|
||||
int debug;
|
||||
if (!PyArg_ParseTuple(args, "i", &debug)) {
|
||||
return NULL;
|
||||
}
|
||||
yydebug = debug;
|
||||
debug = htmllexDebug(&(self->scanner), debug);
|
||||
return PyInt_FromLong((long)debug);
|
||||
}
|
||||
|
||||
|
||||
/* Getter of the "handler" attribute: returns a new reference to the
   handler object currently stored in the parser. */
static PyObject* parser_gethandler (parser_object* self, void* closure) {
    PyObject* current = self->handler;
    Py_INCREF(current);
    return current;
}
|
||||
|
||||
/* Setter of the "handler" attribute.
 *
 * Stores a new reference to value in the parser and mirrors the pointer
 * into userData->handler. Deleting the attribute (value == NULL) is
 * refused with a TypeError.
 *
 * Returns 0 on success, -1 with an exception set on failure.
 */
static int parser_sethandler (parser_object* self, PyObject* value, void* closure) {
    PyObject* old;
    if (value == NULL) {
        PyErr_SetString(PyExc_TypeError, "Cannot delete parser handler");
        return -1;
    }
    /* Install the new handler BEFORE releasing the old one: dropping the
       last reference may run arbitrary code (a destructor), which must
       never observe a dangling self->handler. */
    old = self->handler;
    Py_INCREF(value);
    self->handler = value;
    self->userData->handler = value;
    Py_DECREF(old);
    return 0;
}
|
||||
|
||||
/* type interface */
|
||||
|
||||
/* Member table of the parser type (installed as tp_members below);
   it contains nothing but the terminating sentinel entry. */
static PyMemberDef parser_members[] = {
    {NULL} /* Sentinel */
};
|
||||
|
||||
/* Computed attributes of the parser type: "handler" is read through
   parser_gethandler and written through parser_sethandler. */
static PyGetSetDef parser_getset[] = {
    {"handler", (getter)parser_gethandler, (setter)parser_sethandler,
     "handler object", NULL},
    {NULL} /* Sentinel */
};
|
||||
|
||||
/* Method table of the parser type. Every entry is registered as
   METH_VARARGS; the argument-less methods check for an empty argument
   tuple themselves. */
static PyMethodDef parser_methods[] = {
    {"feed", (PyCFunction)parser_feed, METH_VARARGS, "feed data to parse incremental"},
    {"reset", (PyCFunction)parser_reset, METH_VARARGS, "reset the parser (no flushing)"},
    {"flush", (PyCFunction)parser_flush, METH_VARARGS, "flush parser buffers"},
    {"debug", (PyCFunction)parser_debug, METH_VARARGS, "set debug level"},
    {"lineno", (PyCFunction)parser_lineno, METH_VARARGS, "get the current line number"},
    {"last_lineno", (PyCFunction)parser_last_lineno, METH_VARARGS, "get the last line number"},
    {"column", (PyCFunction)parser_column, METH_VARARGS, "get the current column"},
    {"last_column", (PyCFunction)parser_last_column, METH_VARARGS, "get the last column"},
    {"pos", (PyCFunction)parser_pos, METH_VARARGS, "get the current scanner position"},
    {NULL} /* Sentinel */
};
|
||||
|
||||
|
||||
/* Type object of the HTML parser. The initializer is positional, so the
   order of the entries must follow the PyTypeObject layout exactly; the
   trailing comment on each line names the slot being filled. */
static PyTypeObject parser_type = {
    PyObject_HEAD_INIT(NULL)
    0,              /* ob_size */
    "bk.HtmlParser.htmlsax.parser",      /* tp_name */
    sizeof(parser_object), /* tp_basicsize */
    0,              /* tp_itemsize */
    /* methods */
    (destructor)parser_dealloc, /* tp_dealloc */
    0,              /* tp_print */
    0,              /* tp_getattr */
    0,              /* tp_setattr */
    0,              /* tp_compare */
    0,              /* tp_repr */
    0,              /* tp_as_number */
    0,              /* tp_as_sequence */
    0,              /* tp_as_mapping */
    0,              /* tp_hash */
    0,              /* tp_call */
    0,              /* tp_str */
    0,              /* tp_getattro */
    0,              /* tp_setattro */
    0,              /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
      Py_TPFLAGS_HAVE_GC, /* tp_flags */
    "HTML parser object", /* tp_doc */
    (traverseproc)parser_traverse, /* tp_traverse */
    (inquiry)parser_clear, /* tp_clear */
    0,              /* tp_richcompare */
    0,              /* tp_weaklistoffset */
    0,              /* tp_iter */
    0,              /* tp_iternext */
    parser_methods, /* tp_methods */
    parser_members, /* tp_members */
    parser_getset,  /* tp_getset */
    0,              /* tp_base */
    0,              /* tp_dict */
    0,              /* tp_descr_get */
    0,              /* tp_descr_set */
    0,              /* tp_dictoffset */
    (initproc)parser_init,  /* tp_init */
    0,              /* tp_alloc */
    parser_new,     /* tp_new */
};
|
||||
|
||||
|
||||
/* python module interface
|
||||
"Create a new HTML parser object with handler (which may be None).\n"
|
||||
"\n"
|
||||
"Used callbacks (they don't have to be defined) of a handler are:\n"
|
||||
"comment(data): <!--data-->\n"
|
||||
"startElement(tag, attrs): <tag {attr1:value1,attr2:value2,..}>\n"
|
||||
"endElement(tag): </tag>\n"
|
||||
"doctype(data): <!DOCTYPE data?>\n"
|
||||
"pi(name, data=None): <?name data?>\n"
|
||||
"cdata(data): <![CDATA[data]]>\n"
|
||||
"characters(data): data\n"
|
||||
"\n"
|
||||
"Additionally, there are error and warning callbacks:\n"
|
||||
"error(msg)\n"
|
||||
"warning(msg)\n"
|
||||
"fatalError(msg)\n"},
|
||||
|
||||
*/
|
||||
|
||||
/* Module-level function table handed to Py_InitModule3(); the module
   exports no functions, so only the sentinel entry is present. */
static PyMethodDef htmlsax_methods[] = {
    {NULL} /* Sentinel */
};
|
||||
|
||||
|
||||
#ifndef PyMODINIT_FUNC /* declarations for DLL import/export */
|
||||
#define PyMODINIT_FUNC void
|
||||
#endif
|
||||
/* initialization of the htmlsax module */
|
||||
PyMODINIT_FUNC inithtmlsax (void) {
|
||||
PyObject* m;
|
||||
if (PyType_Ready(&parser_type) < 0) {
|
||||
return;
|
||||
}
|
||||
if ((m = Py_InitModule3("htmlsax", htmlsax_methods, "SAX HTML parser routines"))==NULL) {
|
||||
return;
|
||||
}
|
||||
Py_INCREF(&parser_type);
|
||||
if (PyModule_AddObject(m, "parser", (PyObject *)&parser_type)==-1) {
|
||||
/* init error */
|
||||
PyErr_Print();
|
||||
}
|
||||
if ((m = PyImport_ImportModule("bk.HtmlParser"))==NULL) {
|
||||
return;
|
||||
}
|
||||
if ((resolve_entities = PyObject_GetAttrString(m, "resolve_entities"))==NULL) {
|
||||
return;
|
||||
}
|
||||
if ((m = PyImport_ImportModule("bk.containers"))==NULL) {
|
||||
return;
|
||||
}
|
||||
if ((list_dict = PyObject_GetAttrString(m, "ListDict"))==NULL) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
83
bk/HtmlParser/htmlsax.h
Normal file
83
bk/HtmlParser/htmlsax.h
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
/* Copyright (C) 2000-2004 Bastian Kleineidam
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*/
|
||||
#ifndef HTMLSAX_H
|
||||
#define HTMLSAX_H
|
||||
|
||||
#include "Python.h"
|
||||
|
||||
/* require Python >= 2.3 */
|
||||
#ifndef PY_VERSION_HEX
|
||||
#error please install Python >= 2.3
|
||||
#endif
|
||||
|
||||
#if PY_VERSION_HEX < 0x02030000
|
||||
#error please install Python >= 2.3
|
||||
#endif
|
||||
|
||||
/* this will be in Python 2.4 */
|
||||
#ifndef Py_RETURN_NONE
|
||||
#define Py_RETURN_NONE do {Py_INCREF(Py_None); return Py_None;} while (0)
|
||||
#endif
|
||||
|
||||
/* user_data type for SAX calls: the state shared between the Python
   parser object, the scanner and the grammar actions */
typedef struct {
    /* the Python SAX object to issue callbacks */
    PyObject* handler;
    /* Buffer to store still-to-be-scanned characters. After recognizing
     * a complete syntax element, all data up to bufpos will be removed.
     * Before scanning you should append new data to this buffer.
     */
    char* buf;
    /* current position in the buffer counting from zero */
    unsigned int bufpos;
    /* current position of next syntax element */
    unsigned int nextpos;
    /* position in the stream of data already seen, counting from zero */
    unsigned int pos;
    /* line counter, counting from one */
    unsigned int lineno;
    /* last value of line counter */
    unsigned int last_lineno;
    /* column counter, counting from zero
       NOTE(review): parser_reset() sets this to 1, not 0 - confirm which
       base is intended */
    unsigned int column;
    /* last value of column counter */
    unsigned int last_column;
    /* input buffer of lexer, must be deleted when the parsing stops */
    void* lexbuf;
    /* temporary character buffer */
    char* tmp_buf;
    /* temporary HTML start or end tag name */
    PyObject* tmp_tag;
    /* temporary HTML start tag attribute name */
    PyObject* tmp_attrname;
    /* temporary HTML start tag attribute value */
    PyObject* tmp_attrval;
    /* temporary HTML start tag attribute list (presumably an instance of
       the list_dict class below; this comment formerly said SortedDict) */
    PyObject* tmp_attrs;
    /* parser.resolve_entities */
    PyObject* resolve_entities;
    /* ordered dictionary class; the module init looks up
       bk.containers.ListDict (this comment formerly said
       parser.SortedDict) - TODO confirm this field receives it */
    PyObject* list_dict;
    /* stored Python exception (if error occurred in scanner) */
    PyObject* exc_type;
    PyObject* exc_val;
    PyObject* exc_tb;
    /* error string */
    PyObject* error;
} UserData;
|
||||
|
||||
#endif
|
||||
54
bk/HtmlParser/s_util.c
Normal file
54
bk/HtmlParser/s_util.c
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
/*
|
||||
* linux/lib/string.c
|
||||
*
|
||||
* Copyright (C) 1991, 1992 Linus Torvalds
|
||||
*/
|
||||
#include <string.h>
|
||||
|
||||
#if !defined(HAVE_STRLCPY)
|
||||
/**
 * strlcpy - Copy a %NUL terminated string into a sized buffer
 * @dst: Where to copy the string to
 * @src: Where to copy the string from
 * @count: size of destination buffer
 *
 * At most @count-1 characters are copied and the result is always
 * %NUL terminated, unless @count is zero, in which case @dst is not
 * touched at all. Unlike strncpy() the remainder of the buffer is not
 * padded.
 *
 * Returns strlen(@src); a return value >= @count means the copy was
 * truncated.
 */
size_t strlcpy (char *dst, const char *src, size_t count)
{
    const size_t srclen = strlen(src);

    if (count != 0) {
        size_t ncopy = srclen;
        if (ncopy >= count) {
            /* leave room for the terminator */
            ncopy = count - 1;
        }
        memcpy(dst, src, ncopy);
        dst[ncopy] = '\0';
    }
    return srclen;
}
|
||||
#endif /* !HAVE_STRLCPY */
|
||||
|
||||
#if !defined(HAVE_STRLCAT)
|
||||
/**
 * strlcat - Append a length-limited, %NUL-terminated string to another
 * @dest: The string to be appended to
 * @src: The string to append to it
 * @count: The size of the destination buffer.
 *
 * Appends as much of @src as fits and always %NUL terminates the
 * result. Returns the length of the string it tried to create,
 * strlen(@dest) + strlen(@src); a return value >= @count means the
 * result was truncated.
 *
 * If @dest holds no %NUL within its first @count bytes (this includes
 * @count == 0) nothing is written and @count + strlen(@src) is
 * returned, as the BSD implementation does. The previous code computed
 * count - strlen(dest) unconditionally, which wraps around for such
 * input and made memcpy() write far beyond the buffer.
 */
size_t strlcat (char *dest, const char *src, size_t count)
{
    size_t dsize = 0;
    size_t len = strlen(src);
    size_t room;

    /* bounded search for the end of dest: never look past the buffer */
    while (dsize < count && dest[dsize] != '\0')
        dsize++;
    if (dsize == count)
        return count + len;
    /* room >= 1 here: space left including the terminator */
    room = count - dsize;
    if (len >= room) {
        memcpy(dest + dsize, src, room - 1);
        dest[dsize + room - 1] = '\0';
    }
    else {
        memcpy(dest + dsize, src, len);
        dest[dsize + len] = '\0';
    }
    return dsize + len;
}
|
||||
#endif /* !HAVE_STRLCAT */
|
||||
14
bk/HtmlParser/s_util.h
Normal file
14
bk/HtmlParser/s_util.h
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
/*
|
||||
* linux/lib/string.c
|
||||
*
|
||||
* Copyright (C) 1991, 1992 Linus Torvalds
|
||||
*/
|
||||
|
||||
|
||||
#if !defined(HAVE_STRLCPY)
|
||||
size_t strlcpy(char *dst, const char *src, size_t size);
|
||||
#endif /* !HAVE_STRLCPY */
|
||||
|
||||
#if !defined(HAVE_STRLCAT)
|
||||
size_t strlcat(char *dst, const char *src, size_t size);
|
||||
#endif /* !HAVE_STRLCAT */
|
||||
7
bk/Makefile
Normal file
7
bk/Makefile
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
D = ../../bk-python/bk
|
||||
|
||||
diff:
|
||||
diff -BurN . $(D)
|
||||
|
||||
update:
|
||||
cp -r $(D)/* .
|
||||
1
bk/__init__.py
Normal file
1
bk/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
79
bk/ansicolor.py
Normal file
79
bk/ansicolor.py
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""ANSI Color definitions and functions"""
|
||||
# Copyright (C) 2000-2004 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
import os
|
||||
|
||||
# Escape for ANSI colors
AnsiEsc = "\x1b[%sm"

# type numbers
AnsiType = {
    'bold': '1',
    'light': '2',
    'underline': '4',
    'blink': '5',
    'invert': '7',
    'concealed': '8',
}

# color numbers: lowercase names are the codes 30-37,
# capitalized names the codes 40-47
AnsiColor = {
    'default': '0',
    'black': '30', 'Black': '40',
    'red': '31', 'Red': '41',
    'green': '32', 'Green': '42',
    'yellow': '33', 'Yellow': '43',
    'blue': '34', 'Blue': '44',
    'purple': '35', 'Purple': '45',
    'cyan': '36', 'Cyan': '46',
    'white': '37', 'White': '47',
}


# pc speaker beep escape code
Beep = "\007"


def esc_ansicolor (color):
    """Convert a named color definition to an escaped ANSI color.

    color is either a plain color name ("red") or "type;color"
    ("bold;red"). An unknown color name maps to '0'; an unknown type
    contributes an empty field in front of the ';'.
    """
    prefix = ''
    if ";" in color:
        kind, color = color.split(";", 1)
        prefix = AnsiType.get(kind, '') + ";"
    return AnsiEsc % (prefix + AnsiColor.get(color, '0'))


AnsiReset = esc_ansicolor("default")


def colorize (text, color=None):
    """Return text wrapped in the escape sequence for color, followed by
    the reset sequence. The text is returned unchanged when no color is
    given or the TERM environment variable is unset or empty.
    """
    if color is None or not os.environ.get('TERM'):
        return text
    return '%s%s%s' % (esc_ansicolor(color), text, AnsiReset)
|
||||
218
bk/containers.py
Normal file
218
bk/containers.py
Normal file
|
|
@ -0,0 +1,218 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""special container classes"""
|
||||
# Copyright (C) 2004 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
|
||||
class SetList (list):
    """a list that eliminates all duplicates

    Uniqueness is enforced by append(), extend(), insert() and item
    assignment only; the constructor and the other inherited list
    operations are not intercepted.
    """

    def append (self, x):
        """append only if not already there"""
        if x not in self:
            super(SetList, self).append(x)

    def extend (self, x):
        """extend while eliminating duplicates by appending item for item"""
        for i in x:
            self.append(i)

    def insert (self, i, x):
        """insert only if not already there"""
        if x not in self:
            super(SetList, self).insert(i, x)

    def __setitem__ (self, key, value):
        """set new value, and eliminate old duplicates (if any)

        Slice assignment is passed to list unchanged.
        """
        if isinstance(key, slice):
            super(SetList, self).__setitem__(key, value)
            return
        size = len(self)
        # normalize a negative index so it can be compared with positions
        target = key
        if key < 0:
            target = key + size
        # positions of equal values other than the slot being assigned
        duplicates = [i for i in range(size)
                      if i != target and self[i] == value]
        super(SetList, self).__setitem__(key, value)
        # remove old duplicates from last to first so that the remaining
        # indices stay valid; delete the duplicate itself, not the slot
        # that was just assigned
        duplicates.reverse()
        for i in duplicates:
            del self[i]
|
||||
|
||||
|
||||
class ListDict (dict):
    """a dictionary whose iterators reflect the order in which elements
    were added

    The insertion order is recorded in the list self._keys; every
    mutating method below keeps that list in step with the dictionary.
    """

    def __init__ (self):
        """initialize sorted key list"""
        super(ListDict, self).__init__()
        # keys in insertion order
        self._keys = []

    def __setitem__ (self, key, value):
        """add key,value to dict, append key to sorted list"""
        if key not in self:
            self._keys.append(key)
        super(ListDict, self).__setitem__(key, value)

    def __delitem__ (self, key):
        """remove key from dict; raises KeyError if key is missing"""
        # delete from the dict first so a missing key raises KeyError
        # (list.remove alone would raise ValueError)
        super(ListDict, self).__delitem__(key)
        self._keys.remove(key)

    def __iter__ (self):
        """return iterator over sorted keys"""
        return iter(self.keys())

    def values (self):
        """return sorted list of values"""
        return [self[k] for k in self._keys]

    def items (self):
        """return sorted list of items"""
        return [(k, self[k]) for k in self._keys]

    def keys (self):
        """return sorted list of keys"""
        return self._keys[:]

    def itervalues (self):
        """return iterator over sorted values"""
        return iter(self.values())

    def iteritems (self):
        """return iterator over sorted items"""
        return iter(self.items())

    def iterkeys (self):
        """return iterator over sorted keys"""
        return iter(self.keys())

    def clear (self):
        """remove all dict entries"""
        self._keys = []
        super(ListDict, self).clear()

    def pop (self, key, *default):
        """remove key and return its value (or the given default)"""
        if key in self:
            self._keys.remove(key)
        return super(ListDict, self).pop(key, *default)

    def popitem (self):
        """remove and return the most recently added (key, value) pair"""
        if not self._keys:
            raise KeyError("popitem(): dictionary is empty")
        key = self._keys[-1]
        return key, self.pop(key)

    def setdefault (self, key, default=None):
        """return value of key, storing default first if key is missing"""
        if key not in self:
            self[key] = default
        return self[key]

    def update (self, *args, **kwargs):
        """add entries from a mapping, a sequence of pairs and/or keywords"""
        if len(args) > 1:
            raise TypeError("update expected at most 1 arguments, got %d" %
                            len(args))
        if args:
            other = args[0]
            if hasattr(other, "keys"):
                for key in other.keys():
                    self[key] = other[key]
            else:
                for key, value in other:
                    self[key] = value
        for key in kwargs:
            self[key] = kwargs[key]
|
||||
|
||||
|
||||
class LRU (object):
    """
    Implementation of a length-limited O(1) LRU queue.
    Built for and used by PyPE:
    http://pype.sourceforge.net
    Copyright 2003 Josiah Carlson. (Licensed under the GPL)

    Entries live in a doubly linked list running from the least recently
    used (self.first) to the most recently used (self.last) entry;
    self.d maps each key to its list node. Both reading and writing a
    key move it to the most recently used end. Iterating the object
    yields values, not keys.
    """

    class Node (object):
        """list node; me is the (key, value) pair"""

        def __init__ (self, prev, me):
            self.prev = prev
            self.me = me
            self.next = None

    def __init__ (self, count, pairs=()):
        """hold at most count entries (at least one), preloaded with the
        given (key, value) pairs"""
        self.count = max(count, 1)
        self.d = {}
        self.first = None
        self.last = None
        for key, value in pairs:
            self[key] = value

    def __contains__ (self, obj):
        return obj in self.d

    def has_key (self, obj):
        return obj in self.d

    def __getitem__ (self, obj):
        """return the value of obj and mark it most recently used"""
        key, value = self.d[obj].me
        # storing the pair again moves it to the end of the list
        self[key] = value
        return value

    def __setitem__ (self, obj, val):
        """store val as most recently used entry, evicting the least
        recently used entry if the size limit is exceeded"""
        if obj in self.d:
            del self[obj]
        nobj = self.Node(self.last, (obj, val))
        if self.first is None:
            self.first = nobj
        if self.last is not None:
            self.last.next = nobj
        self.last = nobj
        self.d[obj] = nobj
        if len(self.d) > self.count:
            # count is at least 1, so at least two nodes exist here and
            # the oldest one always has a successor
            oldest = self.first
            oldest.next.prev = None
            self.first = oldest.next
            oldest.next = None
            del self.d[oldest.me[0]]

    def __delitem__ (self, obj):
        """unlink the entry of obj; raises KeyError if it is missing"""
        nobj = self.d[obj]
        if nobj.prev is not None:
            nobj.prev.next = nobj.next
        else:
            self.first = nobj.next
        if nobj.next is not None:
            nobj.next.prev = nobj.prev
        else:
            self.last = nobj.prev
        del self.d[obj]

    def __iter__ (self):
        """iterate over the values, least recently used first"""
        cur = self.first
        while cur is not None:
            # fetch the successor before yielding, the consumer may
            # modify the queue
            cur2 = cur.next
            yield cur.me[1]
            cur = cur2

    def iteritems (self):
        """iterate over (key, value) pairs, least recently used first"""
        cur = self.first
        while cur is not None:
            cur2 = cur.next
            yield cur.me
            cur = cur2

    def iterkeys (self):
        """iterate over the keys, in the order of the internal dict"""
        return iter(self.d)

    def itervalues (self):
        """iterate over the values, least recently used first"""
        for i, j in self.iteritems():
            yield j

    def keys (self):
        """return list of keys, in the order of the internal dict"""
        return list(self.d.keys())

    def setdefault (self, key, failobj=None):
        """return the value of key, storing failobj first if missing"""
        if key not in self.d:
            self[key] = failobj
        return self[key]
|
||||
|
||||
|
||||
def _main ():
    """Small demonstration of the LRU class; prints to stdout.

    Written with single-argument print(...) calls, which produce the
    same output whether print is a statement or a function.
    """
    cache = LRU(4)
    for digit in ('1', '2', '3', '4', '5'):
        cache[digit] = digit
    # all entries on one line, separated by single spaces
    print(" ".join([str(item) for item in cache.iteritems()]))
    print(cache['2'])
    cache['6'] = '6'
    print(" ".join([str(item) for item in cache.iteritems()]))
    print(cache.has_key('1'))
    print(cache.has_key('2'))
|
||||
|
||||
|
||||
109
bk/i18n.py
Normal file
109
bk/i18n.py
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""application internationalization support"""
|
||||
# Copyright (C) 2000-2004 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
# i18n support
|
||||
import os
|
||||
import locale
|
||||
import gettext
|
||||
|
||||
# default gettext function
_ = lambda s: s

# more supported languages are added in init()
supported_languages = ['en']
default_language = None


def init (domain, directory):
    """initialize this gettext i18n module

    Installs the gettext function of the translation found for domain
    below directory (the identity function stays in place when no
    translation is found), registers every language subdirectory that
    contains LC_MESSAGES/<domain>.mo, and selects the default language:
    the current locale if it is supported, "en" otherwise.

    May be called repeatedly; languages are registered only once, and a
    missing directory simply registers nothing.
    """
    global _, default_language
    try:
        _ = gettext.translation(domain, directory).gettext
    except IOError:
        # keep default gettext function
        pass
    # get supported languages
    if os.path.isdir(directory):
        catalog = '%s.mo' % domain
        for lang in os.listdir(directory):
            path = os.path.join(directory, lang)
            if not os.path.isdir(path):
                continue
            if lang in supported_languages:
                # already registered by an earlier call
                continue
            if os.path.exists(os.path.join(path, 'LC_MESSAGES', catalog)):
                supported_languages.append(lang)
    loc = get_locale()
    if loc in supported_languages:
        default_language = loc
    else:
        default_language = "en"


def get_lang (lang):
    """return lang if it is supported, or the default language"""
    if lang in supported_languages:
        return lang
    return default_language


def get_headers_lang (headers):
    """return preferred supported language in given HTTP headers

    The Accept-Language entries are tried in the order given; quality
    values are stripped, not evaluated.
    """
    if 'Accept-Language' not in headers:
        return default_language
    # XXX sort with quality values
    for entry in headers['Accept-Language'].split(","):
        lang = entry.split(";")[0].strip()
        if lang in supported_languages:
            return lang
    return default_language


def get_locale ():
    """return current configured locale

    Only the language part is returned: modifier ("@..."), encoding
    ("....") and territory ("_...") are cut off the normalized locale
    name. 'C' is used when no default locale is configured.
    """
    loc = locale.getdefaultlocale()[0]
    if loc is None:
        loc = 'C'
    loc = locale.normalize(loc)
    # split up the locale into its base components
    for separator in ('@', '.', '_'):
        loc = loc.split(separator, 1)[0]
    return loc


lang_names = {
    'en': u'English',
    'de': u'Deutsch',
}
lang_transis = {
    'de': {'en': u'German'},
    'en': {'de': u'Englisch'},
}

def lang_name (lang):
    """return full name of given language; KeyError if unknown"""
    return lang_names[lang]


def lang_trans (lang, curlang):
    """return the name of language lang, translated into curlang;
    KeyError if no such translation is stored"""
    return lang_transis[lang][curlang]
|
||||
|
||||
114
bk/log.py
Normal file
114
bk/log.py
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""logging and debug functions"""
|
||||
# Copyright (C) 2003-2004 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
# public api
|
||||
__all__ = ["debug", "info", "warn", "error", "critical",
|
||||
"exception", "get_log_file", "set_format", "usedmemory"]
|
||||
|
||||
import os
|
||||
import logging
|
||||
|
||||
|
||||
def iswritable (fname):
    """return True if given file is writable

    Directories and symbolic links are never considered writable. An
    existing file is probed by opening it for appending; a missing file
    by creating and removing it again.
    """
    if os.path.isdir(fname) or os.path.islink(fname):
        return False
    try:
        if os.path.exists(fname):
            open(fname, 'a').close()
        else:
            open(fname, 'w').close()
            os.remove(fname)
        return True
    except (IOError, OSError):
        # cannot open, create or remove the file
        return False


def get_log_file (name, logname, trydirs=None):
    """get full path name to writeable logfile

    The directories in trydirs are tried first, followed by the
    platform defaults: %TEMP% on Windows, otherwise /var/log/<name>,
    /var/tmp/<name> and /tmp/<name>; the current directory is always
    tried last. Raises IOError if logname is writable in none of them.
    """
    dirs = []
    if os.name == 'nt':
        tempdir = os.environ.get("TEMP")
        # TEMP may be unset; os.path.join cannot handle None
        if tempdir:
            dirs.append(tempdir)
    else:
        dirs.append(os.path.join('/', 'var', 'log', name))
        dirs.append(os.path.join('/', 'var', 'tmp', name))
        dirs.append(os.path.join('/', 'tmp', name))
    dirs.append(os.getcwd())
    trydirs = list(trydirs or []) + dirs
    for d in trydirs:
        fullname = os.path.join(d, logname)
        if iswritable(fullname):
            return fullname
    raise IOError("Could not find writable directory for %s in %s" % (logname, str(trydirs)))
|
||||
|
||||
|
||||
def set_format (handler):
    """Give handler the formatter of the first handler installed on the
    root logger, and return handler."""
    root_formatter = logging.root.handlers[0].formatter
    handler.setFormatter(root_formatter)
    return handler
|
||||
|
||||
|
||||
def usedmemory ():
    """return the resident memory of this process as an integer

    The number is taken from the VmRSS line of /proc/<pid>/status, in
    the unit that file reports (kB); 0 is returned if there is no such
    line. Raises IOError where the status file cannot be opened.
    """
    fp = open('/proc/%d/status' % os.getpid())
    val = 0
    try:
        for line in fp:
            if line.startswith('VmRSS:'):
                # "VmRSS:     1234 kB" -> 1234
                val = int(line[6:].strip().split()[0])
    finally:
        fp.close()
    return val
|
||||
|
||||
|
||||
import gc
|
||||
gc.enable()
|
||||
# memory leak debugging
|
||||
#gc.set_debug(gc.DEBUG_LEAK)
|
||||
def debug (log, msg, *args):
    """Log msg (%-formatted with args) at DEBUG level on the logger
    named log."""
    logger = logging.getLogger(log)
    logger.debug(msg, *args)
    # memory tracing, disabled:
    #logging.getLogger(log).info("Mem: %d kB"%usedmemory())
|
||||
|
||||
|
||||
def info (log, msg, *args):
    """Log msg (%-formatted with args) at INFO level on the logger
    named log."""
    logger = logging.getLogger(log)
    logger.info(msg, *args)
|
||||
|
||||
|
||||
def warn (log, msg, *args):
    """Log msg (%-formatted with args) at WARNING level on the logger
    named log."""
    # Logger.warning() is the documented method; Logger.warn() is a
    # deprecated alias of it
    logging.getLogger(log).warning(msg, *args)
|
||||
|
||||
|
||||
def error (log, msg, *args):
    """Log msg (%-formatted with args) at ERROR level on the logger
    named log."""
    logger = logging.getLogger(log)
    logger.error(msg, *args)
|
||||
|
||||
|
||||
def critical (log, msg, *args):
    """Log msg (%-formatted with args) at CRITICAL level on the logger
    named log."""
    logger = logging.getLogger(log)
    logger.critical(msg, *args)
|
||||
|
||||
|
||||
def exception (log, msg, *args):
    """Log msg (%-formatted with args) through Logger.exception() of the
    logger named log, which records the exception currently being
    handled along with the message."""
    logger = logging.getLogger(log)
    logger.exception(msg, *args)
|
||||
55
bk/mem.py
Normal file
55
bk/mem.py
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
""" Copied from the Python Cookbook recipe at
|
||||
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/286222
|
||||
|
||||
To find the memory usage in a particular section of code these
|
||||
functions are typically used as follows:
|
||||
|
||||
m0 = memory()
|
||||
...
|
||||
m1 = memory(m0)
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
_proc_status = '/proc/%d/status' % os.getpid()
|
||||
|
||||
_scale = {'kB': 1024.0, 'mB': 1024.0*1024.0,
|
||||
'KB': 1024.0, 'MB': 1024.0*1024.0}
|
||||
|
||||
def _VmB (VmKey):
|
||||
'''Parse /proc/<pid>/status file for given key.'''
|
||||
if os.name != 'posix':
|
||||
# not supported
|
||||
return 0.0
|
||||
global _proc_status, _scale
|
||||
# get pseudo file /proc/<pid>/status
|
||||
try:
|
||||
t = open(_proc_status)
|
||||
v = t.read()
|
||||
t.close()
|
||||
except IOError:
|
||||
# unsupported platform (non-Linux?)
|
||||
return 0.0
|
||||
# get VmKey line e.g. 'VmRSS: 9999 kB\n ...'
|
||||
i = v.index(VmKey)
|
||||
v = v[i:].split(None, 3) # whitespace
|
||||
if len(v) < 3:
|
||||
return 0.0 # invalid format?
|
||||
# convert Vm value to bytes
|
||||
return float(v[1]) * _scale[v[2]]
|
||||
|
||||
|
||||
def memory (since=0.0):
|
||||
'''Return memory usage in bytes.'''
|
||||
return _VmB('VmSize:') - since
|
||||
|
||||
|
||||
def resident (since=0.0):
|
||||
'''Return resident memory usage in bytes.'''
|
||||
return _VmB('VmRSS:') - since
|
||||
|
||||
|
||||
def stacksize (since=0.0):
|
||||
'''Return stack size in bytes.'''
|
||||
return _VmB('VmStk:') - since
|
||||
48
bk/strtime.py
Normal file
48
bk/strtime.py
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""time to string conversion utility functions"""
|
||||
# Copyright (C) 2004 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
import time
|
||||
import bk.i18n
|
||||
|
||||
|
||||
def strtime (t):
    """Format t (seconds since the epoch) as local date and time,
    "YYYY-MM-DD HH:MM:SS", followed by the string strtimezone() returns.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t))
    return stamp + strtimezone()
|
||||
|
||||
|
||||
def strduration (duration):
    """Return duration (a number of seconds) as a human readable string.

    Values above 60 seconds are shown in minutes, values above 60
    minutes in hours. The number is printed with three decimals and
    followed by the translated unit name; the result starts with a
    space.
    """
    name = bk.i18n._("seconds")
    if duration > 60:
        # divide by a float: with an integer duration, "/ 60" floors
        # under Python 2 and would turn 90 seconds into 1.000 minutes
        duration = duration / 60.0
        name = bk.i18n._("minutes")
        if duration > 60:
            duration = duration / 60.0
            name = bk.i18n._("hours")
    return " %.3f %s"%(duration, name)
|
||||
|
||||
|
||||
def strtimezone ():
    """Return the current local UTC offset as "+hhmm" or "-hhmm".

    This emulates strftime's %z, which is not supported on all
    platforms. The offset is the one in effect right now: the daylight
    saving offset is used only while DST is actually active.
    """
    # time.daylight only tells that the local zone *defines* a DST
    # variant; tm_isdst tells whether it is in effect at the moment
    if time.daylight and time.localtime().tm_isdst > 0:
        zone = time.altzone
    else:
        zone = time.timezone
    # timezone/altzone count seconds WEST of UTC, hence the negation
    offset = -zone
    if offset < 0:
        sign = "-"
    else:
        sign = "+"
    # split into hours and minutes so zones such as +0530 keep their
    # minute part
    hours, rest = divmod(abs(offset), 3600)
    return "%s%02d%02d" % (sign, hours, rest // 60)
|
||||
|
||||
1
bk/tests/__init__.py
Normal file
1
bk/tests/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
87
bk/tests/test_containers.py
Normal file
87
bk/tests/test_containers.py
Normal file
|
|
@ -0,0 +1,87 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""test container routines"""
|
||||
|
||||
import unittest
|
||||
import random
|
||||
import bk.containers
|
||||
|
||||
|
||||
class TestListDict (unittest.TestCase):
|
||||
|
||||
def setUp (self):
|
||||
self.d = bk.containers.ListDict()
|
||||
|
||||
def test_insert (self):
|
||||
self.assert_(not self.d)
|
||||
self.d[2] = 1
|
||||
self.d[1] = 2
|
||||
self.assert_(2 in self.d)
|
||||
self.assert_(1 in self.d)
|
||||
|
||||
def test_delete (self):
|
||||
self.assert_(not self.d)
|
||||
self.d[2] = 1
|
||||
self.d[1] = 2
|
||||
del self.d[1]
|
||||
self.assert_(2 in self.d)
|
||||
self.assert_(1 not in self.d)
|
||||
|
||||
def test_update (self):
|
||||
self.assert_(not self.d)
|
||||
self.d[2] = 1
|
||||
self.d[1] = 2
|
||||
self.d[1] = 1
|
||||
self.assertEqual(self.d[1], 1)
|
||||
|
||||
def test_sorting (self):
|
||||
self.assert_(not self.d)
|
||||
toinsert = random.sample(xrange(10000000), 60)
|
||||
for x in toinsert:
|
||||
self.d[x] = x
|
||||
for i, k in enumerate(self.d.keys()):
|
||||
self.assertEqual(self.d[k], toinsert[i])
|
||||
|
||||
|
||||
class TestSetList (unittest.TestCase):
|
||||
|
||||
def setUp (self):
|
||||
self.l = bk.containers.SetList()
|
||||
|
||||
def test_append (self):
|
||||
self.assert_(not self.l)
|
||||
self.l.append(1)
|
||||
self.l.append(1)
|
||||
self.assertEqual(len(self.l), 1)
|
||||
|
||||
def test_append2 (self):
|
||||
self.assert_(not self.l)
|
||||
self.l.append(1)
|
||||
self.l.append(2)
|
||||
self.l.append(1)
|
||||
self.assertEqual(len(self.l), 2)
|
||||
|
||||
def test_extend (self):
|
||||
self.assert_(not self.l)
|
||||
self.l.extend([1, 2, 1])
|
||||
self.assertEqual(len(self.l), 2)
|
||||
self.assertEqual(self.l[0], 1)
|
||||
self.assertEqual(self.l[1], 2)
|
||||
|
||||
def test_setitem (self):
|
||||
self.assert_(not self.l)
|
||||
self.l.extend([1,2,3])
|
||||
self.l[1] = 3
|
||||
self.assertEqual(len(self.l), 2)
|
||||
self.assertEqual(self.l[0], 1)
|
||||
self.assertEqual(self.l[1], 3)
|
||||
|
||||
|
||||
def test_suite ():
    """Return a suite with the ListDict tests followed by the SetList
    tests."""
    suite = unittest.TestSuite()
    for case in (TestListDict, TestSetList):
        suite.addTest(unittest.makeSuite(case))
    return suite
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
170
bk/tests/test_parser.py
Normal file
170
bk/tests/test_parser.py
Normal file
|
|
@ -0,0 +1,170 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
|
||||
import bk.HtmlParser
|
||||
import bk.HtmlParser.htmlsax
|
||||
import bk.HtmlParser.htmllib
|
||||
import cStringIO as StringIO
|
||||
import unittest
|
||||
|
||||
|
||||
parsetests = [
|
||||
# start tags
|
||||
("""<a b="c" >""", """<a b="c">"""),
|
||||
("""<a b='c' >""", """<a b="c">"""),
|
||||
("""<a b=c" >""", """<a b="c">"""),
|
||||
("""<a b=c' >""", """<a b="c'">"""),
|
||||
("""<a b="c >""", """<a b="c >"""),
|
||||
("""<a b="" >""", """<a b="">"""),
|
||||
("""<a b='' >""", """<a b="">"""),
|
||||
("""<a b=>""", """<a b="">"""),
|
||||
("""<a b= >""", """<a b="">"""),
|
||||
("""<a =c>""", """<a c>"""),
|
||||
("""<a =c >""", """<a c>"""),
|
||||
("""<a =>""", """<a>"""),
|
||||
("""<a = >""", """<a>"""),
|
||||
("""<a b= "c" >""", """<a b="c">"""),
|
||||
("""<a b ="c" >""", """<a b="c">"""),
|
||||
("""<a b = "c" >""", """<a b="c">"""),
|
||||
("""<a >""", """<a>"""),
|
||||
("""< a>""", """<a>"""),
|
||||
("""< a >""", """<a>"""),
|
||||
("""<>""", """<>"""),
|
||||
("""< >""", """< >"""),
|
||||
# reduce test
|
||||
("""<a b="c"><""", """<a b="c"><"""),
|
||||
("""d>""", """d>"""),
|
||||
# numbers in tag
|
||||
("""<h1>bla</h1>""", """<h1>bla</h1>"""),
|
||||
# more start tags
|
||||
("""<a b=c"><a b="c">""", """<a b="c"><a b="c">"""),
|
||||
("""<a b="c><a b="c">""", """<a b="c><a b=" c>"""),
|
||||
("""<a b=/c/></a><br>""", """<a b="/c/"></a><br>"""),
|
||||
("""<br/>""", """<br>"""),
|
||||
("""<a b="50%"><br>""", """<a b="50%"><br>"""),
|
||||
# comments
|
||||
("""<!---->""", """<!---->"""),
|
||||
("""<!-- a - b -->< br>""", """<!-- a - b --><br>"""),
|
||||
("""<!----->""", """<!----->"""),
|
||||
("""<!------>""", """<!------>"""),
|
||||
("""<!------->""", """<!------->"""),
|
||||
("""<!---- >""", """<!----->"""),
|
||||
("""<!-- -->""", """<!-- -->"""),
|
||||
("""<!-- -- >""", """<!-- --->"""),
|
||||
("""<!---- />-->""", """<!---- />-->"""),
|
||||
# end tags
|
||||
("""</a>""", """</a>"""),
|
||||
("""</ a>""", """</a>"""),
|
||||
("""</ a >""", """</a>"""),
|
||||
("""</a >""", """</a>"""),
|
||||
("""< / a>""", """</a>"""),
|
||||
("""< /a>""", """</a>"""),
|
||||
# missing > in end tag
|
||||
("""</td <td a="b" >""", """</td><td a="b">"""),
|
||||
# start and end tag
|
||||
("""<a/>""", """<a></a>"""),
|
||||
# declaration tags
|
||||
("""<!DOCtype adrbook SYSTEM "adrbook.dtd">""", """<!DOCTYPE adrbook SYSTEM "adrbook.dtd">"""),
|
||||
# misc
|
||||
("""<?xmL version="1.0" encoding="latin1"?>""", """<?xmL version="1.0" encoding="latin1"?>"""),
|
||||
# javascript
|
||||
("""<script >\n</script>""", """<script>\n</script>"""),
|
||||
("""<sCrIpt lang="a">bla </a> fasel</scripT>""", """<script lang="a">bla </a> fasel</script>"""),
|
||||
# line continuation (Dr. Fun webpage)
|
||||
("<img bo\\\nrder=0 >", """<img bo rder="0">"""),
|
||||
# href with $
|
||||
("""<a href="123$456">""", """<a href="123$456">"""),
|
||||
# quoting
|
||||
("""<a href=/ >""", """<a href="/">"""),
|
||||
("""<a href= />""", """<a href="/">"""),
|
||||
("""<a href= >""", """<a href="">"""),
|
||||
("""<a href="'" >""", """<a href="'">"""),
|
||||
("""<a href='"' >""", """<a href=""">"""),
|
||||
("""<a href="bla" %]" >""", """<a href="bla">"""),
|
||||
("""<a href=bla" >""", """<a href="bla">"""),
|
||||
("""<a onmouseover=MM_swapImage('nav1','','/images/dwnavpoint_over.gif',1);movein(this); b="c">""",
|
||||
"""<a onmouseover="MM_swapImage('nav1','','/images/dwnavpoint_over.gif',1);movein(this);" b="c">"""),
|
||||
("""<a onClick=location.href('/index.htm') b="c">""",
|
||||
"""<a onclick="location.href('/index.htm')" b="c">"""),
|
||||
# entities
|
||||
("""<a href="mailto:" >""", """<a href="mailto:">"""),
|
||||
# non-ascii characters
|
||||
("""<Üzgür> fahr </langsamer> ¹²³¼½¬{""",
|
||||
"""<Üzgür> fahr </langsamer> ¹²³¼½¬{"""),
|
||||
]
|
||||
|
||||
flushtests = [
|
||||
("<", "<"),
|
||||
("<a", "<a"),
|
||||
("<!a", "<!a"),
|
||||
("<?a", "<?a"),
|
||||
]
|
||||
|
||||
|
||||
class TestParser (unittest.TestCase):
    """Feed the patterns of the module level parsetests and flushtests
    lists (tuples of test pattern and expected output) to the HTML
    parser and compare what HtmlPrettyPrinter wrote with the expected
    output.
    """

    def setUp (self):
        """create two independent parsers; the second one is only used
        by test_interwoven"""
        self.htmlparser = bk.HtmlParser.htmlsax.parser()
        self.htmlparser2 = bk.HtmlParser.htmlsax.parser()

    def _attach_printer (self, parser):
        """install a new pretty printer as handler of parser and return
        the buffer it writes to"""
        out = StringIO.StringIO()
        parser.handler = bk.HtmlParser.htmllib.HtmlPrettyPrinter(out)
        return out

    def _check (self, parser, out, _in, _out):
        """compare the printed output of an already flushed parser with
        _out, then reset the parser for the next pattern"""
        res = out.getvalue()
        self.assertEqual(res, _out,
            "parsing %r: got %r, expected %r" % (_in, res, _out))
        parser.reset()

    def test_parse (self):
        """feed each pattern as one chunk"""
        for _in, _out in parsetests:
            out = self._attach_printer(self.htmlparser)
            self.htmlparser.feed(_in)
            self.htmlparser.flush()
            self._check(self.htmlparser, out, _in, _out)

    def test_feed (self):
        """feed each pattern one character at a time"""
        for _in, _out in parsetests:
            out = self._attach_printer(self.htmlparser)
            for c in _in:
                self.htmlparser.feed(c)
            self.htmlparser.flush()
            self._check(self.htmlparser, out, _in, _out)

    def test_interwoven (self):
        """feed each pattern character by character to two parsers in
        turn; both must produce the expected output"""
        for _in, _out in parsetests:
            out = self._attach_printer(self.htmlparser)
            out2 = self._attach_printer(self.htmlparser2)
            for c in _in:
                self.htmlparser.feed(c)
                self.htmlparser2.feed(c)
            self.htmlparser.flush()
            self.htmlparser2.flush()
            self._check(self.htmlparser, out, _in, _out)
            # the second parser has to be reset as well, otherwise it
            # starts the next pattern with the state of this one
            self._check(self.htmlparser2, out2, _in, _out)

    def test_flush (self):
        """incomplete input must be printed unchanged by flush()"""
        for _in, _out in flushtests:
            out = self._attach_printer(self.htmlparser)
            self.htmlparser.feed(_in)
            self.htmlparser.flush()
            self._check(self.htmlparser, out, _in, _out)

    def test_entities (self):
        """numeric character references of a-z resolve to the letter"""
        for c in "abcdefghijklmnopqrstuvwxyz":
            self.assertEqual(bk.HtmlParser.resolve_entities("&#%d;"%ord(c)), c)
|
||||
|
||||
|
||||
def test_suite ():
    """Return a suite holding all TestParser tests."""
    parser_tests = unittest.makeSuite(TestParser)
    return unittest.TestSuite([parser_tests])
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
Loading…
Reference in a new issue