new style parser object class

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1200 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2004-01-28 22:33:34 +00:00
parent 0361e3a156
commit 26072afd92
4 changed files with 577 additions and 381 deletions

View file

@ -16,53 +16,28 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
__version__ = "$Revision$"[11:-2]
__date__ = "$Date$"[7:-2]
import sys
try:
import htmlsax
except ImportError:
except ImportError, msg:
exctype, value = sys.exc_info()[:2]
print >>sys.stderr, "Could not import the parser module `htmlsax':", value
print >>sys.stderr, "Please check your installation of LinkChecker."
sys.exit(1)
class HtmlParser (object):
"""Use an internal C SAX parser. We do not define any callbacks
here for compatibility. Currently recognized callbacks are:
comment(data): <!--data-->
startElement(tag, attrs): <tag {attr1:value1,attr2:value2,..}>
endElement(tag): </tag>
doctype(data): <!DOCTYPE data?>
pi(name, data=None): <?name data?>
cdata(data): <![CDATA[data]]>
characters(data): data
additionally, there are error and warning callbacks:
error(msg)
warning(msg)
fatalError(msg)
"""
def __init__ (self):
"""initialize the internal parser"""
self.parser = htmlsax.parser(self)
def __getattr__ (self, name):
"""delegate unknown attrs to self.parser"""
return getattr(self.parser, name)
class HtmlPrinter (HtmlParser):
"""handles all functions by printing the function name and
attributes"""
class HtmlPrinter (object):
"""handles all functions by printing the function name and attributes"""
def _print (self, *attrs):
print self.mem, attrs, self.last_lineno(), self.last_column()
print self.mem, attrs
def _errorfun (self, msg, name):
"""print msg to stderr with name prefix"""
pos = "%d:%d:" % (self.lineno(), self.column())
print >> sys.stderr, name, pos, msg
print >> sys.stderr, name, msg
def error (self, msg):
@ -81,14 +56,18 @@ class HtmlPrinter (HtmlParser):
def __getattr__ (self, name):
if hasattr(self.parser, name):
return getattr(self.parser, name)
"""remember the func name"""
self.mem = name
return self._print
def quote_attrval (val):
    """Return val with each double quote replaced by the &quot; entity.

    The result can be wrapped in double quotes and used as an HTML
    attribute value. No other character is altered.
    """
    quote_char, quote_entity = '"', '&quot;'
    return val.replace(quote_char, quote_entity)
def _test():
p = HtmlPrinter()
p = htmlsax.parser(HtmlPrinter())
p.feed("<hTml>")
p.feed("<a href>")
p.feed("<a href=''>")
@ -111,10 +90,11 @@ def _test():
p.feed("<!DOCTYPE \"vla foo>")
p.flush()
def _broken ():
p = HtmlPrinter()
p.feed("<img bo\\\nrder=0>")
p = htmlsax.parser(HtmlPrinter())
# turn on debugging
p.debug(1)
p.feed("""<base href="http://www.msnbc.com/news/">""")
p.flush()

View file

@ -103,7 +103,9 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/* Python module definition of a SAX html parser */
#include "htmlsax.h"
#include "structmember.h"
#include <string.h>
#include <stdio.h>
@ -151,6 +153,12 @@ static int yyerror (char* msg) {
if (b==NULL) return NULL; \
(b)[0] = '\0'
/* clear buffer b, returning NULL and decref self on error */
#define CLEAR_BUF_DECREF(self, b) \
b = PyMem_Resize(b, char, 1); \
if (b==NULL) { Py_DECREF(self); return NULL; } \
(b)[0] = '\0'
/* call error handler if error object is not NULL */
#define CHECK_ERROR(ud, label) \
if (ud->error && PyObject_HasAttrString(ud->handler, "error")==1) { \
@ -180,6 +188,7 @@ static int yyerror (char* msg) {
/* parser type definition */
typedef struct {
PyObject_HEAD
PyObject* handler;
UserData* userData;
void* scanner;
} parser_object;
@ -219,7 +228,7 @@ typedef int YYSTYPE;
/* Line 214 of yacc.c. */
#line 223 "htmlparse.c"
#line 232 "htmlparse.c"
#if ! defined (yyoverflow) || YYERROR_VERBOSE
@ -389,8 +398,8 @@ static const yysigned_char yyrhs[] =
/* YYRLINE[YYN] -- source line where rule number YYN was defined. */
static const unsigned short yyrline[] =
{
0, 134, 134, 135, 138, 139, 146, 180, 226, 256,
276, 296, 316, 336, 357, 378
0, 143, 143, 144, 147, 148, 155, 189, 235, 265,
285, 305, 325, 345, 366, 387
};
#endif
@ -1095,22 +1104,22 @@ yyreduce:
switch (yyn)
{
case 2:
#line 134 "htmlparse.y"
#line 143 "htmlparse.y"
{;}
break;
case 3:
#line 135 "htmlparse.y"
#line 144 "htmlparse.y"
{;}
break;
case 4:
#line 138 "htmlparse.y"
#line 147 "htmlparse.y"
{ YYACCEPT; /* wait for more lexer input */ ;}
break;
case 5:
#line 140 "htmlparse.y"
#line 149 "htmlparse.y"
{
/* an error occurred in the scanner, the python exception must be set */
UserData* ud = yyget_extra(scanner);
@ -1120,7 +1129,7 @@ yyreduce:
break;
case 6:
#line 147 "htmlparse.y"
#line 156 "htmlparse.y"
{
/* $1 is a tuple (<tag>, <attrs>); <attrs> is a dictionary */
UserData* ud = yyget_extra(scanner);
@ -1157,7 +1166,7 @@ finish_start:
break;
case 7:
#line 181 "htmlparse.y"
#line 190 "htmlparse.y"
{
/* $1 is a tuple (<tag>, <attrs>); <attrs> is a dictionary */
UserData* ud = yyget_extra(scanner);
@ -1206,7 +1215,7 @@ finish_start_end:
break;
case 8:
#line 227 "htmlparse.y"
#line 236 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1239,7 +1248,7 @@ finish_end:
break;
case 9:
#line 257 "htmlparse.y"
#line 266 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1262,7 +1271,7 @@ finish_comment:
break;
case 10:
#line 277 "htmlparse.y"
#line 286 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1285,7 +1294,7 @@ finish_pi:
break;
case 11:
#line 297 "htmlparse.y"
#line 306 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1308,7 +1317,7 @@ finish_cdata:
break;
case 12:
#line 317 "htmlparse.y"
#line 326 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1331,7 +1340,7 @@ finish_doctype:
break;
case 13:
#line 337 "htmlparse.y"
#line 346 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1355,7 +1364,7 @@ finish_script:
break;
case 14:
#line 358 "htmlparse.y"
#line 367 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1379,7 +1388,7 @@ finish_style:
break;
case 15:
#line 379 "htmlparse.y"
#line 388 "htmlparse.y"
{
/* Remember this is also called as a lexer error fallback */
UserData* ud = yyget_extra(scanner);
@ -1406,7 +1415,7 @@ finish_characters:
}
/* Line 999 of yacc.c. */
#line 1410 "htmlparse.c"
#line 1419 "htmlparse.c"
yyvsp -= yylen;
yyssp -= yylen;
@ -1600,7 +1609,7 @@ yyreturn:
}
#line 401 "htmlparse.y"
#line 410 "htmlparse.y"
/* disable python memory interface */
@ -1608,87 +1617,152 @@ yyreturn:
#undef realloc
#undef free
/* create parser */
static PyObject* htmlsax_parser_new(PyObject* self, PyObject* args) {
PyObject* handler;
parser_object* p;
if (!PyArg_ParseTuple(args, "O", &handler)) {
PyErr_SetString(PyExc_TypeError, "SAX2 handler object arg required");
return NULL;
/* create parser object */
static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds) {
parser_object* self;
if ((self = (parser_object*) type->tp_alloc(type, 0)) == NULL)
{
return NULL;
}
p = PyObject_New(parser_object, &parser_type);
if (!p) {
PyErr_SetString(PyExc_TypeError, "Allocating parser object failed");
return NULL;
self->handler = NULL;
/* reset userData */
self->userData = PyMem_New(UserData, sizeof(UserData));
if (self->userData == NULL)
{
Py_DECREF(self);
return NULL;
}
self->userData->handler = NULL;
self->userData->buf = NULL;
CLEAR_BUF_DECREF(self, self->userData->buf);
self->userData->nextpos = 0;
self->userData->bufpos = 0;
self->userData->pos = 0;
self->userData->column = 1;
self->userData->last_column = 1;
self->userData->lineno = 1;
self->userData->last_lineno = 1;
self->userData->tmp_buf = NULL;
CLEAR_BUF_DECREF(self, self->userData->tmp_buf);
self->userData->tmp_tag = self->userData->tmp_attrname =
self->userData->tmp_attrval = self->userData->tmp_attrs =
self->userData->lexbuf = NULL;
self->userData->exc_type = NULL;
self->userData->exc_val = NULL;
self->userData->exc_tb = NULL;
self->userData->error = NULL;
self->scanner = NULL;
if (htmllexInit(&(self->scanner), self->userData)!=0)
{
Py_DECREF(self);
return NULL;
}
return (PyObject*) self;
}
/* initialize parser object */
static int parser_init (parser_object* self, PyObject* args, PyObject* kwds) {
PyObject* handler;
if (!PyArg_ParseTuple(args, "O", &handler)) {
return -1;
}
Py_INCREF(handler);
/* reset userData */
p->userData = PyMem_New(UserData, sizeof(UserData));
p->userData->handler = handler;
p->userData->buf = NULL;
CLEAR_BUF(p->userData->buf);
p->userData->nextpos =
p->userData->bufpos =
p->userData->pos =
p->userData->pos = 0;
p->userData->column =
p->userData->last_column =
p->userData->lineno =
p->userData->last_lineno = 1;
p->userData->tmp_buf = NULL;
CLEAR_BUF(p->userData->tmp_buf);
p->userData->tmp_tag = p->userData->tmp_attrname =
p->userData->tmp_attrval = p->userData->tmp_attrs =
p->userData->lexbuf = NULL;
p->userData->exc_type = NULL;
p->userData->exc_val = NULL;
p->userData->exc_tb = NULL;
p->userData->error = NULL;
p->scanner = NULL;
htmllexInit(&(p->scanner), p->userData);
return (PyObject*) p;
self->handler = handler;
self->userData->handler = self->handler;
return 0;
}
static void parser_dealloc (PyObject* self) {
parser_object* p = (parser_object*)self;
htmllexDestroy(p->scanner);
Py_DECREF(p->userData->handler);
PyMem_Del(p->userData->buf);
PyMem_Del(p->userData->tmp_buf);
PyMem_Del(p->userData);
PyMem_DEL(p);
/* GC traversal (tp_traverse): report every owned object that can take part
   in a reference cycle. The handler is the only such object.
   Returns -1 if the visit callback reports a negative value, else 0. */
static int parser_traverse (parser_object* self, visitproc visit, void* arg) {
    int status = 0;
    if (self->handler != NULL) {
        status = visit(self->handler, arg);
    }
    return (status < 0) ? -1 : 0;
}
/* flush parser buffers, isueing any remaining data as character data */
static PyObject* parser_flush (PyObject* self, PyObject* args) {
/* clear all used subobjects participating in reference cycles
   (tp_clear slot; also called from parser_dealloc).
   Releases the reference held in self->handler and resets the second,
   non-owning copy of that pointer stored in userData. Always returns 0. */
static int parser_clear (parser_object* self) {
    /* Detach the handler before releasing it: dropping the last reference
       can run arbitrary code, which must not see a dangling pointer. */
    PyObject* handler = self->handler;
    self->handler = NULL;
    /* userData is NULL when parser_new() failed to allocate it and then
       released the half-built object, so guard the dereference. */
    if (self->userData != NULL) {
        self->userData->handler = NULL;
    }
    Py_XDECREF(handler);
    return 0;
}
/* free all allocated resources of parser object (tp_dealloc slot).
   Safe to call on a partially constructed object, i.e. when parser_new()
   bailed out before userData or the scanner were set up. */
static void parser_dealloc (parser_object* self) {
    /* The type sets Py_TPFLAGS_HAVE_GC: stop the collector from traversing
       the object while its fields are being torn down. */
    PyObject_GC_UnTrack((PyObject*)self);
    if (self->scanner != NULL) {
        htmllexDestroy(self->scanner);
        self->scanner = NULL;
    }
    if (self->userData != NULL) {
        parser_clear(self);
        /* NOTE(review): parser_new() can release the object after the buf
           allocation failed but before tmp_buf was assigned; tmp_buf is
           then whatever PyMem_New left there -- confirm and initialize it
           earlier in parser_new(). */
        PyMem_Del(self->userData->buf);
        PyMem_Del(self->userData->tmp_buf);
        PyMem_Del(self->userData);
        self->userData = NULL;
    }
    else {
        /* no userData to reset, only the handler reference can be held */
        PyObject* handler = self->handler;
        self->handler = NULL;
        Py_XDECREF(handler);
    }
    self->ob_type->tp_free((PyObject*)self);
}
/* feed a chunk of data to the parser
   args: one string (read-only character buffer) holding the next chunk.
   Returns None on success. Returns NULL on failure: TypeError for a bad
   argument, MemoryError if the scanner could not be started or stopped,
   or the exception stored in userData while parsing. */
static PyObject* parser_feed (parser_object* self, PyObject* args) {
    /* set up the parse string */
    int slen = 0;
    char* s = NULL;
    if (!PyArg_ParseTuple(args, "t#", &s, &slen)) {
        PyErr_SetString(PyExc_TypeError, "string arg required");
        return NULL;
    }
    /* parse */
    if (htmllexStart(self->scanner, self->userData, s, slen)!=0) {
        PyErr_SetString(PyExc_MemoryError, "could not start scanner");
        return NULL;
    }
    if (yyparse(self->scanner)!=0) {
        int restored = 0;
        if (self->userData->exc_type!=NULL) {
            /* note: we give away these objects, so don't decref */
            PyErr_Restore(self->userData->exc_type,
                          self->userData->exc_val,
                          self->userData->exc_tb);
            restored = 1;
        }
        htmllexStop(self->scanner, self->userData);
        if (restored) {
            /* The references now belong to the interpreter's error state.
               Forget them so a later failure cannot hand the same
               (possibly freed) objects to PyErr_Restore again. */
            self->userData->exc_type = NULL;
            self->userData->exc_val = NULL;
            self->userData->exc_tb = NULL;
        }
        return NULL;
    }
    if (htmllexStop(self->scanner, self->userData)!=0) {
        PyErr_SetString(PyExc_MemoryError, "could not stop scanner");
        return NULL;
    }
    Py_RETURN_NONE;
}
/* flush all parser buffers */
static PyObject* parser_flush (parser_object* self, PyObject* args) {
int res = 0;
parser_object* p = (parser_object*)self;
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
/* reset parser variables */
CLEAR_BUF(p->userData->tmp_buf);
Py_XDECREF(p->userData->tmp_tag);
Py_XDECREF(p->userData->tmp_attrs);
Py_XDECREF(p->userData->tmp_attrval);
Py_XDECREF(p->userData->tmp_attrname);
p->userData->tmp_tag = p->userData->tmp_attrs =
p->userData->tmp_attrval = p->userData->tmp_attrname = NULL;
p->userData->bufpos = 0;
if (strlen(p->userData->buf)) {
CLEAR_BUF(self->userData->tmp_buf);
Py_XDECREF(self->userData->tmp_tag);
Py_XDECREF(self->userData->tmp_attrs);
Py_XDECREF(self->userData->tmp_attrval);
Py_XDECREF(self->userData->tmp_attrname);
self->userData->tmp_tag = self->userData->tmp_attrs =
self->userData->tmp_attrval = self->userData->tmp_attrname = NULL;
self->userData->bufpos = 0;
if (strlen(self->userData->buf)) {
// XXX set line, col
int error = 0;
PyObject* s = PyString_FromString(p->userData->buf);
PyObject* s = PyString_FromString(self->userData->buf);
PyObject* callback = NULL;
PyObject* result = NULL;
/* reset buffer */
CLEAR_BUF(p->userData->buf);
CLEAR_BUF(self->userData->buf);
if (s==NULL) { error=1; goto finish_flush; }
if (PyObject_HasAttrString(p->userData->handler, "characters")==1) {
callback = PyObject_GetAttrString(p->userData->handler, "characters");
if (PyObject_HasAttrString(self->handler, "characters")==1) {
callback = PyObject_GetAttrString(self->handler, "characters");
if (callback==NULL) { error=1; goto finish_flush; }
result = PyObject_CallFunction(callback, "O", s);
if (result==NULL) { error=1; goto finish_flush; }
@ -1696,195 +1770,224 @@ static PyObject* parser_flush (PyObject* self, PyObject* args) {
finish_flush:
Py_XDECREF(callback);
Py_XDECREF(result);
Py_XDECREF(s);
Py_XDECREF(s);
if (error==1) {
return NULL;
return NULL;
}
}
if (htmllexDestroy(self->scanner)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data");
return NULL;
}
self->scanner = NULL;
if (htmllexInit(&(self->scanner), self->userData)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data");
return NULL;
}
return Py_BuildValue("i", res);
}
/* return the current parser line number */
static PyObject* parser_lineno (PyObject* self, PyObject* args) {
parser_object* p = (parser_object*)self;
static PyObject* parser_lineno (parser_object* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
return Py_BuildValue("i", p->userData->lineno);
return Py_BuildValue("i", self->userData->lineno);
}
/* return the last parser line number */
static PyObject* parser_last_lineno (PyObject* self, PyObject* args) {
parser_object* p = (parser_object*)self;
static PyObject* parser_last_lineno (parser_object* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
return Py_BuildValue("i", p->userData->last_lineno);
return Py_BuildValue("i", self->userData->last_lineno);
}
/* return the current parser column number */
static PyObject* parser_column (PyObject* self, PyObject* args) {
parser_object* p = (parser_object*)self;
static PyObject* parser_column (parser_object* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
return Py_BuildValue("i", p->userData->column);
return Py_BuildValue("i", self->userData->column);
}
/* return the last parser column number */
static PyObject* parser_last_column (PyObject* self, PyObject* args) {
parser_object* p = (parser_object*)self;
static PyObject* parser_last_column (parser_object* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
return Py_BuildValue("i", p->userData->last_column);
return Py_BuildValue("i", self->userData->last_column);
}
static PyObject* parser_pos (PyObject* self, PyObject* args) {
parser_object* p = (parser_object*)self;
/* return the parser position in data stream */
static PyObject* parser_pos (parser_object* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
return Py_BuildValue("i", p->userData->pos);
}
/* feed a chunk of data to the parser */
static PyObject* parser_feed(PyObject* self, PyObject* args) {
/* set up the parse string */
int slen = 0;
char* s = NULL;
parser_object* p = (parser_object*)self;
if (!PyArg_ParseTuple(args, "t#", &s, &slen)) {
PyErr_SetString(PyExc_TypeError, "string arg required");
return NULL;
}
/* parse */
if (htmllexStart(p->scanner, p->userData, s, slen)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not start scanner");
return NULL;
}
if (yyparse(p->scanner)!=0) {
if (p->userData->exc_type!=NULL) {
/* note: we give away these objects, so don't decref */
PyErr_Restore(p->userData->exc_type,
p->userData->exc_val,
p->userData->exc_tb);
}
htmllexStop(p->scanner, p->userData);
return NULL;
}
if (htmllexStop(p->scanner, p->userData)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not stop scanner");
return NULL;
}
Py_INCREF(Py_None);
return Py_None;
return Py_BuildValue("i", self->userData->pos);
}
/* reset the parser. This will erase all buffered data! */
static PyObject* parser_reset(PyObject* self, PyObject* args) {
parser_object* p = (parser_object*)self;
static PyObject* parser_reset (parser_object* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
if (htmllexDestroy(p->scanner)!=0) {
if (htmllexDestroy(self->scanner)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data");
return NULL;
}
/* reset buffer */
CLEAR_BUF(p->userData->buf);
CLEAR_BUF(p->userData->tmp_buf);
p->userData->bufpos =
p->userData->pos =
p->userData->nextpos = 0;
p->userData->column =
p->userData->last_column =
p->userData->lineno =
p->userData->last_lineno = 1;
p->userData->tmp_tag = p->userData->tmp_attrs =
p->userData->tmp_attrval = p->userData->tmp_attrname = NULL;
p->scanner = NULL;
if (htmllexInit(&(p->scanner), p->userData)!=0) {
CLEAR_BUF(self->userData->buf);
CLEAR_BUF(self->userData->tmp_buf);
self->userData->bufpos =
self->userData->pos =
self->userData->nextpos = 0;
self->userData->column =
self->userData->last_column =
self->userData->lineno =
self->userData->last_lineno = 1;
self->userData->tmp_tag = self->userData->tmp_attrs =
self->userData->tmp_attrval = self->userData->tmp_attrname = NULL;
self->scanner = NULL;
if (htmllexInit(&(self->scanner), self->userData)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data");
return NULL;
}
Py_INCREF(Py_None);
return Py_None;
Py_RETURN_NONE;
}
/* set the debug level, if it is >0, debugging is on, =0 means off */
static PyObject* parser_debug(PyObject* self, PyObject* args) {
static PyObject* parser_debug (parser_object* self, PyObject* args) {
int debug;
parser_object* p = (parser_object*)self;
if (!PyArg_ParseTuple(args, "i", &debug)) {
return NULL;
}
yydebug = debug;
debug = htmllexDebug(&(p->scanner), debug);
debug = htmllexDebug(&(self->scanner), debug);
return PyInt_FromLong((long)debug);
}
/* type interface */
static PyMemberDef parser_members[] = {
{"handler", T_OBJECT_EX, offsetof(parser_object, handler), 0,
"handler class"},
{NULL} /* Sentinel */
};
static PyMethodDef parser_methods[] = {
{"feed", parser_feed, METH_VARARGS, "feed data to parse incremental"},
{"reset", parser_reset, METH_VARARGS, "reset the parser (no flushing)"},
{"flush", parser_flush, METH_VARARGS, "flush parser buffers"},
{"debug", parser_debug, METH_VARARGS, "set debug level"},
{"lineno", parser_lineno, METH_VARARGS, "get the current line number"},
{"last_lineno", parser_last_lineno, METH_VARARGS, "get the last line number"},
{"column", parser_column, METH_VARARGS, "get the current column"},
{"last_column", parser_last_column, METH_VARARGS, "get the last column"},
{"pos", parser_pos, METH_VARARGS, "get the current scanner position"},
{NULL, NULL}
{"feed", (PyCFunction)parser_feed, METH_VARARGS, "feed data to parse incremental"},
{"reset", (PyCFunction)parser_reset, METH_VARARGS, "reset the parser (no flushing)"},
{"flush", (PyCFunction)parser_flush, METH_VARARGS, "flush parser buffers"},
{"debug", (PyCFunction)parser_debug, METH_VARARGS, "set debug level"},
{"lineno", (PyCFunction)parser_lineno, METH_VARARGS, "get the current line number"},
{"last_lineno", (PyCFunction)parser_last_lineno, METH_VARARGS, "get the last line number"},
{"column", (PyCFunction)parser_column, METH_VARARGS, "get the current column"},
{"last_column", (PyCFunction)parser_last_column, METH_VARARGS, "get the last column"},
{"pos", (PyCFunction)parser_pos, METH_VARARGS, "get the current scanner position"},
{NULL} /* Sentinel */
};
static PyObject* parser_getattr(PyObject* self, char* name) {
return Py_FindMethod(parser_methods, self, name);
}
statichere PyTypeObject parser_type = {
static PyTypeObject parser_type = {
PyObject_HEAD_INIT(NULL)
0, /* ob_size */
"parser", /* tp_name */
0, /* ob_size */
"htmlsax.parser", /* tp_name */
sizeof(parser_object), /* tp_size */
0, /* tp_itemsize */
0, /* tp_itemsize */
/* methods */
(destructor)parser_dealloc, /* tp_dealloc */
0, /* tp_print */
(getattrfunc)parser_getattr, /* tp_getattr */
0 /* tp_setattr */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_compare */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Py_TPFLAGS_HAVE_GC, /* tp_flags */
"HTML parser object", /* tp_doc */
(traverseproc)parser_traverse, /* tp_traverse */
(inquiry)parser_clear, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
parser_methods, /* tp_methods */
parser_members, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)parser_init, /* tp_init */
0, /* tp_alloc */
parser_new, /* tp_new */
};
/* python module interface */
/* python module interface
"Create a new HTML parser object with given handler.\n"
"\n"
"Used callbacks (they don't have to be defined) of a handler are:\n"
"comment(data): <!--data-->\n"
"startElement(tag, attrs): <tag {attr1:value1,attr2:value2,..}>\n"
"endElement(tag): </tag>\n"
"doctype(data): <!DOCTYPE data?>\n"
"pi(name, data=None): <?name data?>\n"
"cdata(data): <![CDATA[data]]>\n"
"characters(data): data\n"
"\n"
"Additionally, there are error and warning callbacks:\n"
"error(msg)\n"
"warning(msg)\n"
"fatalError(msg)\n"},
*/
static PyMethodDef htmlsax_methods[] = {
{"parser", htmlsax_parser_new, METH_VARARGS,
"Create a new HTML parser object."},
{NULL, NULL}
{NULL} /* Sentinel */
};
/* initialization of the htmlsaxhtmlop module */
void inithtmlsax(void) {
if (!Py_InitModule("htmlsax", htmlsax_methods)) {
#ifndef PyMODINIT_FUNC /* declarations for DLL import/export */
#define PyMODINIT_FUNC void
#endif
/* initialization of the htmlsax module
   Readies the parser type, creates the module and publishes the type as
   htmlsax.parser. Returns early, leaving the exception set, if the type or
   the module cannot be set up. */
PyMODINIT_FUNC inithtmlsax (void) {
    PyObject* m;
    if (PyType_Ready(&parser_type) < 0) {
        return;
    }
    m = Py_InitModule3("htmlsax", htmlsax_methods, "SAX HTML parser routines");
    if (m == NULL) {
        return;
    }
    /* PyModule_AddObject steals this reference, but only on success */
    Py_INCREF(&parser_type);
    if (PyModule_AddObject(m, "parser", (PyObject *)&parser_type)==-1) {
        /* give back the reference that was not stolen */
        Py_DECREF(&parser_type);
        /* init error */
        PyErr_Print();
    }
}

View file

@ -17,7 +17,9 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/* Python module definition of a SAX html parser */
#include "htmlsax.h"
#include "structmember.h"
#include <string.h>
#include <stdio.h>
@ -65,6 +67,12 @@ static int yyerror (char* msg) {
if (b==NULL) return NULL; \
(b)[0] = '\0'
/* clear buffer b, returning NULL and decref self on error */
#define CLEAR_BUF_DECREF(self, b) \
b = PyMem_Resize(b, char, 1); \
if (b==NULL) { Py_DECREF(self); return NULL; } \
(b)[0] = '\0'
/* call error handler if error object is not NULL */
#define CHECK_ERROR(ud, label) \
if (ud->error && PyObject_HasAttrString(ud->handler, "error")==1) { \
@ -94,6 +102,7 @@ static int yyerror (char* msg) {
/* parser type definition */
typedef struct {
PyObject_HEAD
PyObject* handler;
UserData* userData;
void* scanner;
} parser_object;
@ -405,87 +414,152 @@ finish_characters:
#undef realloc
#undef free
/* create parser */
static PyObject* htmlsax_parser_new(PyObject* self, PyObject* args) {
PyObject* handler;
parser_object* p;
if (!PyArg_ParseTuple(args, "O", &handler)) {
PyErr_SetString(PyExc_TypeError, "SAX2 handler object arg required");
return NULL;
/* create parser object */
static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds) {
parser_object* self;
if ((self = (parser_object*) type->tp_alloc(type, 0)) == NULL)
{
return NULL;
}
p = PyObject_New(parser_object, &parser_type);
if (!p) {
PyErr_SetString(PyExc_TypeError, "Allocating parser object failed");
return NULL;
self->handler = NULL;
/* reset userData */
self->userData = PyMem_New(UserData, sizeof(UserData));
if (self->userData == NULL)
{
Py_DECREF(self);
return NULL;
}
self->userData->handler = NULL;
self->userData->buf = NULL;
CLEAR_BUF_DECREF(self, self->userData->buf);
self->userData->nextpos = 0;
self->userData->bufpos = 0;
self->userData->pos = 0;
self->userData->column = 1;
self->userData->last_column = 1;
self->userData->lineno = 1;
self->userData->last_lineno = 1;
self->userData->tmp_buf = NULL;
CLEAR_BUF_DECREF(self, self->userData->tmp_buf);
self->userData->tmp_tag = self->userData->tmp_attrname =
self->userData->tmp_attrval = self->userData->tmp_attrs =
self->userData->lexbuf = NULL;
self->userData->exc_type = NULL;
self->userData->exc_val = NULL;
self->userData->exc_tb = NULL;
self->userData->error = NULL;
self->scanner = NULL;
if (htmllexInit(&(self->scanner), self->userData)!=0)
{
Py_DECREF(self);
return NULL;
}
return (PyObject*) self;
}
/* initialize parser object */
static int parser_init (parser_object* self, PyObject* args, PyObject* kwds) {
PyObject* handler;
if (!PyArg_ParseTuple(args, "O", &handler)) {
return -1;
}
Py_INCREF(handler);
/* reset userData */
p->userData = PyMem_New(UserData, sizeof(UserData));
p->userData->handler = handler;
p->userData->buf = NULL;
CLEAR_BUF(p->userData->buf);
p->userData->nextpos =
p->userData->bufpos =
p->userData->pos =
p->userData->pos = 0;
p->userData->column =
p->userData->last_column =
p->userData->lineno =
p->userData->last_lineno = 1;
p->userData->tmp_buf = NULL;
CLEAR_BUF(p->userData->tmp_buf);
p->userData->tmp_tag = p->userData->tmp_attrname =
p->userData->tmp_attrval = p->userData->tmp_attrs =
p->userData->lexbuf = NULL;
p->userData->exc_type = NULL;
p->userData->exc_val = NULL;
p->userData->exc_tb = NULL;
p->userData->error = NULL;
p->scanner = NULL;
htmllexInit(&(p->scanner), p->userData);
return (PyObject*) p;
self->handler = handler;
self->userData->handler = self->handler;
return 0;
}
static void parser_dealloc (PyObject* self) {
parser_object* p = (parser_object*)self;
htmllexDestroy(p->scanner);
Py_DECREF(p->userData->handler);
PyMem_Del(p->userData->buf);
PyMem_Del(p->userData->tmp_buf);
PyMem_Del(p->userData);
PyMem_DEL(p);
/* GC traversal (tp_traverse): report every owned object that can take part
   in a reference cycle. The handler is the only such object.
   Returns -1 if the visit callback reports a negative value, else 0. */
static int parser_traverse (parser_object* self, visitproc visit, void* arg) {
    int status = 0;
    if (self->handler != NULL) {
        status = visit(self->handler, arg);
    }
    return (status < 0) ? -1 : 0;
}
/* flush parser buffers, isueing any remaining data as character data */
static PyObject* parser_flush (PyObject* self, PyObject* args) {
/* clear all used subobjects participating in reference cycles
   (tp_clear slot; also called from parser_dealloc).
   Releases the reference held in self->handler and resets the second,
   non-owning copy of that pointer stored in userData. Always returns 0. */
static int parser_clear (parser_object* self) {
    /* Detach the handler before releasing it: dropping the last reference
       can run arbitrary code, which must not see a dangling pointer. */
    PyObject* handler = self->handler;
    self->handler = NULL;
    /* userData is NULL when parser_new() failed to allocate it and then
       released the half-built object, so guard the dereference. */
    if (self->userData != NULL) {
        self->userData->handler = NULL;
    }
    Py_XDECREF(handler);
    return 0;
}
/* free all allocated resources of parser object (tp_dealloc slot).
   Safe to call on a partially constructed object, i.e. when parser_new()
   bailed out before userData or the scanner were set up. */
static void parser_dealloc (parser_object* self) {
    /* The type sets Py_TPFLAGS_HAVE_GC: stop the collector from traversing
       the object while its fields are being torn down. */
    PyObject_GC_UnTrack((PyObject*)self);
    if (self->scanner != NULL) {
        htmllexDestroy(self->scanner);
        self->scanner = NULL;
    }
    if (self->userData != NULL) {
        parser_clear(self);
        /* NOTE(review): parser_new() can release the object after the buf
           allocation failed but before tmp_buf was assigned; tmp_buf is
           then whatever PyMem_New left there -- confirm and initialize it
           earlier in parser_new(). */
        PyMem_Del(self->userData->buf);
        PyMem_Del(self->userData->tmp_buf);
        PyMem_Del(self->userData);
        self->userData = NULL;
    }
    else {
        /* no userData to reset, only the handler reference can be held */
        PyObject* handler = self->handler;
        self->handler = NULL;
        Py_XDECREF(handler);
    }
    self->ob_type->tp_free((PyObject*)self);
}
/* feed a chunk of data to the parser
   args: one string (read-only character buffer) holding the next chunk.
   Returns None on success. Returns NULL on failure: TypeError for a bad
   argument, MemoryError if the scanner could not be started or stopped,
   or the exception stored in userData while parsing. */
static PyObject* parser_feed (parser_object* self, PyObject* args) {
    /* set up the parse string */
    int slen = 0;
    char* s = NULL;
    if (!PyArg_ParseTuple(args, "t#", &s, &slen)) {
        PyErr_SetString(PyExc_TypeError, "string arg required");
        return NULL;
    }
    /* parse */
    if (htmllexStart(self->scanner, self->userData, s, slen)!=0) {
        PyErr_SetString(PyExc_MemoryError, "could not start scanner");
        return NULL;
    }
    if (yyparse(self->scanner)!=0) {
        int restored = 0;
        if (self->userData->exc_type!=NULL) {
            /* note: we give away these objects, so don't decref */
            PyErr_Restore(self->userData->exc_type,
                          self->userData->exc_val,
                          self->userData->exc_tb);
            restored = 1;
        }
        htmllexStop(self->scanner, self->userData);
        if (restored) {
            /* The references now belong to the interpreter's error state.
               Forget them so a later failure cannot hand the same
               (possibly freed) objects to PyErr_Restore again. */
            self->userData->exc_type = NULL;
            self->userData->exc_val = NULL;
            self->userData->exc_tb = NULL;
        }
        return NULL;
    }
    if (htmllexStop(self->scanner, self->userData)!=0) {
        PyErr_SetString(PyExc_MemoryError, "could not stop scanner");
        return NULL;
    }
    Py_RETURN_NONE;
}
/* flush all parser buffers */
static PyObject* parser_flush (parser_object* self, PyObject* args) {
int res = 0;
parser_object* p = (parser_object*)self;
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
/* reset parser variables */
CLEAR_BUF(p->userData->tmp_buf);
Py_XDECREF(p->userData->tmp_tag);
Py_XDECREF(p->userData->tmp_attrs);
Py_XDECREF(p->userData->tmp_attrval);
Py_XDECREF(p->userData->tmp_attrname);
p->userData->tmp_tag = p->userData->tmp_attrs =
p->userData->tmp_attrval = p->userData->tmp_attrname = NULL;
p->userData->bufpos = 0;
if (strlen(p->userData->buf)) {
CLEAR_BUF(self->userData->tmp_buf);
Py_XDECREF(self->userData->tmp_tag);
Py_XDECREF(self->userData->tmp_attrs);
Py_XDECREF(self->userData->tmp_attrval);
Py_XDECREF(self->userData->tmp_attrname);
self->userData->tmp_tag = self->userData->tmp_attrs =
self->userData->tmp_attrval = self->userData->tmp_attrname = NULL;
self->userData->bufpos = 0;
if (strlen(self->userData->buf)) {
// XXX set line, col
int error = 0;
PyObject* s = PyString_FromString(p->userData->buf);
PyObject* s = PyString_FromString(self->userData->buf);
PyObject* callback = NULL;
PyObject* result = NULL;
/* reset buffer */
CLEAR_BUF(p->userData->buf);
CLEAR_BUF(self->userData->buf);
if (s==NULL) { error=1; goto finish_flush; }
if (PyObject_HasAttrString(p->userData->handler, "characters")==1) {
callback = PyObject_GetAttrString(p->userData->handler, "characters");
if (PyObject_HasAttrString(self->handler, "characters")==1) {
callback = PyObject_GetAttrString(self->handler, "characters");
if (callback==NULL) { error=1; goto finish_flush; }
result = PyObject_CallFunction(callback, "O", s);
if (result==NULL) { error=1; goto finish_flush; }
@ -493,194 +567,223 @@ static PyObject* parser_flush (PyObject* self, PyObject* args) {
finish_flush:
Py_XDECREF(callback);
Py_XDECREF(result);
Py_XDECREF(s);
Py_XDECREF(s);
if (error==1) {
return NULL;
return NULL;
}
}
if (htmllexDestroy(self->scanner)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data");
return NULL;
}
self->scanner = NULL;
if (htmllexInit(&(self->scanner), self->userData)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data");
return NULL;
}
return Py_BuildValue("i", res);
}
/* return the current parser line number */
static PyObject* parser_lineno (PyObject* self, PyObject* args) {
parser_object* p = (parser_object*)self;
static PyObject* parser_lineno (parser_object* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
return Py_BuildValue("i", p->userData->lineno);
return Py_BuildValue("i", self->userData->lineno);
}
/* return the last parser line number */
static PyObject* parser_last_lineno (PyObject* self, PyObject* args) {
parser_object* p = (parser_object*)self;
static PyObject* parser_last_lineno (parser_object* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
return Py_BuildValue("i", p->userData->last_lineno);
return Py_BuildValue("i", self->userData->last_lineno);
}
/* return the current parser column number */
static PyObject* parser_column (PyObject* self, PyObject* args) {
parser_object* p = (parser_object*)self;
static PyObject* parser_column (parser_object* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
return Py_BuildValue("i", p->userData->column);
return Py_BuildValue("i", self->userData->column);
}
/* return the last parser column number */
/* Python-callable method: accepts no arguments and returns
   userData->last_column converted with the "i" (C int) format. */
/* NOTE(review): the user data struct listed further down appears to declare
   last_column as unsigned int, while "i" converts a signed int -- confirm
   the value can never exceed INT_MAX. */
/* NOTE(review): two variants are interleaved below -- one takes a generic
   PyObject* self and casts it to parser_object* p, the other takes
   parser_object* self directly; presumably old and new revisions. */
static PyObject* parser_last_column (PyObject* self, PyObject* args) {
parser_object* p = (parser_object*)self;
static PyObject* parser_last_column (parser_object* self, PyObject* args) {
/* the empty format string makes PyArg_ParseTuple reject any argument */
if (!PyArg_ParseTuple(args, "")) {
/* overrides the error set by PyArg_ParseTuple with a fixed message */
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
return Py_BuildValue("i", p->userData->last_column);
return Py_BuildValue("i", self->userData->last_column);
}
/* Python-callable method: accepts no arguments and returns userData->pos
   converted with the "i" (C int) format. */
/* NOTE(review): this span interleaves two variants of parser_pos (PyObject*
   self with a `p` cast vs. parser_object* self). Between the two return
   statements sits a complete parser_feed body written against `p` --
   presumably a revision that was moved elsewhere, since "feed" is still
   registered in parser_methods. */
static PyObject* parser_pos (PyObject* self, PyObject* args) {
parser_object* p = (parser_object*)self;
/* return the parser position in data stream */
static PyObject* parser_pos (parser_object* self, PyObject* args) {
/* the empty format string makes PyArg_ParseTuple reject any argument */
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
return Py_BuildValue("i", p->userData->pos);
}
/* feed a chunk of data to the parser */
/* Takes a single string-like argument. Returns None on success and NULL
   on any failure. */
static PyObject* parser_feed(PyObject* self, PyObject* args) {
/* set up the parse string */
int slen = 0;
char* s = NULL;
parser_object* p = (parser_object*)self;
/* "t#" stores a pointer to the argument's character data in s and its
   length in slen; the data is borrowed, not copied */
if (!PyArg_ParseTuple(args, "t#", &s, &slen)) {
PyErr_SetString(PyExc_TypeError, "string arg required");
return NULL;
}
/* parse */
/* presumably attaches the chunk to the scanner before parsing starts;
   a non-zero result is reported as MemoryError */
if (htmllexStart(p->scanner, p->userData, s, slen)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not start scanner");
return NULL;
}
/* a non-zero result from yyparse is treated as a parse failure */
if (yyparse(p->scanner)!=0) {
/* an exception saved in userData is re-raised as the current error */
if (p->userData->exc_type!=NULL) {
/* note: we give away these objects, so don't decref */
/* NOTE(review): the three userData fields are not reset to NULL here
   after the references are handed over -- verify they are cleared
   elsewhere before being reused. */
PyErr_Restore(p->userData->exc_type,
p->userData->exc_val,
p->userData->exc_tb);
}
/* NOTE(review): when exc_type is NULL this path returns NULL without
   setting an exception here -- confirm one is always set already. */
/* best-effort stop on the error path: the return value is ignored */
htmllexStop(p->scanner, p->userData);
return NULL;
}
if (htmllexStop(p->scanner, p->userData)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not stop scanner");
return NULL;
}
Py_INCREF(Py_None);
return Py_None;
return Py_BuildValue("i", self->userData->pos);
}
/* reset the parser. This will erase all buffered data! */
/* Python-callable method: accepts no arguments. Destroys the scanner,
   clears the buffers and position counters held in userData, creates a
   fresh scanner and returns None. Returns NULL with MemoryError set if the
   scanner cannot be destroyed or re-initialized. */
/* NOTE(review): two variants are interleaved below -- one works through a
   casted `p` pointer, the other through parser_object* self directly;
   presumably old and new revisions. */
static PyObject* parser_reset(PyObject* self, PyObject* args) {
parser_object* p = (parser_object*)self;
static PyObject* parser_reset (parser_object* self, PyObject* args) {
/* the empty format string makes PyArg_ParseTuple reject any argument */
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
/* tear down the existing scanner before touching the shared user data */
if (htmllexDestroy(p->scanner)!=0) {
if (htmllexDestroy(self->scanner)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data");
return NULL;
}
/* reset buffer */
CLEAR_BUF(p->userData->buf);
CLEAR_BUF(p->userData->tmp_buf);
/* buffer/stream positions restart at 0 */
p->userData->bufpos =
p->userData->pos =
p->userData->nextpos = 0;
/* line and column counters restart at 1 */
p->userData->column =
p->userData->last_column =
p->userData->lineno =
p->userData->last_lineno = 1;
/* NOTE(review): the tmp_* pointers are overwritten with NULL without a
   decref here -- presumably nothing is owned at this point; confirm. */
p->userData->tmp_tag = p->userData->tmp_attrs =
p->userData->tmp_attrval = p->userData->tmp_attrname = NULL;
/* drop the stale scanner pointer, then build a new scanner */
p->scanner = NULL;
if (htmllexInit(&(p->scanner), p->userData)!=0) {
CLEAR_BUF(self->userData->buf);
CLEAR_BUF(self->userData->tmp_buf);
/* buffer/stream positions restart at 0 */
self->userData->bufpos =
self->userData->pos =
self->userData->nextpos = 0;
/* line and column counters restart at 1 */
self->userData->column =
self->userData->last_column =
self->userData->lineno =
self->userData->last_lineno = 1;
self->userData->tmp_tag = self->userData->tmp_attrs =
self->userData->tmp_attrval = self->userData->tmp_attrname = NULL;
/* drop the stale scanner pointer, then build a new scanner */
self->scanner = NULL;
if (htmllexInit(&(self->scanner), self->userData)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data");
return NULL;
}
Py_INCREF(Py_None);
return Py_None;
Py_RETURN_NONE;
}
/* set the debug level: if it is >0, debugging is on; =0 means off */
/* Python-callable method: takes one integer argument, stores it in the
   global yydebug, passes it to htmllexDebug and returns that function's
   result as a Python integer. */
/* NOTE(review): two variants are interleaved below -- one works through a
   casted `p` pointer, the other through parser_object* self directly;
   presumably old and new revisions. */
static PyObject* parser_debug(PyObject* self, PyObject* args) {
static PyObject* parser_debug (parser_object* self, PyObject* args) {
int debug;
parser_object* p = (parser_object*)self;
/* on failure the error set by PyArg_ParseTuple is passed through as is */
if (!PyArg_ParseTuple(args, "i", &debug)) {
return NULL;
}
/* yydebug is a global, so this setting is not limited to this parser */
yydebug = debug;
/* debug is reused to hold the value reported back by htmllexDebug */
debug = htmllexDebug(&(p->scanner), debug);
debug = htmllexDebug(&(self->scanner), debug);
return PyInt_FromLong((long)debug);
}
/* type interface */
/* Attribute table referenced from the tp_members slot of parser_type.
   Exposes the `handler` field of parser_object as the Python attribute
   "handler"; T_OBJECT_EX raises AttributeError while the field is NULL,
   and flags 0 leaves the attribute writable. */
static PyMemberDef parser_members[] = {
{"handler", T_OBJECT_EX, offsetof(parser_object, handler), 0,
"handler class"},
{NULL} /* Sentinel */
};
/* Method table of the parser object: maps each Python-visible method name
   to its C implementation. All entries use METH_VARARGS, i.e. the
   functions receive (self, args-tuple). */
/* NOTE(review): every entry appears twice below -- once with the bare
   function pointer and once with a (PyCFunction) cast, followed by two
   different sentinel spellings; presumably old and new revisions. The cast
   is what lets functions declared with parser_object* self be stored in
   the table. */
static PyMethodDef parser_methods[] = {
{"feed", parser_feed, METH_VARARGS, "feed data to parse incremental"},
{"reset", parser_reset, METH_VARARGS, "reset the parser (no flushing)"},
{"flush", parser_flush, METH_VARARGS, "flush parser buffers"},
{"debug", parser_debug, METH_VARARGS, "set debug level"},
{"lineno", parser_lineno, METH_VARARGS, "get the current line number"},
{"last_lineno", parser_last_lineno, METH_VARARGS, "get the last line number"},
{"column", parser_column, METH_VARARGS, "get the current column"},
{"last_column", parser_last_column, METH_VARARGS, "get the last column"},
{"pos", parser_pos, METH_VARARGS, "get the current scanner position"},
{NULL, NULL}
{"feed", (PyCFunction)parser_feed, METH_VARARGS, "feed data to parse incremental"},
{"reset", (PyCFunction)parser_reset, METH_VARARGS, "reset the parser (no flushing)"},
{"flush", (PyCFunction)parser_flush, METH_VARARGS, "flush parser buffers"},
{"debug", (PyCFunction)parser_debug, METH_VARARGS, "set debug level"},
{"lineno", (PyCFunction)parser_lineno, METH_VARARGS, "get the current line number"},
{"last_lineno", (PyCFunction)parser_last_lineno, METH_VARARGS, "get the last line number"},
{"column", (PyCFunction)parser_column, METH_VARARGS, "get the current column"},
{"last_column", (PyCFunction)parser_last_column, METH_VARARGS, "get the last column"},
{"pos", (PyCFunction)parser_pos, METH_VARARGS, "get the current scanner position"},
{NULL} /* Sentinel */
};
/* Attribute lookup hook: resolves `name` against parser_methods via
   Py_FindMethod and returns the bound method (or NULL with an error set).
   Referenced from the tp_getattr slot in one of the parser_type variants;
   the other variant leaves that slot 0 and uses tp_methods instead. */
static PyObject* parser_getattr(PyObject* self, char* name) {
return Py_FindMethod(parser_methods, self, name);
}
/* Type object describing the parser to the Python runtime. */
/* NOTE(review): two initializers are interleaved below, presumably old and
   new revisions. The short one names the type "parser" and stops after
   tp_setattr, routing attribute access through parser_getattr. The long one
   names it "htmlsax.parser" and fills the slots through tp_new, wiring
   methods and members via tp_methods/tp_members. */
statichere PyTypeObject parser_type = {
static PyTypeObject parser_type = {
PyObject_HEAD_INIT(NULL)
0, /* ob_size */
"parser", /* tp_name */
0, /* ob_size */
"htmlsax.parser", /* tp_name */
sizeof(parser_object), /* tp_basicsize */
0, /* tp_itemsize */
0, /* tp_itemsize */
/* methods */
(destructor)parser_dealloc, /* tp_dealloc */
0, /* tp_print */
(getattrfunc)parser_getattr, /* tp_getattr */
0 /* tp_setattr */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_compare */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
/* BASETYPE permits subclassing from Python; HAVE_GC opts in to cyclic
   garbage collection, which is why tp_traverse and tp_clear are set */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Py_TPFLAGS_HAVE_GC, /* tp_flags */
"HTML parser object", /* tp_doc */
(traverseproc)parser_traverse, /* tp_traverse */
(inquiry)parser_clear, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
parser_methods, /* tp_methods */
parser_members, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)parser_init, /* tp_init */
0, /* tp_alloc */
parser_new, /* tp_new */
};
/* NOTE(review): the second comment below holds former docstring text for a
   parser factory entry (it still ends in a table-entry terminator); it is
   commented out, so none of it reaches Python at runtime. */
/* python module interface */
/* python module interface
"Create a new HTML parser object with given handler.\n"
"\n"
"Used callbacks (they don't have to be defined) of a handler are:\n"
"comment(data): <!--data-->\n"
"startElement(tag, attrs): <tag {attr1:value1,attr2:value2,..}>\n"
"endElement(tag): </tag>\n"
"doctype(data): <!DOCTYPE data?>\n"
"pi(name, data=None): <?name data?>\n"
"cdata(data): <![CDATA[data]]>\n"
"characters(data): data\n"
"\n"
"Additionally, there are error and warning callbacks:\n"
"error(msg)\n"
"warning(msg)\n"
"fatalError(msg)\n"},
*/
/* Module-level function table passed to the module init call. */
/* NOTE(review): two variants are interleaved -- one registers a "parser"
   factory function (htmlsax_parser_new), the other contains only the
   sentinel; in the latter case "parser" is published by inithtmlsax as the
   type object itself. Presumably old and new revisions. */
static PyMethodDef htmlsax_methods[] = {
{"parser", htmlsax_parser_new, METH_VARARGS,
"Create a new HTML parser object."},
{NULL, NULL}
{NULL} /* Sentinel */
};
/* initialization of the htmlsaxhtmlop module */
/* NOTE(review): two variants of the init function are interleaved below --
   a plain `void inithtmlsax` that only calls Py_InitModule, and a
   PyMODINIT_FUNC variant that also prepares and publishes parser_type;
   presumably old and new revisions. */
void inithtmlsax(void) {
if (!Py_InitModule("htmlsax", htmlsax_methods)) {
/* fallback so the file still builds when the Python headers do not
   provide PyMODINIT_FUNC */
#ifndef PyMODINIT_FUNC /* declarations for DLL import/export */
#define PyMODINIT_FUNC void
#endif
/* initialization of the htmlsax module */
/* Module entry point called by the interpreter on `import htmlsax`:
   finalizes parser_type, creates the module with its docstring and
   exposes the type as htmlsax.parser. Returns early, without creating
   the attribute, if the type or the module cannot be set up. */
PyMODINIT_FUNC inithtmlsax (void) {
PyObject* m;
/* the type object must be finalized before it is used or published */
if (PyType_Ready(&parser_type) < 0) {
return;
}
if ((m = Py_InitModule3("htmlsax", htmlsax_methods, "SAX HTML parser routines"))==NULL) {
return;
}
/* PyModule_AddObject steals a reference on success, so take one first */
Py_INCREF(&parser_type);
if (PyModule_AddObject(m, "parser", (PyObject *)&parser_type)==-1) {
/* init error */
/* NOTE(review): the failure is only printed; the reference taken above
   is not released here -- harmless for a static type, but confirm. */
PyErr_Print();
}
}

View file

@ -29,6 +29,11 @@
#error please install Python >= 2.3
#endif
/* this will be in Python 2.4 */
/* Fallback definition, used only when the Python headers do not already
   provide Py_RETURN_NONE: takes a new reference to None and returns it
   from the enclosing function. The do/while(0) wrapper makes the macro
   expand to a single statement, so it is safe in unbraced if/else arms. */
#ifndef Py_RETURN_NONE
#define Py_RETURN_NONE do {Py_INCREF(Py_None); return Py_None;} while (0)
#endif
/* user_data type for SAX calls */
typedef struct {
/* the Python SAX class instance to issue callbacks */
@ -52,12 +57,17 @@ typedef struct {
unsigned int column;
/* last value of column counter */
unsigned int last_column;
/* temporary vars */
/* input buffer of lexer, must be deleted when the parsing stops */
void* lexbuf;
/* temporary character buffer */
char* tmp_buf;
/* temporary HTML start or end tag name */
PyObject* tmp_tag;
/* temporary HTML start tag attribute name */
PyObject* tmp_attrname;
/* temporary HTML start tag attribute value */
PyObject* tmp_attrval;
/* temporary HTML start tag attribute list */
PyObject* tmp_attrs;
/* stored Python exception (if error occurred in scanner) */
PyObject* exc_type;