From a973d6e49c8c878e618eaeeefab4d7c63146833c Mon Sep 17 00:00:00 2001 From: calvin Date: Wed, 30 Apr 2003 13:57:28 +0000 Subject: [PATCH] sync with WebCleaner parser, minor cleanups git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@872 e7d03fd6-7b0d-0410-9947-9c21f3af8025 --- linkcheck/parser/htmlparse.c | 232 +++++++++++++++++++---------------- linkcheck/parser/htmlparse.y | 188 +++++++++++++++------------- 2 files changed, 224 insertions(+), 196 deletions(-) diff --git a/linkcheck/parser/htmlparse.c b/linkcheck/parser/htmlparse.c index 88d83698..3afe7480 100644 --- a/linkcheck/parser/htmlparse.c +++ b/linkcheck/parser/htmlparse.c @@ -98,14 +98,15 @@ /* extern functions found in htmllex.l */ extern int yylex(YYSTYPE* yylvalp, void* scanner); extern int htmllexInit (void** scanner, UserData* data); +extern int htmllexDebug (void** scanner, int debug); extern int htmllexStart (void* scanner, UserData* data, const char* s, int slen); extern int htmllexStop (void* scanner, UserData* data); extern int htmllexDestroy (void* scanner); extern void* yyget_extra(void*); extern int yyget_lineno(void*); #define YYERROR_VERBOSE 1 -/* standard error reporting, indicating an internal error */ +/* standard error reporting, indicating an internal error */ static int yyerror (char* msg) { fprintf(stderr, "htmlsax: internal parse error: %s\n", msg); return 0; @@ -184,7 +185,7 @@ staticforward PyTypeObject parser_type; /* Enabling traces. */ #ifndef YYDEBUG -# define YYDEBUG 0 +# define YYDEBUG 1 #endif /* Enabling verbose error messages. */ @@ -208,7 +209,7 @@ typedef int YYSTYPE; /* Line 214 of yacc.c. */ -#line 212 "htmlparse.c" +#line 213 "htmlparse.c" #if ! defined (yyoverflow) || YYERROR_VERBOSE @@ -378,8 +379,8 @@ static const yysigned_char yyrhs[] = /* YYRLINE[YYN] -- source line where rule number YYN was defined. */ static const unsigned short yyrline[] = { - 0, 123, 123, 124, 127, 128, 135, 169, 215, 245, - 265, 285, 305, 325, 346, 367 + 0, 124, 124, 125, 128, 129, 136, 170, 216, 246, + 266, 286, 306, 326, 347, 368 }; #endif @@ -1084,22 +1085,22 @@ yyreduce: switch (yyn) { case 2: -#line 123 "htmlparse.y" - {;} - break; - - case 3: #line 124 "htmlparse.y" {;} break; + case 3: +#line 125 "htmlparse.y" + {;} + break; + case 4: -#line 127 "htmlparse.y" +#line 128 "htmlparse.y" { YYACCEPT; /* wait for more lexer input */ ;} break; case 5: -#line 129 "htmlparse.y" +#line 130 "htmlparse.y" { /* an error occured in the scanner, the python exception must be set */ UserData* ud = yyget_extra(scanner); @@ -1109,7 +1110,7 @@ yyreduce: break; case 6: -#line 136 "htmlparse.y" +#line 137 "htmlparse.y" { /* $1 is a tuple (, ); is a dictionary */ UserData* ud = yyget_extra(scanner); @@ -1146,7 +1147,7 @@ finish_start: break; case 7: -#line 170 "htmlparse.y" +#line 171 "htmlparse.y" { /* $1 is a tuple (, ); is a dictionary */ UserData* ud = yyget_extra(scanner); @@ -1195,7 +1196,7 @@ finish_start_end: break; case 8: -#line 216 "htmlparse.y" +#line 217 "htmlparse.y" { UserData* ud = yyget_extra(scanner); PyObject* callback = NULL; @@ -1228,7 +1229,7 @@ finish_end: break; case 9: -#line 246 "htmlparse.y" +#line 247 "htmlparse.y" { UserData* ud = yyget_extra(scanner); PyObject* callback = NULL; @@ -1251,7 +1252,7 @@ finish_comment: break; case 10: -#line 266 "htmlparse.y" +#line 267 "htmlparse.y" { UserData* ud = yyget_extra(scanner); PyObject* callback = NULL; @@ -1274,7 +1275,7 @@ finish_pi: break; case 11: -#line 286 "htmlparse.y" +#line 287 "htmlparse.y" { UserData* ud = yyget_extra(scanner); PyObject* callback = NULL; @@ -1297,7 +1298,7 @@ finish_cdata: break; case 12: -#line 306 "htmlparse.y" +#line 307 "htmlparse.y" { UserData* ud = yyget_extra(scanner); PyObject* callback = NULL; @@ -1320,7 +1321,7 @@ finish_doctype: break; case 13: -#line 326 "htmlparse.y" +#line 327 "htmlparse.y" { UserData* ud = yyget_extra(scanner); PyObject* callback = NULL; @@ -1344,7 +1345,7 @@ finish_script: break; case 14: -#line 347 "htmlparse.y" +#line 348 "htmlparse.y" { UserData* ud = yyget_extra(scanner); PyObject* callback = NULL; @@ -1368,7 +1369,7 @@ finish_style: break; case 15: -#line 368 "htmlparse.y" +#line 369 "htmlparse.y" { /* Remember this is also called as a lexer error fallback */ UserData* ud = yyget_extra(scanner); @@ -1395,7 +1396,7 @@ finish_characters: } /* Line 999 of yacc.c. */ -#line 1399 "htmlparse.c" +#line 1400 "htmlparse.c" yyvsp -= yylen; yyssp -= yylen; @@ -1589,7 +1590,7 @@ yyreturn: } -#line 390 "htmlparse.y" +#line 391 "htmlparse.y" /* disable python memory interface */ @@ -1598,18 +1599,19 @@ yyreturn: #undef free /* create parser */ -static PyObject* htmlsax_parser(PyObject* self, PyObject* args) { +static PyObject* htmlsax_parser_new(PyObject* self, PyObject* args) { PyObject* handler; parser_object* p; if (!PyArg_ParseTuple(args, "O", &handler)) { PyErr_SetString(PyExc_TypeError, "SAX2 handler object arg required"); return NULL; } - Py_INCREF(handler); - if (!(p=PyObject_NEW(parser_object, &parser_type))) { + p = PyObject_New(parser_object, &parser_type); + if (!p) { PyErr_SetString(PyExc_TypeError, "Allocating parser object failed"); return NULL; } + Py_INCREF(handler); /* reset userData */ p->userData = PyMem_New(UserData, sizeof(UserData)); p->userData->handler = handler; @@ -1636,56 +1638,54 @@ static PyObject* htmlsax_parser(PyObject* self, PyObject* args) { } -static void parser_dealloc (parser_object* self) { - htmllexDestroy(self->scanner); - Py_DECREF(self->userData->handler); - PyMem_Del(self->userData->buf); - PyMem_Del(self->userData->tmp_buf); - PyMem_Del(self->userData); - PyMem_DEL(self); +static void parser_dealloc (PyObject* self) { + parser_object* p = (parser_object*)self; + htmllexDestroy(p->scanner); + Py_DECREF(p->userData->handler); + PyMem_Del(p->userData->buf); + PyMem_Del(p->userData->tmp_buf); + PyMem_Del(p->userData); + PyMem_DEL(p); } /* flush parser buffers, isueing any remaining data as character data */ -static PyObject* parser_flush (parser_object* self, PyObject* args) { +static PyObject* parser_flush (PyObject* self, PyObject* args) { int res=0; - int len = strlen(self->userData->buf); if (!PyArg_ParseTuple(args, "")) { PyErr_SetString(PyExc_TypeError, "no args required"); return NULL; } - /* update internal parser variables */ - if (len > self->userData->bufpos) { - self->userData->pos += len; - } - RESIZE_BUF(self->userData->tmp_buf); - Py_XDECREF(self->userData->tmp_tag); - Py_XDECREF(self->userData->tmp_attrs); - Py_XDECREF(self->userData->tmp_attrval); - Py_XDECREF(self->userData->tmp_attrname); - self->userData->tmp_tag = self->userData->tmp_attrs = - self->userData->tmp_attrval = self->userData->tmp_attrname = NULL; - if (len > 0) { + parser_object* p = (parser_object*)self; + /* reset parser variables */ + RESIZE_BUF(p->userData->tmp_buf); + Py_XDECREF(p->userData->tmp_tag); + Py_XDECREF(p->userData->tmp_attrs); + Py_XDECREF(p->userData->tmp_attrval); + Py_XDECREF(p->userData->tmp_attrname); + p->userData->tmp_tag = p->userData->tmp_attrs = + p->userData->tmp_attrval = p->userData->tmp_attrname = NULL; + p->userData->bufpos = 0; + if (strlen(p->userData->buf)) { // XXX set line, col int error = 0; - PyObject* s = PyString_FromString(self->userData->buf); + PyObject* s = PyString_FromString(p->userData->buf); PyObject* callback = NULL; PyObject* result = NULL; /* reset buffer */ - RESIZE_BUF(self->userData->buf); + RESIZE_BUF(p->userData->buf); if (s==NULL) { error=1; goto finish_flush; } - self->userData->bufpos = self->userData->nextpos = 0; - if (PyObject_HasAttrString(self->userData->handler, "characters")==1) { - callback = PyObject_GetAttrString(self->userData->handler, "characters"); + if (PyObject_HasAttrString(p->userData->handler, "characters")==1) { + callback = PyObject_GetAttrString(p->userData->handler, "characters"); if (callback==NULL) { error=1; goto finish_flush; } result = PyObject_CallFunction(callback, "O", s); if (result==NULL) { error=1; goto finish_flush; } } finish_flush: - Py_XDECREF(s); Py_XDECREF(callback); Py_XDECREF(result); - if (error) { + Py_XDECREF(s); + if (error==1) { return NULL; } } @@ -1694,56 +1694,61 @@ static PyObject* parser_flush (parser_object* self, PyObject* args) { /* return the current parser line number */ -static PyObject* parser_lineno (parser_object* self, PyObject* args) { +static PyObject* parser_lineno (PyObject* self, PyObject* args) { if (!PyArg_ParseTuple(args, "")) { PyErr_SetString(PyExc_TypeError, "no args required"); return NULL; } - return Py_BuildValue("i", self->userData->lineno); + parser_object* p = (parser_object*)self; + return Py_BuildValue("i", p->userData->lineno); } /* return the last parser line number */ -static PyObject* parser_last_lineno (parser_object* self, PyObject* args) { +static PyObject* parser_last_lineno (PyObject* self, PyObject* args) { if (!PyArg_ParseTuple(args, "")) { PyErr_SetString(PyExc_TypeError, "no args required"); return NULL; } - return Py_BuildValue("i", self->userData->last_lineno); + parser_object* p = (parser_object*)self; + return Py_BuildValue("i", p->userData->last_lineno); } /* return the current parser column number */ -static PyObject* parser_column (parser_object* self, PyObject* args) { +static PyObject* parser_column (PyObject* self, PyObject* args) { if (!PyArg_ParseTuple(args, "")) { PyErr_SetString(PyExc_TypeError, "no args required"); return NULL; } - return Py_BuildValue("i", self->userData->column); + parser_object* p = (parser_object*)self; + return Py_BuildValue("i", p->userData->column); } /* return the last parser column number */ -static PyObject* parser_last_column (parser_object* self, PyObject* args) { +static PyObject* parser_last_column (PyObject* self, PyObject* args) { if (!PyArg_ParseTuple(args, "")) { PyErr_SetString(PyExc_TypeError, "no args required"); return NULL; } - return Py_BuildValue("i", self->userData->last_column); + parser_object* p = (parser_object*)self; + return Py_BuildValue("i", p->userData->last_column); } -static PyObject* parser_pos (parser_object* self, PyObject* args) { +static PyObject* parser_pos (PyObject* self, PyObject* args) { if (!PyArg_ParseTuple(args, "")) { PyErr_SetString(PyExc_TypeError, "no args required"); return NULL; } - return Py_BuildValue("i", self->userData->pos); + parser_object* p = (parser_object*)self; + return Py_BuildValue("i", p->userData->pos); } /* feed a chunk of data to the parser */ -static PyObject* parser_feed (parser_object* self, PyObject* args) { +static PyObject* parser_feed(PyObject* self, PyObject* args) { /* set up the parse string */ int slen = 0; char* s = NULL; @@ -1751,22 +1756,23 @@ static PyObject* parser_feed (parser_object* self, PyObject* args) { PyErr_SetString(PyExc_TypeError, "string arg required"); return NULL; } - /* parse */ - if (htmllexStart(self->scanner, self->userData, s, slen)!=0) { + parser_object* p = (parser_object*)self; + if (htmllexStart(p->scanner, p->userData, s, slen)!=0) { PyErr_SetString(PyExc_MemoryError, "could not start scanner"); return NULL; } - if (yyparse(self->scanner)!=0) { - if (self->userData->exc_type!=NULL) { - /* note: we give away these objects, so dont decref */ - PyErr_Restore(self->userData->exc_type, - self->userData->exc_val, - self->userData->exc_tb); + if (yyparse(p->scanner)!=0) { + if (p->userData->exc_type!=NULL) { + /* note: we give away these objects, so don't decref */ + PyErr_Restore(p->userData->exc_type, + p->userData->exc_val, + p->userData->exc_tb); } + htmllexStop(p->scanner, p->userData); return NULL; } - if (htmllexStop(self->scanner, self->userData)!=0) { + if (htmllexStop(p->scanner, p->userData)!=0) { PyErr_SetString(PyExc_MemoryError, "could not stop scanner"); return NULL; } @@ -1776,29 +1782,30 @@ static PyObject* parser_feed (parser_object* self, PyObject* args) { /* reset the parser. This will erase all buffered data! */ -static PyObject* parser_reset(parser_object* self, PyObject* args) { +static PyObject* parser_reset(PyObject* self, PyObject* args) { if (!PyArg_ParseTuple(args, "")) { PyErr_SetString(PyExc_TypeError, "no args required"); return NULL; } - if (htmllexDestroy(self->scanner)!=0) { + parser_object* p = (parser_object*)self; + if (htmllexDestroy(p->scanner)!=0) { PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data"); return NULL; } /* reset buffer */ - RESIZE_BUF(self->userData->buf); - RESIZE_BUF(self->userData->tmp_buf); - self->userData->bufpos = - self->userData->pos = - self->userData->nextpos = 0; - self->userData->column = - self->userData->last_column = - self->userData->lineno = - self->userData->last_lineno = 1; - self->userData->tmp_tag = self->userData->tmp_attrs = - self->userData->tmp_attrval = self->userData->tmp_attrname = NULL; - self->scanner = NULL; - if (htmllexInit(&(self->scanner), self->userData)!=0) { + RESIZE_BUF(p->userData->buf); + RESIZE_BUF(p->userData->tmp_buf); + p->userData->bufpos = + p->userData->pos = + p->userData->nextpos = 0; + p->userData->column = + p->userData->last_column = + p->userData->lineno = + p->userData->last_lineno = 1; + p->userData->tmp_tag = p->userData->tmp_attrs = + p->userData->tmp_attrval = p->userData->tmp_attrname = NULL; + p->scanner = NULL; + if (htmllexInit(&(p->scanner), p->userData)!=0) { PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data"); return NULL; } @@ -1807,30 +1814,36 @@ static PyObject* parser_reset(parser_object* self, PyObject* args) { } +/* set the debug level, if its >0, debugging is on, =0 means off */ +static PyObject* parser_debug(PyObject* self, PyObject* args) { + int debug; + if (!PyArg_ParseTuple(args, "i", &debug)) { + return NULL; + } + yydebug = debug; + parser_object* p = (parser_object*)self; + debug = htmllexDebug(&(p->scanner), debug); + return PyInt_FromLong((long)debug); +} + + /* type interface */ static PyMethodDef parser_methods[] = { - /* incremental parsing */ - {"feed", (PyCFunction) parser_feed, METH_VARARGS}, - /* reset the parser (no flushing) */ - {"reset", (PyCFunction) parser_reset, METH_VARARGS}, - /* flush the parser buffers */ - {"flush", (PyCFunction) parser_flush, METH_VARARGS}, - /* get the current line number */ - {"lineno", (PyCFunction) parser_lineno, METH_VARARGS}, - /* get the last line number */ - {"last_lineno", (PyCFunction) parser_last_lineno, METH_VARARGS}, - /* get the current column */ - {"column", (PyCFunction) parser_column, METH_VARARGS}, - /* get the last column */ - {"last_column", (PyCFunction) parser_last_column, METH_VARARGS}, - /* get the current scanner position */ - {"pos", (PyCFunction) parser_pos, METH_VARARGS}, + {"feed", parser_feed, METH_VARARGS, "feed data to parse incremental"}, + {"reset", parser_reset, METH_VARARGS, "reset the parser (no flushing)"}, + {"flush", parser_flush, METH_VARARGS, "flush parser buffers"}, + {"debug", parser_debug, METH_VARARGS, "set debug level"}, + {"lineno", parser_lineno, METH_VARARGS, "get the current line number"}, + {"last_lineno", parser_last_lineno, METH_VARARGS, "get the last line number"}, + {"column", parser_column, METH_VARARGS, "get the current column"}, + {"last_column", parser_last_column, METH_VARARGS, "get the last column"}, + {"pos", parser_pos, METH_VARARGS, "get the current scanner position"}, {NULL, NULL} }; -static PyObject* parser_getattr(parser_object* self, char* name) { - return Py_FindMethod(parser_methods, (PyObject*) self, name); +static PyObject* parser_getattr(PyObject* self, char* name) { + return Py_FindMethod(parser_methods, self, name); } @@ -1850,7 +1863,8 @@ statichere PyTypeObject parser_type = { /* python module interface */ static PyMethodDef htmlsax_methods[] = { - {"parser", htmlsax_parser, METH_VARARGS}, + {"parser", htmlsax_parser_new, METH_VARARGS, + "Create a new HTML parser object."}, {NULL, NULL} }; diff --git a/linkcheck/parser/htmlparse.y b/linkcheck/parser/htmlparse.y index 1b141b49..c4272ec7 100644 --- a/linkcheck/parser/htmlparse.y +++ b/linkcheck/parser/htmlparse.y @@ -12,14 +12,15 @@ /* extern functions found in htmllex.l */ extern int yylex(YYSTYPE* yylvalp, void* scanner); extern int htmllexInit (void** scanner, UserData* data); +extern int htmllexDebug (void** scanner, int debug); extern int htmllexStart (void* scanner, UserData* data, const char* s, int slen); extern int htmllexStop (void* scanner, UserData* data); extern int htmllexDestroy (void* scanner); extern void* yyget_extra(void*); extern int yyget_lineno(void*); #define YYERROR_VERBOSE 1 -/* standard error reporting, indicating an internal error */ +/* standard error reporting, indicating an internal error */ static int yyerror (char* msg) { fprintf(stderr, "htmlsax: internal parse error: %s\n", msg); return 0; @@ -98,7 +99,7 @@ staticforward PyTypeObject parser_type; /* parser options */ %verbose -/*%debug*/ +%debug %defines %output="htmlparse.c" %pure_parser @@ -395,18 +396,19 @@ finish_characters: #undef free /* create parser */ -static PyObject* htmlsax_parser(PyObject* self, PyObject* args) { +static PyObject* htmlsax_parser_new(PyObject* self, PyObject* args) { PyObject* handler; parser_object* p; if (!PyArg_ParseTuple(args, "O", &handler)) { PyErr_SetString(PyExc_TypeError, "SAX2 handler object arg required"); return NULL; } - Py_INCREF(handler); - if (!(p=PyObject_NEW(parser_object, &parser_type))) { + p = PyObject_New(parser_object, &parser_type); + if (!p) { PyErr_SetString(PyExc_TypeError, "Allocating parser object failed"); return NULL; } + Py_INCREF(handler); /* reset userData */ p->userData = PyMem_New(UserData, sizeof(UserData)); p->userData->handler = handler; @@ -433,56 +435,54 @@ static PyObject* htmlsax_parser(PyObject* self, PyObject* args) { } -static void parser_dealloc (parser_object* self) { - htmllexDestroy(self->scanner); - Py_DECREF(self->userData->handler); - PyMem_Del(self->userData->buf); - PyMem_Del(self->userData->tmp_buf); - PyMem_Del(self->userData); - PyMem_DEL(self); +static void parser_dealloc (PyObject* self) { + parser_object* p = (parser_object*)self; + htmllexDestroy(p->scanner); + Py_DECREF(p->userData->handler); + PyMem_Del(p->userData->buf); + PyMem_Del(p->userData->tmp_buf); + PyMem_Del(p->userData); + PyMem_DEL(p); } /* flush parser buffers, isueing any remaining data as character data */ -static PyObject* parser_flush (parser_object* self, PyObject* args) { +static PyObject* parser_flush (PyObject* self, PyObject* args) { int res=0; - int len = strlen(self->userData->buf); if (!PyArg_ParseTuple(args, "")) { PyErr_SetString(PyExc_TypeError, "no args required"); return NULL; } - /* update internal parser variables */ - if (len > self->userData->bufpos) { - self->userData->pos += len; - } - RESIZE_BUF(self->userData->tmp_buf); - Py_XDECREF(self->userData->tmp_tag); - Py_XDECREF(self->userData->tmp_attrs); - Py_XDECREF(self->userData->tmp_attrval); - Py_XDECREF(self->userData->tmp_attrname); - self->userData->tmp_tag = self->userData->tmp_attrs = - self->userData->tmp_attrval = self->userData->tmp_attrname = NULL; - if (len > 0) { + parser_object* p = (parser_object*)self; + /* reset parser variables */ + RESIZE_BUF(p->userData->tmp_buf); + Py_XDECREF(p->userData->tmp_tag); + Py_XDECREF(p->userData->tmp_attrs); + Py_XDECREF(p->userData->tmp_attrval); + Py_XDECREF(p->userData->tmp_attrname); + p->userData->tmp_tag = p->userData->tmp_attrs = + p->userData->tmp_attrval = p->userData->tmp_attrname = NULL; + p->userData->bufpos = 0; + if (strlen(p->userData->buf)) { // XXX set line, col int error = 0; - PyObject* s = PyString_FromString(self->userData->buf); + PyObject* s = PyString_FromString(p->userData->buf); PyObject* callback = NULL; PyObject* result = NULL; /* reset buffer */ - RESIZE_BUF(self->userData->buf); + RESIZE_BUF(p->userData->buf); if (s==NULL) { error=1; goto finish_flush; } - self->userData->bufpos = self->userData->nextpos = 0; - if (PyObject_HasAttrString(self->userData->handler, "characters")==1) { - callback = PyObject_GetAttrString(self->userData->handler, "characters"); + if (PyObject_HasAttrString(p->userData->handler, "characters")==1) { + callback = PyObject_GetAttrString(p->userData->handler, "characters"); if (callback==NULL) { error=1; goto finish_flush; } result = PyObject_CallFunction(callback, "O", s); if (result==NULL) { error=1; goto finish_flush; } } finish_flush: - Py_XDECREF(s); Py_XDECREF(callback); Py_XDECREF(result); - if (error) { + Py_XDECREF(s); + if (error==1) { return NULL; } } @@ -491,56 +491,61 @@ static PyObject* parser_flush (parser_object* self, PyObject* args) { /* return the current parser line number */ -static PyObject* parser_lineno (parser_object* self, PyObject* args) { +static PyObject* parser_lineno (PyObject* self, PyObject* args) { if (!PyArg_ParseTuple(args, "")) { PyErr_SetString(PyExc_TypeError, "no args required"); return NULL; } - return Py_BuildValue("i", self->userData->lineno); + parser_object* p = (parser_object*)self; + return Py_BuildValue("i", p->userData->lineno); } /* return the last parser line number */ -static PyObject* parser_last_lineno (parser_object* self, PyObject* args) { +static PyObject* parser_last_lineno (PyObject* self, PyObject* args) { if (!PyArg_ParseTuple(args, "")) { PyErr_SetString(PyExc_TypeError, "no args required"); return NULL; } - return Py_BuildValue("i", self->userData->last_lineno); + parser_object* p = (parser_object*)self; + return Py_BuildValue("i", p->userData->last_lineno); } /* return the current parser column number */ -static PyObject* parser_column (parser_object* self, PyObject* args) { +static PyObject* parser_column (PyObject* self, PyObject* args) { if (!PyArg_ParseTuple(args, "")) { PyErr_SetString(PyExc_TypeError, "no args required"); return NULL; } - return Py_BuildValue("i", self->userData->column); + parser_object* p = (parser_object*)self; + return Py_BuildValue("i", p->userData->column); } /* return the last parser column number */ -static PyObject* parser_last_column (parser_object* self, PyObject* args) { +static PyObject* parser_last_column (PyObject* self, PyObject* args) { if (!PyArg_ParseTuple(args, "")) { PyErr_SetString(PyExc_TypeError, "no args required"); return NULL; } - return Py_BuildValue("i", self->userData->last_column); + parser_object* p = (parser_object*)self; + return Py_BuildValue("i", p->userData->last_column); } -static PyObject* parser_pos (parser_object* self, PyObject* args) { +static PyObject* parser_pos (PyObject* self, PyObject* args) { if (!PyArg_ParseTuple(args, "")) { PyErr_SetString(PyExc_TypeError, "no args required"); return NULL; } - return Py_BuildValue("i", self->userData->pos); + parser_object* p = (parser_object*)self; + return Py_BuildValue("i", p->userData->pos); } /* feed a chunk of data to the parser */ -static PyObject* parser_feed (parser_object* self, PyObject* args) { +static PyObject* parser_feed(PyObject* self, PyObject* args) { /* set up the parse string */ int slen = 0; char* s = NULL; @@ -548,22 +553,23 @@ static PyObject* parser_feed (parser_object* self, PyObject* args) { PyErr_SetString(PyExc_TypeError, "string arg required"); return NULL; } - /* parse */ - if (htmllexStart(self->scanner, self->userData, s, slen)!=0) { + parser_object* p = (parser_object*)self; + if (htmllexStart(p->scanner, p->userData, s, slen)!=0) { PyErr_SetString(PyExc_MemoryError, "could not start scanner"); return NULL; } - if (yyparse(self->scanner)!=0) { - if (self->userData->exc_type!=NULL) { - /* note: we give away these objects, so dont decref */ - PyErr_Restore(self->userData->exc_type, - self->userData->exc_val, - self->userData->exc_tb); + if (yyparse(p->scanner)!=0) { + if (p->userData->exc_type!=NULL) { + /* note: we give away these objects, so don't decref */ + PyErr_Restore(p->userData->exc_type, + p->userData->exc_val, + p->userData->exc_tb); } + htmllexStop(p->scanner, p->userData); return NULL; } - if (htmllexStop(self->scanner, self->userData)!=0) { + if (htmllexStop(p->scanner, p->userData)!=0) { PyErr_SetString(PyExc_MemoryError, "could not stop scanner"); return NULL; } @@ -573,29 +579,30 @@ static PyObject* parser_feed (parser_object* self, PyObject* args) { /* reset the parser. This will erase all buffered data! */ -static PyObject* parser_reset(parser_object* self, PyObject* args) { +static PyObject* parser_reset(PyObject* self, PyObject* args) { if (!PyArg_ParseTuple(args, "")) { PyErr_SetString(PyExc_TypeError, "no args required"); return NULL; } - if (htmllexDestroy(self->scanner)!=0) { + parser_object* p = (parser_object*)self; + if (htmllexDestroy(p->scanner)!=0) { PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data"); return NULL; } /* reset buffer */ - RESIZE_BUF(self->userData->buf); - RESIZE_BUF(self->userData->tmp_buf); - self->userData->bufpos = - self->userData->pos = - self->userData->nextpos = 0; - self->userData->column = - self->userData->last_column = - self->userData->lineno = - self->userData->last_lineno = 1; - self->userData->tmp_tag = self->userData->tmp_attrs = - self->userData->tmp_attrval = self->userData->tmp_attrname = NULL; - self->scanner = NULL; - if (htmllexInit(&(self->scanner), self->userData)!=0) { + RESIZE_BUF(p->userData->buf); + RESIZE_BUF(p->userData->tmp_buf); + p->userData->bufpos = + p->userData->pos = + p->userData->nextpos = 0; + p->userData->column = + p->userData->last_column = + p->userData->lineno = + p->userData->last_lineno = 1; + p->userData->tmp_tag = p->userData->tmp_attrs = + p->userData->tmp_attrval = p->userData->tmp_attrname = NULL; + p->scanner = NULL; + if (htmllexInit(&(p->scanner), p->userData)!=0) { PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data"); return NULL; } @@ -604,30 +611,36 @@ static PyObject* parser_reset(parser_object* self, PyObject* args) { } +/* set the debug level, if its >0, debugging is on, =0 means off */ +static PyObject* parser_debug(PyObject* self, PyObject* args) { + int debug; + if (!PyArg_ParseTuple(args, "i", &debug)) { + return NULL; + } + yydebug = debug; + parser_object* p = (parser_object*)self; + debug = htmllexDebug(&(p->scanner), debug); + return PyInt_FromLong((long)debug); +} + + /* type interface */ static PyMethodDef parser_methods[] = { - /* incremental parsing */ - {"feed", (PyCFunction) parser_feed, METH_VARARGS}, - /* reset the parser (no flushing) */ - {"reset", (PyCFunction) parser_reset, METH_VARARGS}, - /* flush the parser buffers */ - {"flush", (PyCFunction) parser_flush, METH_VARARGS}, - /* get the current line number */ - {"lineno", (PyCFunction) parser_lineno, METH_VARARGS}, - /* get the last line number */ - {"last_lineno", (PyCFunction) parser_last_lineno, METH_VARARGS}, - /* get the current column */ - {"column", (PyCFunction) parser_column, METH_VARARGS}, - /* get the last column */ - {"last_column", (PyCFunction) parser_last_column, METH_VARARGS}, - /* get the current scanner position */ - {"pos", (PyCFunction) parser_pos, METH_VARARGS}, + {"feed", parser_feed, METH_VARARGS, "feed data to parse incremental"}, + {"reset", parser_reset, METH_VARARGS, "reset the parser (no flushing)"}, + {"flush", parser_flush, METH_VARARGS, "flush parser buffers"}, + {"debug", parser_debug, METH_VARARGS, "set debug level"}, + {"lineno", parser_lineno, METH_VARARGS, "get the current line number"}, + {"last_lineno", parser_last_lineno, METH_VARARGS, "get the last line number"}, + {"column", parser_column, METH_VARARGS, "get the current column"}, + {"last_column", parser_last_column, METH_VARARGS, "get the last column"}, + {"pos", parser_pos, METH_VARARGS, "get the current scanner position"}, {NULL, NULL} }; -static PyObject* parser_getattr(parser_object* self, char* name) { - return Py_FindMethod(parser_methods, (PyObject*) self, name); +static PyObject* parser_getattr(PyObject* self, char* name) { + return Py_FindMethod(parser_methods, self, name); } @@ -647,7 +660,8 @@ statichere PyTypeObject parser_type = { /* python module interface */ static PyMethodDef htmlsax_methods[] = { - {"parser", htmlsax_parser, METH_VARARGS}, + {"parser", htmlsax_parser_new, METH_VARARGS, + "Create a new HTML parser object."}, {NULL, NULL} };