sync with WebCleaner parser, minor cleanups

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@872 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2003-04-30 13:57:28 +00:00
parent 5e4034a76c
commit a973d6e49c
2 changed files with 224 additions and 196 deletions

View file

@ -98,14 +98,15 @@
/* extern functions found in htmllex.l */
extern int yylex(YYSTYPE* yylvalp, void* scanner);
extern int htmllexInit (void** scanner, UserData* data);
extern int htmllexDebug (void** scanner, int debug);
extern int htmllexStart (void* scanner, UserData* data, const char* s, int slen);
extern int htmllexStop (void* scanner, UserData* data);
extern int htmllexDestroy (void* scanner);
extern void* yyget_extra(void*);
extern int yyget_lineno(void*);
#define YYERROR_VERBOSE 1
/* standard error reporting, indicating an internal error */
/* standard error reporting, indicating an internal error */
static int yyerror (char* msg) {
fprintf(stderr, "htmlsax: internal parse error: %s\n", msg);
return 0;
@ -184,7 +185,7 @@ staticforward PyTypeObject parser_type;
/* Enabling traces. */
#ifndef YYDEBUG
# define YYDEBUG 0
# define YYDEBUG 1
#endif
/* Enabling verbose error messages. */
@ -208,7 +209,7 @@ typedef int YYSTYPE;
/* Line 214 of yacc.c. */
#line 212 "htmlparse.c"
#line 213 "htmlparse.c"
#if ! defined (yyoverflow) || YYERROR_VERBOSE
@ -378,8 +379,8 @@ static const yysigned_char yyrhs[] =
/* YYRLINE[YYN] -- source line where rule number YYN was defined. */
static const unsigned short yyrline[] =
{
0, 123, 123, 124, 127, 128, 135, 169, 215, 245,
265, 285, 305, 325, 346, 367
0, 124, 124, 125, 128, 129, 136, 170, 216, 246,
266, 286, 306, 326, 347, 368
};
#endif
@ -1084,22 +1085,22 @@ yyreduce:
switch (yyn)
{
case 2:
#line 123 "htmlparse.y"
{;}
break;
case 3:
#line 124 "htmlparse.y"
{;}
break;
case 3:
#line 125 "htmlparse.y"
{;}
break;
case 4:
#line 127 "htmlparse.y"
#line 128 "htmlparse.y"
{ YYACCEPT; /* wait for more lexer input */ ;}
break;
case 5:
#line 129 "htmlparse.y"
#line 130 "htmlparse.y"
{
/* an error occured in the scanner, the python exception must be set */
UserData* ud = yyget_extra(scanner);
@ -1109,7 +1110,7 @@ yyreduce:
break;
case 6:
#line 136 "htmlparse.y"
#line 137 "htmlparse.y"
{
/* $1 is a tuple (<tag>, <attrs>); <attrs> is a dictionary */
UserData* ud = yyget_extra(scanner);
@ -1146,7 +1147,7 @@ finish_start:
break;
case 7:
#line 170 "htmlparse.y"
#line 171 "htmlparse.y"
{
/* $1 is a tuple (<tag>, <attrs>); <attrs> is a dictionary */
UserData* ud = yyget_extra(scanner);
@ -1195,7 +1196,7 @@ finish_start_end:
break;
case 8:
#line 216 "htmlparse.y"
#line 217 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1228,7 +1229,7 @@ finish_end:
break;
case 9:
#line 246 "htmlparse.y"
#line 247 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1251,7 +1252,7 @@ finish_comment:
break;
case 10:
#line 266 "htmlparse.y"
#line 267 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1274,7 +1275,7 @@ finish_pi:
break;
case 11:
#line 286 "htmlparse.y"
#line 287 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1297,7 +1298,7 @@ finish_cdata:
break;
case 12:
#line 306 "htmlparse.y"
#line 307 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1320,7 +1321,7 @@ finish_doctype:
break;
case 13:
#line 326 "htmlparse.y"
#line 327 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1344,7 +1345,7 @@ finish_script:
break;
case 14:
#line 347 "htmlparse.y"
#line 348 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1368,7 +1369,7 @@ finish_style:
break;
case 15:
#line 368 "htmlparse.y"
#line 369 "htmlparse.y"
{
/* Remember this is also called as a lexer error fallback */
UserData* ud = yyget_extra(scanner);
@ -1395,7 +1396,7 @@ finish_characters:
}
/* Line 999 of yacc.c. */
#line 1399 "htmlparse.c"
#line 1400 "htmlparse.c"
yyvsp -= yylen;
yyssp -= yylen;
@ -1589,7 +1590,7 @@ yyreturn:
}
#line 390 "htmlparse.y"
#line 391 "htmlparse.y"
/* disable python memory interface */
@ -1598,18 +1599,19 @@ yyreturn:
#undef free
/* create parser */
static PyObject* htmlsax_parser(PyObject* self, PyObject* args) {
static PyObject* htmlsax_parser_new(PyObject* self, PyObject* args) {
PyObject* handler;
parser_object* p;
if (!PyArg_ParseTuple(args, "O", &handler)) {
PyErr_SetString(PyExc_TypeError, "SAX2 handler object arg required");
return NULL;
}
Py_INCREF(handler);
if (!(p=PyObject_NEW(parser_object, &parser_type))) {
p = PyObject_New(parser_object, &parser_type);
if (!p) {
PyErr_SetString(PyExc_TypeError, "Allocating parser object failed");
return NULL;
}
Py_INCREF(handler);
/* reset userData */
p->userData = PyMem_New(UserData, sizeof(UserData));
p->userData->handler = handler;
@ -1636,56 +1638,54 @@ static PyObject* htmlsax_parser(PyObject* self, PyObject* args) {
}
static void parser_dealloc (parser_object* self) {
htmllexDestroy(self->scanner);
Py_DECREF(self->userData->handler);
PyMem_Del(self->userData->buf);
PyMem_Del(self->userData->tmp_buf);
PyMem_Del(self->userData);
PyMem_DEL(self);
static void parser_dealloc (PyObject* self) {
parser_object* p = (parser_object*)self;
htmllexDestroy(p->scanner);
Py_DECREF(p->userData->handler);
PyMem_Del(p->userData->buf);
PyMem_Del(p->userData->tmp_buf);
PyMem_Del(p->userData);
PyMem_DEL(p);
}
/* flush parser buffers, isueing any remaining data as character data */
static PyObject* parser_flush (parser_object* self, PyObject* args) {
static PyObject* parser_flush (PyObject* self, PyObject* args) {
int res=0;
int len = strlen(self->userData->buf);
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
/* update internal parser variables */
if (len > self->userData->bufpos) {
self->userData->pos += len;
}
RESIZE_BUF(self->userData->tmp_buf);
Py_XDECREF(self->userData->tmp_tag);
Py_XDECREF(self->userData->tmp_attrs);
Py_XDECREF(self->userData->tmp_attrval);
Py_XDECREF(self->userData->tmp_attrname);
self->userData->tmp_tag = self->userData->tmp_attrs =
self->userData->tmp_attrval = self->userData->tmp_attrname = NULL;
if (len > 0) {
parser_object* p = (parser_object*)self;
/* reset parser variables */
RESIZE_BUF(p->userData->tmp_buf);
Py_XDECREF(p->userData->tmp_tag);
Py_XDECREF(p->userData->tmp_attrs);
Py_XDECREF(p->userData->tmp_attrval);
Py_XDECREF(p->userData->tmp_attrname);
p->userData->tmp_tag = p->userData->tmp_attrs =
p->userData->tmp_attrval = p->userData->tmp_attrname = NULL;
p->userData->bufpos = 0;
if (strlen(p->userData->buf)) {
// XXX set line, col
int error = 0;
PyObject* s = PyString_FromString(self->userData->buf);
PyObject* s = PyString_FromString(p->userData->buf);
PyObject* callback = NULL;
PyObject* result = NULL;
/* reset buffer */
RESIZE_BUF(self->userData->buf);
RESIZE_BUF(p->userData->buf);
if (s==NULL) { error=1; goto finish_flush; }
self->userData->bufpos = self->userData->nextpos = 0;
if (PyObject_HasAttrString(self->userData->handler, "characters")==1) {
callback = PyObject_GetAttrString(self->userData->handler, "characters");
if (PyObject_HasAttrString(p->userData->handler, "characters")==1) {
callback = PyObject_GetAttrString(p->userData->handler, "characters");
if (callback==NULL) { error=1; goto finish_flush; }
result = PyObject_CallFunction(callback, "O", s);
if (result==NULL) { error=1; goto finish_flush; }
}
finish_flush:
Py_XDECREF(s);
Py_XDECREF(callback);
Py_XDECREF(result);
if (error) {
Py_XDECREF(s);
if (error==1) {
return NULL;
}
}
@ -1694,56 +1694,61 @@ static PyObject* parser_flush (parser_object* self, PyObject* args) {
/* return the current parser line number */
static PyObject* parser_lineno (parser_object* self, PyObject* args) {
static PyObject* parser_lineno (PyObject* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
return Py_BuildValue("i", self->userData->lineno);
parser_object* p = (parser_object*)self;
return Py_BuildValue("i", p->userData->lineno);
}
/* return the last parser line number */
static PyObject* parser_last_lineno (parser_object* self, PyObject* args) {
static PyObject* parser_last_lineno (PyObject* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
return Py_BuildValue("i", self->userData->last_lineno);
parser_object* p = (parser_object*)self;
return Py_BuildValue("i", p->userData->last_lineno);
}
/* return the current parser column number */
static PyObject* parser_column (parser_object* self, PyObject* args) {
static PyObject* parser_column (PyObject* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
return Py_BuildValue("i", self->userData->column);
parser_object* p = (parser_object*)self;
return Py_BuildValue("i", p->userData->column);
}
/* return the last parser column number */
static PyObject* parser_last_column (parser_object* self, PyObject* args) {
static PyObject* parser_last_column (PyObject* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
return Py_BuildValue("i", self->userData->last_column);
parser_object* p = (parser_object*)self;
return Py_BuildValue("i", p->userData->last_column);
}
static PyObject* parser_pos (parser_object* self, PyObject* args) {
static PyObject* parser_pos (PyObject* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
return Py_BuildValue("i", self->userData->pos);
parser_object* p = (parser_object*)self;
return Py_BuildValue("i", p->userData->pos);
}
/* feed a chunk of data to the parser */
static PyObject* parser_feed (parser_object* self, PyObject* args) {
static PyObject* parser_feed(PyObject* self, PyObject* args) {
/* set up the parse string */
int slen = 0;
char* s = NULL;
@ -1751,22 +1756,23 @@ static PyObject* parser_feed (parser_object* self, PyObject* args) {
PyErr_SetString(PyExc_TypeError, "string arg required");
return NULL;
}
/* parse */
if (htmllexStart(self->scanner, self->userData, s, slen)!=0) {
parser_object* p = (parser_object*)self;
if (htmllexStart(p->scanner, p->userData, s, slen)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not start scanner");
return NULL;
}
if (yyparse(self->scanner)!=0) {
if (self->userData->exc_type!=NULL) {
/* note: we give away these objects, so dont decref */
PyErr_Restore(self->userData->exc_type,
self->userData->exc_val,
self->userData->exc_tb);
if (yyparse(p->scanner)!=0) {
if (p->userData->exc_type!=NULL) {
/* note: we give away these objects, so don't decref */
PyErr_Restore(p->userData->exc_type,
p->userData->exc_val,
p->userData->exc_tb);
}
htmllexStop(p->scanner, p->userData);
return NULL;
}
if (htmllexStop(self->scanner, self->userData)!=0) {
if (htmllexStop(p->scanner, p->userData)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not stop scanner");
return NULL;
}
@ -1776,29 +1782,30 @@ static PyObject* parser_feed (parser_object* self, PyObject* args) {
/* reset the parser. This will erase all buffered data! */
static PyObject* parser_reset(parser_object* self, PyObject* args) {
static PyObject* parser_reset(PyObject* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
if (htmllexDestroy(self->scanner)!=0) {
parser_object* p = (parser_object*)self;
if (htmllexDestroy(p->scanner)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data");
return NULL;
}
/* reset buffer */
RESIZE_BUF(self->userData->buf);
RESIZE_BUF(self->userData->tmp_buf);
self->userData->bufpos =
self->userData->pos =
self->userData->nextpos = 0;
self->userData->column =
self->userData->last_column =
self->userData->lineno =
self->userData->last_lineno = 1;
self->userData->tmp_tag = self->userData->tmp_attrs =
self->userData->tmp_attrval = self->userData->tmp_attrname = NULL;
self->scanner = NULL;
if (htmllexInit(&(self->scanner), self->userData)!=0) {
RESIZE_BUF(p->userData->buf);
RESIZE_BUF(p->userData->tmp_buf);
p->userData->bufpos =
p->userData->pos =
p->userData->nextpos = 0;
p->userData->column =
p->userData->last_column =
p->userData->lineno =
p->userData->last_lineno = 1;
p->userData->tmp_tag = p->userData->tmp_attrs =
p->userData->tmp_attrval = p->userData->tmp_attrname = NULL;
p->scanner = NULL;
if (htmllexInit(&(p->scanner), p->userData)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data");
return NULL;
}
@ -1807,30 +1814,36 @@ static PyObject* parser_reset(parser_object* self, PyObject* args) {
}
/* set the debug level, if its >0, debugging is on, =0 means off */
static PyObject* parser_debug(PyObject* self, PyObject* args) {
int debug;
if (!PyArg_ParseTuple(args, "i", &debug)) {
return NULL;
}
yydebug = debug;
parser_object* p = (parser_object*)self;
debug = htmllexDebug(&(p->scanner), debug);
return PyInt_FromLong((long)debug);
}
/* type interface */
static PyMethodDef parser_methods[] = {
/* incremental parsing */
{"feed", (PyCFunction) parser_feed, METH_VARARGS},
/* reset the parser (no flushing) */
{"reset", (PyCFunction) parser_reset, METH_VARARGS},
/* flush the parser buffers */
{"flush", (PyCFunction) parser_flush, METH_VARARGS},
/* get the current line number */
{"lineno", (PyCFunction) parser_lineno, METH_VARARGS},
/* get the last line number */
{"last_lineno", (PyCFunction) parser_last_lineno, METH_VARARGS},
/* get the current column */
{"column", (PyCFunction) parser_column, METH_VARARGS},
/* get the last column */
{"last_column", (PyCFunction) parser_last_column, METH_VARARGS},
/* get the current scanner position */
{"pos", (PyCFunction) parser_pos, METH_VARARGS},
{"feed", parser_feed, METH_VARARGS, "feed data to parse incremental"},
{"reset", parser_reset, METH_VARARGS, "reset the parser (no flushing)"},
{"flush", parser_flush, METH_VARARGS, "flush parser buffers"},
{"debug", parser_debug, METH_VARARGS, "set debug level"},
{"lineno", parser_lineno, METH_VARARGS, "get the current line number"},
{"last_lineno", parser_last_lineno, METH_VARARGS, "get the last line number"},
{"column", parser_column, METH_VARARGS, "get the current column"},
{"last_column", parser_last_column, METH_VARARGS, "get the last column"},
{"pos", parser_pos, METH_VARARGS, "get the current scanner position"},
{NULL, NULL}
};
static PyObject* parser_getattr(parser_object* self, char* name) {
return Py_FindMethod(parser_methods, (PyObject*) self, name);
static PyObject* parser_getattr(PyObject* self, char* name) {
return Py_FindMethod(parser_methods, self, name);
}
@ -1850,7 +1863,8 @@ statichere PyTypeObject parser_type = {
/* python module interface */
static PyMethodDef htmlsax_methods[] = {
{"parser", htmlsax_parser, METH_VARARGS},
{"parser", htmlsax_parser_new, METH_VARARGS,
"Create a new HTML parser object."},
{NULL, NULL}
};

View file

@ -12,14 +12,15 @@
/* extern functions found in htmllex.l */
extern int yylex(YYSTYPE* yylvalp, void* scanner);
extern int htmllexInit (void** scanner, UserData* data);
extern int htmllexDebug (void** scanner, int debug);
extern int htmllexStart (void* scanner, UserData* data, const char* s, int slen);
extern int htmllexStop (void* scanner, UserData* data);
extern int htmllexDestroy (void* scanner);
extern void* yyget_extra(void*);
extern int yyget_lineno(void*);
#define YYERROR_VERBOSE 1
/* standard error reporting, indicating an internal error */
/* standard error reporting, indicating an internal error */
static int yyerror (char* msg) {
fprintf(stderr, "htmlsax: internal parse error: %s\n", msg);
return 0;
@ -98,7 +99,7 @@ staticforward PyTypeObject parser_type;
/* parser options */
%verbose
/*%debug*/
%debug
%defines
%output="htmlparse.c"
%pure_parser
@ -395,18 +396,19 @@ finish_characters:
#undef free
/* create parser */
static PyObject* htmlsax_parser(PyObject* self, PyObject* args) {
static PyObject* htmlsax_parser_new(PyObject* self, PyObject* args) {
PyObject* handler;
parser_object* p;
if (!PyArg_ParseTuple(args, "O", &handler)) {
PyErr_SetString(PyExc_TypeError, "SAX2 handler object arg required");
return NULL;
}
Py_INCREF(handler);
if (!(p=PyObject_NEW(parser_object, &parser_type))) {
p = PyObject_New(parser_object, &parser_type);
if (!p) {
PyErr_SetString(PyExc_TypeError, "Allocating parser object failed");
return NULL;
}
Py_INCREF(handler);
/* reset userData */
p->userData = PyMem_New(UserData, sizeof(UserData));
p->userData->handler = handler;
@ -433,56 +435,54 @@ static PyObject* htmlsax_parser(PyObject* self, PyObject* args) {
}
static void parser_dealloc (parser_object* self) {
htmllexDestroy(self->scanner);
Py_DECREF(self->userData->handler);
PyMem_Del(self->userData->buf);
PyMem_Del(self->userData->tmp_buf);
PyMem_Del(self->userData);
PyMem_DEL(self);
static void parser_dealloc (PyObject* self) {
parser_object* p = (parser_object*)self;
htmllexDestroy(p->scanner);
Py_DECREF(p->userData->handler);
PyMem_Del(p->userData->buf);
PyMem_Del(p->userData->tmp_buf);
PyMem_Del(p->userData);
PyMem_DEL(p);
}
/* flush parser buffers, isueing any remaining data as character data */
static PyObject* parser_flush (parser_object* self, PyObject* args) {
static PyObject* parser_flush (PyObject* self, PyObject* args) {
int res=0;
int len = strlen(self->userData->buf);
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
/* update internal parser variables */
if (len > self->userData->bufpos) {
self->userData->pos += len;
}
RESIZE_BUF(self->userData->tmp_buf);
Py_XDECREF(self->userData->tmp_tag);
Py_XDECREF(self->userData->tmp_attrs);
Py_XDECREF(self->userData->tmp_attrval);
Py_XDECREF(self->userData->tmp_attrname);
self->userData->tmp_tag = self->userData->tmp_attrs =
self->userData->tmp_attrval = self->userData->tmp_attrname = NULL;
if (len > 0) {
parser_object* p = (parser_object*)self;
/* reset parser variables */
RESIZE_BUF(p->userData->tmp_buf);
Py_XDECREF(p->userData->tmp_tag);
Py_XDECREF(p->userData->tmp_attrs);
Py_XDECREF(p->userData->tmp_attrval);
Py_XDECREF(p->userData->tmp_attrname);
p->userData->tmp_tag = p->userData->tmp_attrs =
p->userData->tmp_attrval = p->userData->tmp_attrname = NULL;
p->userData->bufpos = 0;
if (strlen(p->userData->buf)) {
// XXX set line, col
int error = 0;
PyObject* s = PyString_FromString(self->userData->buf);
PyObject* s = PyString_FromString(p->userData->buf);
PyObject* callback = NULL;
PyObject* result = NULL;
/* reset buffer */
RESIZE_BUF(self->userData->buf);
RESIZE_BUF(p->userData->buf);
if (s==NULL) { error=1; goto finish_flush; }
self->userData->bufpos = self->userData->nextpos = 0;
if (PyObject_HasAttrString(self->userData->handler, "characters")==1) {
callback = PyObject_GetAttrString(self->userData->handler, "characters");
if (PyObject_HasAttrString(p->userData->handler, "characters")==1) {
callback = PyObject_GetAttrString(p->userData->handler, "characters");
if (callback==NULL) { error=1; goto finish_flush; }
result = PyObject_CallFunction(callback, "O", s);
if (result==NULL) { error=1; goto finish_flush; }
}
finish_flush:
Py_XDECREF(s);
Py_XDECREF(callback);
Py_XDECREF(result);
if (error) {
Py_XDECREF(s);
if (error==1) {
return NULL;
}
}
@ -491,56 +491,61 @@ static PyObject* parser_flush (parser_object* self, PyObject* args) {
/* return the current parser line number */
static PyObject* parser_lineno (parser_object* self, PyObject* args) {
static PyObject* parser_lineno (PyObject* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
return Py_BuildValue("i", self->userData->lineno);
parser_object* p = (parser_object*)self;
return Py_BuildValue("i", p->userData->lineno);
}
/* return the last parser line number */
static PyObject* parser_last_lineno (parser_object* self, PyObject* args) {
static PyObject* parser_last_lineno (PyObject* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
return Py_BuildValue("i", self->userData->last_lineno);
parser_object* p = (parser_object*)self;
return Py_BuildValue("i", p->userData->last_lineno);
}
/* return the current parser column number */
static PyObject* parser_column (parser_object* self, PyObject* args) {
static PyObject* parser_column (PyObject* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
return Py_BuildValue("i", self->userData->column);
parser_object* p = (parser_object*)self;
return Py_BuildValue("i", p->userData->column);
}
/* return the last parser column number */
static PyObject* parser_last_column (parser_object* self, PyObject* args) {
static PyObject* parser_last_column (PyObject* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
return Py_BuildValue("i", self->userData->last_column);
parser_object* p = (parser_object*)self;
return Py_BuildValue("i", p->userData->last_column);
}
static PyObject* parser_pos (parser_object* self, PyObject* args) {
static PyObject* parser_pos (PyObject* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
return Py_BuildValue("i", self->userData->pos);
parser_object* p = (parser_object*)self;
return Py_BuildValue("i", p->userData->pos);
}
/* feed a chunk of data to the parser */
static PyObject* parser_feed (parser_object* self, PyObject* args) {
static PyObject* parser_feed(PyObject* self, PyObject* args) {
/* set up the parse string */
int slen = 0;
char* s = NULL;
@ -548,22 +553,23 @@ static PyObject* parser_feed (parser_object* self, PyObject* args) {
PyErr_SetString(PyExc_TypeError, "string arg required");
return NULL;
}
/* parse */
if (htmllexStart(self->scanner, self->userData, s, slen)!=0) {
parser_object* p = (parser_object*)self;
if (htmllexStart(p->scanner, p->userData, s, slen)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not start scanner");
return NULL;
}
if (yyparse(self->scanner)!=0) {
if (self->userData->exc_type!=NULL) {
/* note: we give away these objects, so dont decref */
PyErr_Restore(self->userData->exc_type,
self->userData->exc_val,
self->userData->exc_tb);
if (yyparse(p->scanner)!=0) {
if (p->userData->exc_type!=NULL) {
/* note: we give away these objects, so don't decref */
PyErr_Restore(p->userData->exc_type,
p->userData->exc_val,
p->userData->exc_tb);
}
htmllexStop(p->scanner, p->userData);
return NULL;
}
if (htmllexStop(self->scanner, self->userData)!=0) {
if (htmllexStop(p->scanner, p->userData)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not stop scanner");
return NULL;
}
@ -573,29 +579,30 @@ static PyObject* parser_feed (parser_object* self, PyObject* args) {
/* reset the parser. This will erase all buffered data! */
static PyObject* parser_reset(parser_object* self, PyObject* args) {
static PyObject* parser_reset(PyObject* self, PyObject* args) {
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
if (htmllexDestroy(self->scanner)!=0) {
parser_object* p = (parser_object*)self;
if (htmllexDestroy(p->scanner)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data");
return NULL;
}
/* reset buffer */
RESIZE_BUF(self->userData->buf);
RESIZE_BUF(self->userData->tmp_buf);
self->userData->bufpos =
self->userData->pos =
self->userData->nextpos = 0;
self->userData->column =
self->userData->last_column =
self->userData->lineno =
self->userData->last_lineno = 1;
self->userData->tmp_tag = self->userData->tmp_attrs =
self->userData->tmp_attrval = self->userData->tmp_attrname = NULL;
self->scanner = NULL;
if (htmllexInit(&(self->scanner), self->userData)!=0) {
RESIZE_BUF(p->userData->buf);
RESIZE_BUF(p->userData->tmp_buf);
p->userData->bufpos =
p->userData->pos =
p->userData->nextpos = 0;
p->userData->column =
p->userData->last_column =
p->userData->lineno =
p->userData->last_lineno = 1;
p->userData->tmp_tag = p->userData->tmp_attrs =
p->userData->tmp_attrval = p->userData->tmp_attrname = NULL;
p->scanner = NULL;
if (htmllexInit(&(p->scanner), p->userData)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data");
return NULL;
}
@ -604,30 +611,36 @@ static PyObject* parser_reset(parser_object* self, PyObject* args) {
}
/* set the debug level, if its >0, debugging is on, =0 means off */
static PyObject* parser_debug(PyObject* self, PyObject* args) {
int debug;
if (!PyArg_ParseTuple(args, "i", &debug)) {
return NULL;
}
yydebug = debug;
parser_object* p = (parser_object*)self;
debug = htmllexDebug(&(p->scanner), debug);
return PyInt_FromLong((long)debug);
}
/* type interface */
static PyMethodDef parser_methods[] = {
/* incremental parsing */
{"feed", (PyCFunction) parser_feed, METH_VARARGS},
/* reset the parser (no flushing) */
{"reset", (PyCFunction) parser_reset, METH_VARARGS},
/* flush the parser buffers */
{"flush", (PyCFunction) parser_flush, METH_VARARGS},
/* get the current line number */
{"lineno", (PyCFunction) parser_lineno, METH_VARARGS},
/* get the last line number */
{"last_lineno", (PyCFunction) parser_last_lineno, METH_VARARGS},
/* get the current column */
{"column", (PyCFunction) parser_column, METH_VARARGS},
/* get the last column */
{"last_column", (PyCFunction) parser_last_column, METH_VARARGS},
/* get the current scanner position */
{"pos", (PyCFunction) parser_pos, METH_VARARGS},
{"feed", parser_feed, METH_VARARGS, "feed data to parse incremental"},
{"reset", parser_reset, METH_VARARGS, "reset the parser (no flushing)"},
{"flush", parser_flush, METH_VARARGS, "flush parser buffers"},
{"debug", parser_debug, METH_VARARGS, "set debug level"},
{"lineno", parser_lineno, METH_VARARGS, "get the current line number"},
{"last_lineno", parser_last_lineno, METH_VARARGS, "get the last line number"},
{"column", parser_column, METH_VARARGS, "get the current column"},
{"last_column", parser_last_column, METH_VARARGS, "get the last column"},
{"pos", parser_pos, METH_VARARGS, "get the current scanner position"},
{NULL, NULL}
};
static PyObject* parser_getattr(parser_object* self, char* name) {
return Py_FindMethod(parser_methods, (PyObject*) self, name);
static PyObject* parser_getattr(PyObject* self, char* name) {
return Py_FindMethod(parser_methods, self, name);
}
@ -647,7 +660,8 @@ statichere PyTypeObject parser_type = {
/* python module interface */
static PyMethodDef htmlsax_methods[] = {
{"parser", htmlsax_parser, METH_VARARGS},
{"parser", htmlsax_parser_new, METH_VARARGS,
"Create a new HTML parser object."},
{NULL, NULL}
};