decode input strings, and return unicode strings

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1854 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2004-10-07 15:47:09 +00:00
parent 10209ae499
commit 94c605d476
2 changed files with 275 additions and 218 deletions

File diff suppressed because it is too large Load diff

View file

@ -191,6 +191,7 @@ static PyObject* list_dict;
/* instance layout of the Python-level parser type */
typedef struct {
PyObject_HEAD
/* handler object exposed through the "handler" attribute; parser_new
   initializes it to None, and the grammar actions look up callbacks
   such as start_element/end_element on it */
PyObject* handler;
/* encoding name exposed through the "encoding" attribute; must be a
   string object, parser_new sets it to "iso8859-1", and parser_flush
   uses it to decode the pending buffer into a unicode object */
PyObject* encoding;
/* data block handed to htmllexInit and read by the grammar actions;
   allocated in parser_new and released in parser_dealloc, it carries
   copies of the handler and encoding pointers */
UserData* userData;
/* opaque scanner handle set up by htmllexInit and torn down by
   htmllexDestroy */
void* scanner;
} parser_object;
@ -230,7 +231,7 @@ typedef int YYSTYPE;
/* Line 214 of yacc.c. */
#line 234 "htmlparse.c"
#line 235 "htmlparse.c"
#if ! defined (yyoverflow) || YYERROR_VERBOSE
@ -400,8 +401,8 @@ static const yysigned_char yyrhs[] =
/* YYRLINE[YYN] -- source line where rule number YYN was defined. */
static const unsigned short yyrline[] =
{
0, 144, 144, 145, 148, 149, 156, 191, 238, 269,
290, 311, 332, 353, 375, 397
0, 145, 145, 146, 149, 150, 157, 192, 242, 276,
297, 318, 339, 360, 385, 410
};
#endif
@ -1106,22 +1107,22 @@ yyreduce:
switch (yyn)
{
case 2:
#line 144 "htmlparse.y"
{;}
break;
case 3:
#line 145 "htmlparse.y"
{;}
break;
case 3:
#line 146 "htmlparse.y"
{;}
break;
case 4:
#line 148 "htmlparse.y"
#line 149 "htmlparse.y"
{ YYACCEPT; /* wait for more lexer input */ ;}
break;
case 5:
#line 150 "htmlparse.y"
#line 151 "htmlparse.y"
{
/* an error occurred in the scanner, the python exception must be set */
UserData* ud = yyget_extra(scanner);
@ -1131,10 +1132,10 @@ yyreduce:
break;
case 6:
#line 157 "htmlparse.y"
#line 158 "htmlparse.y"
{
/* $1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyString, <attrs> is a PyDict */
<tag> is a PyObject, <attrs> is a PyDict */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1169,17 +1170,17 @@ finish_start:
break;
case 7:
#line 192 "htmlparse.y"
#line 193 "htmlparse.y"
{
/* $1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyString, <attrs> is a PyDict */
<tag> is a PyObject, <attrs> is a PyDict */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
PyObject* tag = PyTuple_GET_ITEM(yyvsp[0], 0);
PyObject* attrs = PyTuple_GET_ITEM(yyvsp[0], 1);
int error = 0;
char* tagname;
PyObject* tagname = NULL;
if (!tag || !attrs) { error = 1; goto finish_start_end; }
if (PyObject_HasAttrString(ud->handler, "start_element")==1) {
callback = PyObject_GetAttrString(ud->handler, "start_element");
@ -1190,9 +1191,11 @@ finish_start:
Py_DECREF(result);
callback=result=NULL;
}
tagname = PyString_AS_STRING(tag);
/* encode tagname in ASCII, ignoring any unknown chars */
tagname = PyUnicode_AsEncodedString(tag, "ascii", "ignore");
if (tagname==NULL) { error=1; goto finish_start_end; }
if (PyObject_HasAttrString(ud->handler, "end_element")==1 &&
NO_HTML_END_TAG(tagname)) {
NO_HTML_END_TAG(PyString_AsString(tagname))) {
callback = PyObject_GetAttrString(ud->handler, "end_element");
if (callback==NULL) { error=1; goto finish_start_end; }
result = PyObject_CallFunction(callback, "O", tag);
@ -1208,6 +1211,7 @@ finish_start_end:
Py_XDECREF(callback);
Py_XDECREF(result);
Py_XDECREF(tag);
Py_XDECREF(tagname);
Py_XDECREF(attrs);
Py_DECREF(yyvsp[0]);
if (error) {
@ -1219,16 +1223,18 @@ finish_start_end:
break;
case 8:
#line 239 "htmlparse.y"
#line 243 "htmlparse.y"
{
/* $1 is a PyString */
/* $1 is a PyUnicode */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
char* tagname = PyString_AS_STRING(yyvsp[0]);
/* encode tagname in ASCII, ignoring any unknown chars */
PyObject* tagname = PyUnicode_AsEncodedString(yyvsp[0], "ascii", "ignore");
if (tagname==NULL) { error=1; goto finish_end; }
if (PyObject_HasAttrString(ud->handler, "end_element")==1 &&
NO_HTML_END_TAG(tagname)) {
NO_HTML_END_TAG(PyString_AsString(tagname))) {
callback = PyObject_GetAttrString(ud->handler, "end_element");
if (callback==NULL) { error=1; goto finish_end; }
result = PyObject_CallFunction(callback, "O", yyvsp[0]);
@ -1241,6 +1247,7 @@ finish_start_end:
finish_end:
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(tagname);
Py_XDECREF(callback);
Py_XDECREF(result);
Py_DECREF(yyvsp[0]);
@ -1253,9 +1260,9 @@ finish_end:
break;
case 9:
#line 270 "htmlparse.y"
#line 277 "htmlparse.y"
{
/* $1 is a PyString */
/* $1 is a PyUnicode */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1277,9 +1284,9 @@ finish_comment:
break;
case 10:
#line 291 "htmlparse.y"
#line 298 "htmlparse.y"
{
/* $1 is a PyString */
/* $1 is a PyUnicode */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1301,9 +1308,9 @@ finish_pi:
break;
case 11:
#line 312 "htmlparse.y"
#line 319 "htmlparse.y"
{
/* $1 is a PyString */
/* $1 is a PyUnicode */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1325,9 +1332,9 @@ finish_cdata:
break;
case 12:
#line 333 "htmlparse.y"
#line 340 "htmlparse.y"
{
/* $1 is a PyString */
/* $1 is a PyUnicode */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1349,20 +1356,23 @@ finish_doctype:
break;
case 13:
#line 354 "htmlparse.y"
#line 361 "htmlparse.y"
{
/* $1 is a PyString */
/* $1 is a PyUnicode */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
PyObject* script = PyUnicode_DecodeASCII("script", 6, "ignore");
if (script==NULL) { error=1; goto finish_script; }
CALLBACK(ud, "characters", "O", yyvsp[0], finish_script);
CALLBACK(ud, "end_element", "s", "script", finish_script);
CALLBACK(ud, "end_element", "O", script, finish_script);
CHECK_ERROR(ud, finish_script);
finish_script:
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(script);
Py_XDECREF(result);
Py_DECREF(yyvsp[0]);
if (error) {
@ -1374,20 +1384,23 @@ finish_script:
break;
case 14:
#line 376 "htmlparse.y"
#line 386 "htmlparse.y"
{
/* $1 is a PyString */
/* $1 is a PyUnicode */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
PyObject* style = PyUnicode_DecodeASCII("style", 5, "ignore");
if (style==NULL) { error=1; goto finish_style; }
CALLBACK(ud, "characters", "O", yyvsp[0], finish_style);
CALLBACK(ud, "end_element", "s", "style", finish_style);
CALLBACK(ud, "end_element", "O", style, finish_style);
CHECK_ERROR(ud, finish_style);
finish_style:
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(style);
Py_XDECREF(result);
Py_DECREF(yyvsp[0]);
if (error) {
@ -1399,9 +1412,9 @@ finish_style:
break;
case 15:
#line 398 "htmlparse.y"
#line 411 "htmlparse.y"
{
/* $1 is a PyString */
/* $1 is a PyUnicode */
/* Remember this is also called as a lexer error fallback */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1427,7 +1440,7 @@ finish_characters:
}
/* Line 999 of yacc.c. */
#line 1431 "htmlparse.c"
#line 1444 "htmlparse.c"
yyvsp -= yylen;
yyssp -= yylen;
@ -1621,7 +1634,7 @@ yyreturn:
}
#line 421 "htmlparse.y"
#line 434 "htmlparse.y"
/* disable python memory interface */
@ -1632,16 +1645,15 @@ yyreturn:
/* create parser object */
static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds) {
parser_object* self;
if ((self = (parser_object*) type->tp_alloc(type, 0)) == NULL)
{
if ((self = (parser_object*) type->tp_alloc(type, 0)) == NULL) {
return NULL;
}
Py_INCREF(Py_None);
self->handler = Py_None;
/* reset userData */
self->userData = PyMem_New(UserData, sizeof(UserData));
if (self->userData == NULL)
{
if (self->userData == NULL) {
Py_DECREF(self->handler);
Py_DECREF(self);
return NULL;
}
@ -1667,11 +1679,18 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
self->userData->exc_tb = NULL;
self->userData->error = NULL;
self->scanner = NULL;
if (htmllexInit(&(self->scanner), self->userData)!=0)
{
if (htmllexInit(&(self->scanner), self->userData)!=0) {
Py_DECREF(self->handler);
Py_DECREF(self);
return NULL;
}
self->encoding = PyString_FromString("iso8859-1");
if (self->encoding == NULL) {
Py_DECREF(self->handler);
Py_DECREF(self);
return NULL;
}
self->userData->encoding = self->encoding;
return (PyObject*) self;
}
@ -1705,9 +1724,9 @@ static int parser_traverse (parser_object* self, visitproc visit, void* arg) {
/* clear all used subobjects participating in reference cycles */
static int parser_clear (parser_object* self) {
self->userData->handler = NULL;
Py_XDECREF(self->handler);
self->handler = NULL;
self->userData->handler = NULL;
return 0;
}
@ -1716,6 +1735,9 @@ static int parser_clear (parser_object* self) {
static void parser_dealloc (parser_object* self) {
htmllexDestroy(self->scanner);
parser_clear(self);
self->userData->encoding = NULL;
Py_XDECREF(self->encoding);
self->encoding = NULL;
PyMem_Del(self->userData->buf);
PyMem_Del(self->userData->tmp_buf);
PyMem_Del(self->userData);
@ -1774,7 +1796,10 @@ static PyObject* parser_flush (parser_object* self, PyObject* args) {
if (strlen(self->userData->buf)) {
/* XXX set line, col */
int error = 0;
PyObject* s = PyString_FromString(self->userData->buf);
const char* enc = PyString_AsString(self->encoding);
PyObject* s = PyUnicode_Decode(self->userData->buf,
strlen(self->userData->buf),
enc, "ignore");
PyObject* callback = NULL;
PyObject* result = NULL;
/* reset buffer */
@ -1905,6 +1930,7 @@ static PyObject* parser_gethandler (parser_object* self, void* closure) {
return self->handler;
}
static int parser_sethandler (parser_object* self, PyObject* value, void* closure) {
if (value == NULL) {
PyErr_SetString(PyExc_TypeError, "Cannot delete parser handler");
@ -1913,10 +1939,34 @@ static int parser_sethandler (parser_object* self, PyObject* value, void* closur
Py_DECREF(self->handler);
Py_INCREF(value);
self->handler = value;
self->userData->handler = self->handler;
self->userData->handler = value;
return 0;
}
/* getter for the "encoding" attribute: hands out a new reference to the
   encoding object currently stored in the parser (closure is unused) */
static PyObject* parser_getencoding (parser_object* self, void* closure) {
    PyObject* current = self->encoding;
    Py_INCREF(current);
    return current;
}
/* setter for the "encoding" attribute.
   value must be a string object; deleting the attribute (value == NULL)
   or assigning a non-string raises TypeError.
   Returns 0 on success, -1 with a Python exception set on failure.
   closure is unused. */
static int parser_setencoding (parser_object* self, PyObject* value, void* closure) {
    PyObject* previous;
    if (value == NULL) {
        PyErr_SetString(PyExc_TypeError, "Cannot delete encoding");
        return -1;
    }
    if (!PyString_Check(value)) {
        PyErr_SetString(PyExc_TypeError, "encoding must be string");
        return -1;
    }
    /* Take the new reference and swap both pointers BEFORE dropping the
       old one: PyString_Check also accepts str subclasses, whose
       deallocation may run arbitrary code that must never observe a
       dangling self->encoding / userData->encoding. */
    Py_INCREF(value);
    previous = self->encoding;
    self->encoding = value;
    self->userData->encoding = value;
    Py_DECREF(previous);
    return 0;
}
/* type interface */
static PyMemberDef parser_members[] = {
@ -1926,19 +1976,21 @@ static PyMemberDef parser_members[] = {
/* computed attributes of the parser type; each entry is
   {name, getter, setter, docstring, closure} */
static PyGetSetDef parser_getset[] = {
    {
        "handler",
        (getter)parser_gethandler,
        (setter)parser_sethandler,
        "handler object",
        NULL
    },
    {
        "encoding",
        (getter)parser_getencoding,
        (setter)parser_setencoding,
        "encoding",
        NULL
    },
    {NULL} /* Sentinel */
};
static PyMethodDef parser_methods[] = {
{"feed", (PyCFunction)parser_feed, METH_VARARGS, "feed data to parse incremental"},
{"feed", (PyCFunction)parser_feed, METH_VARARGS, "feed data to parse incremental"},
{"reset", (PyCFunction)parser_reset, METH_VARARGS, "reset the parser (no flushing)"},
{"flush", (PyCFunction)parser_flush, METH_VARARGS, "flush parser buffers"},
{"debug", (PyCFunction)parser_debug, METH_VARARGS, "set debug level"},
{"lineno", (PyCFunction)parser_lineno, METH_VARARGS, "get the current line number"},
{"lineno", (PyCFunction)parser_lineno, METH_VARARGS, "get the current line number"},
{"last_lineno", (PyCFunction)parser_last_lineno, METH_VARARGS, "get the last line number"},
{"column", (PyCFunction)parser_column, METH_VARARGS, "get the current column"},
{"column", (PyCFunction)parser_column, METH_VARARGS, "get the current column"},
{"last_column", (PyCFunction)parser_last_column, METH_VARARGS, "get the last column"},
{"pos", (PyCFunction)parser_pos, METH_VARARGS, "get the current scanner position"},
{"pos", (PyCFunction)parser_pos, METH_VARARGS, "get the current scanner position"},
{NULL} /* Sentinel */
};