mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-29 18:44:43 +00:00
decode input strings, and return unicode strings
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1854 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
10209ae499
commit
94c605d476
2 changed files with 275 additions and 218 deletions
File diff suppressed because it is too large
Load diff
|
|
@ -191,6 +191,7 @@ static PyObject* list_dict;
|
|||
typedef struct {
|
||||
PyObject_HEAD
|
||||
PyObject* handler;
|
||||
PyObject* encoding;
|
||||
UserData* userData;
|
||||
void* scanner;
|
||||
} parser_object;
|
||||
|
|
@ -230,7 +231,7 @@ typedef int YYSTYPE;
|
|||
|
||||
|
||||
/* Line 214 of yacc.c. */
|
||||
#line 234 "htmlparse.c"
|
||||
#line 235 "htmlparse.c"
|
||||
|
||||
#if ! defined (yyoverflow) || YYERROR_VERBOSE
|
||||
|
||||
|
|
@ -400,8 +401,8 @@ static const yysigned_char yyrhs[] =
|
|||
/* YYRLINE[YYN] -- source line where rule number YYN was defined. */
|
||||
static const unsigned short yyrline[] =
|
||||
{
|
||||
0, 144, 144, 145, 148, 149, 156, 191, 238, 269,
|
||||
290, 311, 332, 353, 375, 397
|
||||
0, 145, 145, 146, 149, 150, 157, 192, 242, 276,
|
||||
297, 318, 339, 360, 385, 410
|
||||
};
|
||||
#endif
|
||||
|
||||
|
|
@ -1106,22 +1107,22 @@ yyreduce:
|
|||
switch (yyn)
|
||||
{
|
||||
case 2:
|
||||
#line 144 "htmlparse.y"
|
||||
{;}
|
||||
break;
|
||||
|
||||
case 3:
|
||||
#line 145 "htmlparse.y"
|
||||
{;}
|
||||
break;
|
||||
|
||||
case 3:
|
||||
#line 146 "htmlparse.y"
|
||||
{;}
|
||||
break;
|
||||
|
||||
case 4:
|
||||
#line 148 "htmlparse.y"
|
||||
#line 149 "htmlparse.y"
|
||||
{ YYACCEPT; /* wait for more lexer input */ ;}
|
||||
break;
|
||||
|
||||
case 5:
|
||||
#line 150 "htmlparse.y"
|
||||
#line 151 "htmlparse.y"
|
||||
{
|
||||
/* an error occured in the scanner, the python exception must be set */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
|
|
@ -1131,10 +1132,10 @@ yyreduce:
|
|||
break;
|
||||
|
||||
case 6:
|
||||
#line 157 "htmlparse.y"
|
||||
#line 158 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyTuple (<tag>, <attrs>)
|
||||
<tag> is a PyString, <attrs> is a PyDict */
|
||||
<tag> is a PyObject, <attrs> is a PyDict */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -1169,17 +1170,17 @@ finish_start:
|
|||
break;
|
||||
|
||||
case 7:
|
||||
#line 192 "htmlparse.y"
|
||||
#line 193 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyTuple (<tag>, <attrs>)
|
||||
<tag> is a PyString, <attrs> is a PyDict */
|
||||
<tag> is a PyObject, <attrs> is a PyDict */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
PyObject* tag = PyTuple_GET_ITEM(yyvsp[0], 0);
|
||||
PyObject* attrs = PyTuple_GET_ITEM(yyvsp[0], 1);
|
||||
int error = 0;
|
||||
char* tagname;
|
||||
PyObject* tagname = NULL;
|
||||
if (!tag || !attrs) { error = 1; goto finish_start_end; }
|
||||
if (PyObject_HasAttrString(ud->handler, "start_element")==1) {
|
||||
callback = PyObject_GetAttrString(ud->handler, "start_element");
|
||||
|
|
@ -1190,9 +1191,11 @@ finish_start:
|
|||
Py_DECREF(result);
|
||||
callback=result=NULL;
|
||||
}
|
||||
tagname = PyString_AS_STRING(tag);
|
||||
/* encode tagname in ASCII, ignoring any unknown chars */
|
||||
tagname = PyUnicode_AsEncodedString(tag, "ascii", "ignore");
|
||||
if (tagname==NULL) { error=1; goto finish_start_end; }
|
||||
if (PyObject_HasAttrString(ud->handler, "end_element")==1 &&
|
||||
NO_HTML_END_TAG(tagname)) {
|
||||
NO_HTML_END_TAG(PyString_AsString(tagname))) {
|
||||
callback = PyObject_GetAttrString(ud->handler, "end_element");
|
||||
if (callback==NULL) { error=1; goto finish_start_end; }
|
||||
result = PyObject_CallFunction(callback, "O", tag);
|
||||
|
|
@ -1208,6 +1211,7 @@ finish_start_end:
|
|||
Py_XDECREF(callback);
|
||||
Py_XDECREF(result);
|
||||
Py_XDECREF(tag);
|
||||
Py_XDECREF(tagname);
|
||||
Py_XDECREF(attrs);
|
||||
Py_DECREF(yyvsp[0]);
|
||||
if (error) {
|
||||
|
|
@ -1219,16 +1223,18 @@ finish_start_end:
|
|||
break;
|
||||
|
||||
case 8:
|
||||
#line 239 "htmlparse.y"
|
||||
#line 243 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
/* $1 is a PyUnicode */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
int error = 0;
|
||||
char* tagname = PyString_AS_STRING(yyvsp[0]);
|
||||
/* encode tagname in ASCII, ignoring any unknown chars */
|
||||
PyObject* tagname = PyUnicode_AsEncodedString(yyvsp[0], "ascii", "ignore");
|
||||
if (tagname==NULL) { error=1; goto finish_end; }
|
||||
if (PyObject_HasAttrString(ud->handler, "end_element")==1 &&
|
||||
NO_HTML_END_TAG(tagname)) {
|
||||
NO_HTML_END_TAG(PyString_AsString(tagname))) {
|
||||
callback = PyObject_GetAttrString(ud->handler, "end_element");
|
||||
if (callback==NULL) { error=1; goto finish_end; }
|
||||
result = PyObject_CallFunction(callback, "O", yyvsp[0]);
|
||||
|
|
@ -1241,6 +1247,7 @@ finish_start_end:
|
|||
finish_end:
|
||||
Py_XDECREF(ud->error);
|
||||
ud->error = NULL;
|
||||
Py_XDECREF(tagname);
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(result);
|
||||
Py_DECREF(yyvsp[0]);
|
||||
|
|
@ -1253,9 +1260,9 @@ finish_end:
|
|||
break;
|
||||
|
||||
case 9:
|
||||
#line 270 "htmlparse.y"
|
||||
#line 277 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
/* $1 is a PyUnicode */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -1277,9 +1284,9 @@ finish_comment:
|
|||
break;
|
||||
|
||||
case 10:
|
||||
#line 291 "htmlparse.y"
|
||||
#line 298 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
/* $1 is a PyUnicode */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -1301,9 +1308,9 @@ finish_pi:
|
|||
break;
|
||||
|
||||
case 11:
|
||||
#line 312 "htmlparse.y"
|
||||
#line 319 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
/* $1 is a PyUnicode */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -1325,9 +1332,9 @@ finish_cdata:
|
|||
break;
|
||||
|
||||
case 12:
|
||||
#line 333 "htmlparse.y"
|
||||
#line 340 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
/* $1 is a PyUnicode */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -1349,20 +1356,23 @@ finish_doctype:
|
|||
break;
|
||||
|
||||
case 13:
|
||||
#line 354 "htmlparse.y"
|
||||
#line 361 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
/* $1 is a PyUnicode */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
int error = 0;
|
||||
PyObject* script = PyUnicode_DecodeASCII("script", 6, "ignore");
|
||||
if (script==NULL) { error=1; goto finish_script; }
|
||||
CALLBACK(ud, "characters", "O", yyvsp[0], finish_script);
|
||||
CALLBACK(ud, "end_element", "s", "script", finish_script);
|
||||
CALLBACK(ud, "end_element", "O", script, finish_script);
|
||||
CHECK_ERROR(ud, finish_script);
|
||||
finish_script:
|
||||
Py_XDECREF(ud->error);
|
||||
ud->error = NULL;
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(script);
|
||||
Py_XDECREF(result);
|
||||
Py_DECREF(yyvsp[0]);
|
||||
if (error) {
|
||||
|
|
@ -1374,20 +1384,23 @@ finish_script:
|
|||
break;
|
||||
|
||||
case 14:
|
||||
#line 376 "htmlparse.y"
|
||||
#line 386 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
/* $1 is a PyUnicode */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
int error = 0;
|
||||
PyObject* style = PyUnicode_DecodeASCII("style", 5, "ignore");
|
||||
if (style==NULL) { error=1; goto finish_style; }
|
||||
CALLBACK(ud, "characters", "O", yyvsp[0], finish_style);
|
||||
CALLBACK(ud, "end_element", "s", "style", finish_style);
|
||||
CALLBACK(ud, "end_element", "O", style, finish_style);
|
||||
CHECK_ERROR(ud, finish_style);
|
||||
finish_style:
|
||||
Py_XDECREF(ud->error);
|
||||
ud->error = NULL;
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(style);
|
||||
Py_XDECREF(result);
|
||||
Py_DECREF(yyvsp[0]);
|
||||
if (error) {
|
||||
|
|
@ -1399,9 +1412,9 @@ finish_style:
|
|||
break;
|
||||
|
||||
case 15:
|
||||
#line 398 "htmlparse.y"
|
||||
#line 411 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
/* $1 is a PyUnicode */
|
||||
/* Remember this is also called as a lexer error fallback */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
|
|
@ -1427,7 +1440,7 @@ finish_characters:
|
|||
}
|
||||
|
||||
/* Line 999 of yacc.c. */
|
||||
#line 1431 "htmlparse.c"
|
||||
#line 1444 "htmlparse.c"
|
||||
|
||||
yyvsp -= yylen;
|
||||
yyssp -= yylen;
|
||||
|
|
@ -1621,7 +1634,7 @@ yyreturn:
|
|||
}
|
||||
|
||||
|
||||
#line 421 "htmlparse.y"
|
||||
#line 434 "htmlparse.y"
|
||||
|
||||
|
||||
/* disable python memory interface */
|
||||
|
|
@ -1632,16 +1645,15 @@ yyreturn:
|
|||
/* create parser object */
|
||||
static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds) {
|
||||
parser_object* self;
|
||||
if ((self = (parser_object*) type->tp_alloc(type, 0)) == NULL)
|
||||
{
|
||||
if ((self = (parser_object*) type->tp_alloc(type, 0)) == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
Py_INCREF(Py_None);
|
||||
self->handler = Py_None;
|
||||
/* reset userData */
|
||||
self->userData = PyMem_New(UserData, sizeof(UserData));
|
||||
if (self->userData == NULL)
|
||||
{
|
||||
if (self->userData == NULL) {
|
||||
Py_DECREF(self->handler);
|
||||
Py_DECREF(self);
|
||||
return NULL;
|
||||
}
|
||||
|
|
@ -1667,11 +1679,18 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
|
|||
self->userData->exc_tb = NULL;
|
||||
self->userData->error = NULL;
|
||||
self->scanner = NULL;
|
||||
if (htmllexInit(&(self->scanner), self->userData)!=0)
|
||||
{
|
||||
if (htmllexInit(&(self->scanner), self->userData)!=0) {
|
||||
Py_DECREF(self->handler);
|
||||
Py_DECREF(self);
|
||||
return NULL;
|
||||
}
|
||||
self->encoding = PyString_FromString("iso8859-1");
|
||||
if (self->encoding == NULL) {
|
||||
Py_DECREF(self->handler);
|
||||
Py_DECREF(self);
|
||||
return NULL;
|
||||
}
|
||||
self->userData->encoding = self->encoding;
|
||||
return (PyObject*) self;
|
||||
}
|
||||
|
||||
|
|
@ -1705,9 +1724,9 @@ static int parser_traverse (parser_object* self, visitproc visit, void* arg) {
|
|||
|
||||
/* clear all used subobjects participating in reference cycles */
|
||||
static int parser_clear (parser_object* self) {
|
||||
self->userData->handler = NULL;
|
||||
Py_XDECREF(self->handler);
|
||||
self->handler = NULL;
|
||||
self->userData->handler = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
@ -1716,6 +1735,9 @@ static int parser_clear (parser_object* self) {
|
|||
static void parser_dealloc (parser_object* self) {
|
||||
htmllexDestroy(self->scanner);
|
||||
parser_clear(self);
|
||||
self->userData->encoding = NULL;
|
||||
Py_XDECREF(self->encoding);
|
||||
self->encoding = NULL;
|
||||
PyMem_Del(self->userData->buf);
|
||||
PyMem_Del(self->userData->tmp_buf);
|
||||
PyMem_Del(self->userData);
|
||||
|
|
@ -1774,7 +1796,10 @@ static PyObject* parser_flush (parser_object* self, PyObject* args) {
|
|||
if (strlen(self->userData->buf)) {
|
||||
/* XXX set line, col */
|
||||
int error = 0;
|
||||
PyObject* s = PyString_FromString(self->userData->buf);
|
||||
const char* enc = PyString_AsString(self->encoding);
|
||||
PyObject* s = PyUnicode_Decode(self->userData->buf,
|
||||
strlen(self->userData->buf),
|
||||
enc, "ignore");
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
/* reset buffer */
|
||||
|
|
@ -1905,6 +1930,7 @@ static PyObject* parser_gethandler (parser_object* self, void* closure) {
|
|||
return self->handler;
|
||||
}
|
||||
|
||||
|
||||
static int parser_sethandler (parser_object* self, PyObject* value, void* closure) {
|
||||
if (value == NULL) {
|
||||
PyErr_SetString(PyExc_TypeError, "Cannot delete parser handler");
|
||||
|
|
@ -1913,10 +1939,34 @@ static int parser_sethandler (parser_object* self, PyObject* value, void* closur
|
|||
Py_DECREF(self->handler);
|
||||
Py_INCREF(value);
|
||||
self->handler = value;
|
||||
self->userData->handler = self->handler;
|
||||
self->userData->handler = value;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static PyObject* parser_getencoding (parser_object* self, void* closure) {
|
||||
Py_INCREF(self->encoding);
|
||||
return self->encoding;
|
||||
}
|
||||
|
||||
|
||||
static int parser_setencoding (parser_object* self, PyObject* value, void* closure) {
|
||||
if (value == NULL) {
|
||||
PyErr_SetString(PyExc_TypeError, "Cannot delete encoding");
|
||||
return -1;
|
||||
}
|
||||
if (!PyString_Check(value)) {
|
||||
PyErr_SetString(PyExc_TypeError, "encoding must be string");
|
||||
return -1;
|
||||
}
|
||||
Py_DECREF(self->encoding);
|
||||
Py_INCREF(value);
|
||||
self->encoding = value;
|
||||
self->userData->encoding = value;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* type interface */
|
||||
|
||||
static PyMemberDef parser_members[] = {
|
||||
|
|
@ -1926,19 +1976,21 @@ static PyMemberDef parser_members[] = {
|
|||
static PyGetSetDef parser_getset[] = {
|
||||
{"handler", (getter)parser_gethandler, (setter)parser_sethandler,
|
||||
"handler object", NULL},
|
||||
{"encoding", (getter)parser_getencoding, (setter)parser_setencoding,
|
||||
"encoding", NULL},
|
||||
{NULL} /* Sentinel */
|
||||
};
|
||||
|
||||
static PyMethodDef parser_methods[] = {
|
||||
{"feed", (PyCFunction)parser_feed, METH_VARARGS, "feed data to parse incremental"},
|
||||
{"feed", (PyCFunction)parser_feed, METH_VARARGS, "feed data to parse incremental"},
|
||||
{"reset", (PyCFunction)parser_reset, METH_VARARGS, "reset the parser (no flushing)"},
|
||||
{"flush", (PyCFunction)parser_flush, METH_VARARGS, "flush parser buffers"},
|
||||
{"debug", (PyCFunction)parser_debug, METH_VARARGS, "set debug level"},
|
||||
{"lineno", (PyCFunction)parser_lineno, METH_VARARGS, "get the current line number"},
|
||||
{"lineno", (PyCFunction)parser_lineno, METH_VARARGS, "get the current line number"},
|
||||
{"last_lineno", (PyCFunction)parser_last_lineno, METH_VARARGS, "get the last line number"},
|
||||
{"column", (PyCFunction)parser_column, METH_VARARGS, "get the current column"},
|
||||
{"column", (PyCFunction)parser_column, METH_VARARGS, "get the current column"},
|
||||
{"last_column", (PyCFunction)parser_last_column, METH_VARARGS, "get the last column"},
|
||||
{"pos", (PyCFunction)parser_pos, METH_VARARGS, "get the current scanner position"},
|
||||
{"pos", (PyCFunction)parser_pos, METH_VARARGS, "get the current scanner position"},
|
||||
{NULL} /* Sentinel */
|
||||
};
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue