*** empty log message ***

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2109 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2004-12-08 09:10:30 +00:00
parent 4fbdbe3a51
commit 793862d9af
2 changed files with 2078 additions and 1945 deletions

File diff suppressed because it is too large Load diff

View file

@ -131,25 +131,15 @@ static int yyerror (char* msg) {
/* parser.resolve_entities */
static PyObject* resolve_entities;
/* ListDict class, sorted dictionary */
static PyObject* list_dict;
/* set_encoding helper function */
static PyObject* set_encoding;
/* set_doctype helper function */
static PyObject* set_doctype;
/* macros for easier scanner state manipulation */
/* test whether tag does not need an HTML end tag */
#define NO_HTML_END_TAG(tag) !(strcmp(tag, "area")==0 || \
strcmp(tag, "base")==0 || \
strcmp(tag, "basefont")==0 || \
strcmp(tag, "br")==0 || \
strcmp(tag, "col")==0 || \
strcmp(tag, "frame")==0 || \
strcmp(tag, "hr")==0 || \
strcmp(tag, "img")==0 || \
strcmp(tag, "input")==0 || \
strcmp(tag, "isindex")==0 || \
strcmp(tag, "link")==0 || \
strcmp(tag, "meta")==0 || \
strcmp(tag, "param")==0)
/* clear buffer b, returning NULL on error */
#define CLEAR_BUF(b) \
b = PyMem_Resize(b, char, 1); \
@ -190,8 +180,12 @@ static PyObject* list_dict;
/* parser type definition */
typedef struct {
PyObject_HEAD
/* the handler object */
PyObject* handler;
/* the charset encoding (PyStringObject) */
PyObject* encoding;
/* the document type (PyStringObject) */
PyObject* doctype;
UserData* userData;
void* scanner;
} parser_object;
@ -202,6 +196,37 @@ staticforward PyTypeObject parser_type;
#define YYMALLOC PyMem_Malloc
#define YYFREE PyMem_Free
/* Test whether tag does not need an HTML end tag. Return -1 on error. */
static int html_end_tag (PyObject* ptag, PyObject* parser) {
PyObject* pdoctype;
char* doctype;
int ret = 1;
pdoctype = PyObject_GetAttrString(parser, "doctype");
if (pdoctype==NULL) return -1;
doctype = PyString_AsString(pdoctype);
if (doctype == NULL) { Py_DECREF(pdoctype); return -1; }
if (strcmp(doctype, "HTML")==0) {
char* tag = PyString_AsString(ptag);
if (tag == NULL) { Py_DECREF(pdoctype); return -1; }
ret = strcmp(tag, "area")!=0 &&
strcmp(tag, "base")!=0 &&
strcmp(tag, "basefont")!=0 &&
strcmp(tag, "br")!=0 &&
strcmp(tag, "col")!=0 &&
strcmp(tag, "frame")!=0 &&
strcmp(tag, "hr")!=0 &&
strcmp(tag, "img")!=0 &&
strcmp(tag, "input")!=0 &&
strcmp(tag, "isindex")!=0 &&
strcmp(tag, "link")!=0 &&
strcmp(tag, "meta")!=0 &&
strcmp(tag, "param")!=0;
}
/* it is not HTML (presumably XHTML) */
Py_DECREF(pdoctype);
return ret;
}
/* Enabling traces. */
@ -230,7 +255,7 @@ typedef int YYSTYPE;
/* Line 214 of yacc.c. */
#line 234 "htmlparse.c"
#line 259 "htmlparse.c"
#if ! defined (yyoverflow) || YYERROR_VERBOSE
@ -407,8 +432,8 @@ static const yysigned_char yyrhs[] =
/* YYRLINE[YYN] -- source line where rule number YYN was defined. */
static const unsigned short int yyrline[] =
{
0, 144, 144, 145, 148, 149, 156, 191, 241, 275,
296, 317, 338, 359, 384, 409
0, 169, 169, 170, 173, 174, 181, 220, 267, 301,
322, 343, 364, 389, 414, 439
};
#endif
@ -1114,22 +1139,22 @@ yyreduce:
switch (yyn)
{
case 2:
#line 144 "htmlparse.y"
#line 169 "htmlparse.y"
{;}
break;
case 3:
#line 145 "htmlparse.y"
#line 170 "htmlparse.y"
{;}
break;
case 4:
#line 148 "htmlparse.y"
#line 173 "htmlparse.y"
{ YYACCEPT; /* wait for more lexer input */ ;}
break;
case 5:
#line 150 "htmlparse.y"
#line 175 "htmlparse.y"
{
/* an error occured in the scanner, the python exception must be set */
UserData* ud = yyget_extra(scanner);
@ -1139,7 +1164,7 @@ yyreduce:
break;
case 6:
#line 157 "htmlparse.y"
#line 182 "htmlparse.y"
{
/* $1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyObject, <attrs> is a PyDict */
@ -1149,7 +1174,11 @@ yyreduce:
PyObject* tag = PyTuple_GET_ITEM(yyvsp[0], 0);
PyObject* attrs = PyTuple_GET_ITEM(yyvsp[0], 1);
int error = 0;
if (!tag || !attrs) { error = 1; goto finish_start; }
if (tag==NULL || attrs==NULL) { error = 1; goto finish_start; }
/* set encoding */
result = PyObject_CallFunction(set_encoding, "OOO", ud->parser, tag, attrs);
if (result==NULL) { error=1; goto finish_start; }
Py_DECREF(result); result = NULL;
if (PyObject_HasAttrString(ud->handler, "start_element")==1) {
callback = PyObject_GetAttrString(ud->handler, "start_element");
if (!callback) { error=1; goto finish_start; }
@ -1157,7 +1186,7 @@ yyreduce:
if (!result) { error=1; goto finish_start; }
Py_DECREF(callback);
Py_DECREF(result);
callback=result=NULL;
callback = result = NULL;
}
CHECK_ERROR(ud, finish_start);
finish_start:
@ -1177,7 +1206,7 @@ finish_start:
break;
case 7:
#line 192 "htmlparse.y"
#line 221 "htmlparse.y"
{
/* $1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyObject, <attrs> is a PyDict */
@ -1187,29 +1216,27 @@ finish_start:
PyObject* tag = PyTuple_GET_ITEM(yyvsp[0], 0);
PyObject* attrs = PyTuple_GET_ITEM(yyvsp[0], 1);
int error = 0;
PyObject* tagname = NULL;
if (!tag || !attrs) { error = 1; goto finish_start_end; }
if (PyObject_HasAttrString(ud->handler, "start_element")==1) {
callback = PyObject_GetAttrString(ud->handler, "start_element");
char* fname;
PyObject* tagname;
if (tag==NULL || attrs==NULL) { error = 1; goto finish_start_end; }
tagname = PyUnicode_AsEncodedString(tag, "ascii", "ignore");
if (tagname==NULL) { error=1; goto finish_start_end; }
/* set encoding */
result = PyObject_CallFunction(set_encoding, "OOO", ud->parser, tag, attrs);
if (result==NULL) { error=1; goto finish_start_end; }
Py_DECREF(result); result = NULL;
if (html_end_tag(tagname, ud->parser))
fname = "start_end_element";
else
fname = "start_element";
if (PyObject_HasAttrString(ud->handler, fname)==1) {
callback = PyObject_GetAttrString(ud->handler, fname);
if (!callback) { error=1; goto finish_start_end; }
result = PyObject_CallFunction(callback, "OO", tag, attrs);
if (!result) { error=1; goto finish_start_end; }
Py_DECREF(callback);
Py_DECREF(result);
callback=result=NULL;
}
/* encode tagname in ASCII, ignoring any unknown chars */
tagname = PyUnicode_AsEncodedString(tag, "ascii", "ignore");
if (tagname==NULL) { error=1; goto finish_start_end; }
if (PyObject_HasAttrString(ud->handler, "end_element")==1 &&
NO_HTML_END_TAG(PyString_AsString(tagname))) {
callback = PyObject_GetAttrString(ud->handler, "end_element");
if (callback==NULL) { error=1; goto finish_start_end; }
result = PyObject_CallFunction(callback, "O", tag);
if (result==NULL) { error=1; goto finish_start_end; }
Py_DECREF(callback);
Py_DECREF(result);
callback=result=NULL;
callback = result = NULL;
}
CHECK_ERROR(ud, finish_start_end);
finish_start_end:
@ -1218,7 +1245,6 @@ finish_start_end:
Py_XDECREF(callback);
Py_XDECREF(result);
Py_XDECREF(tag);
Py_XDECREF(tagname);
Py_XDECREF(attrs);
Py_DECREF(yyvsp[0]);
if (error) {
@ -1230,7 +1256,7 @@ finish_start_end:
break;
case 8:
#line 242 "htmlparse.y"
#line 268 "htmlparse.y"
{
/* $1 is a PyUnicode */
UserData* ud = yyget_extra(scanner);
@ -1241,14 +1267,14 @@ finish_start_end:
PyObject* tagname = PyUnicode_AsEncodedString(yyvsp[0], "ascii", "ignore");
if (tagname==NULL) { error=1; goto finish_end; }
if (PyObject_HasAttrString(ud->handler, "end_element")==1 &&
NO_HTML_END_TAG(PyString_AsString(tagname))) {
html_end_tag(tagname, ud->parser)) {
callback = PyObject_GetAttrString(ud->handler, "end_element");
if (callback==NULL) { error=1; goto finish_end; }
result = PyObject_CallFunction(callback, "O", yyvsp[0]);
if (result==NULL) { error=1; goto finish_end; }
Py_DECREF(callback);
Py_DECREF(result);
callback=result=NULL;
callback = result = NULL;
}
CHECK_ERROR(ud, finish_end);
finish_end:
@ -1267,7 +1293,7 @@ finish_end:
break;
case 9:
#line 276 "htmlparse.y"
#line 302 "htmlparse.y"
{
/* $1 is a PyUnicode */
UserData* ud = yyget_extra(scanner);
@ -1291,7 +1317,7 @@ finish_comment:
break;
case 10:
#line 297 "htmlparse.y"
#line 323 "htmlparse.y"
{
/* $1 is a PyUnicode */
UserData* ud = yyget_extra(scanner);
@ -1315,7 +1341,7 @@ finish_pi:
break;
case 11:
#line 318 "htmlparse.y"
#line 344 "htmlparse.y"
{
/* $1 is a PyUnicode */
UserData* ud = yyget_extra(scanner);
@ -1339,13 +1365,17 @@ finish_cdata:
break;
case 12:
#line 339 "htmlparse.y"
#line 365 "htmlparse.y"
{
/* $1 is a PyUnicode */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
/* set encoding */
result = PyObject_CallFunction(set_doctype, "OO", ud->parser, yyvsp[0]);
if (result==NULL) { error=1; goto finish_doctype; }
Py_DECREF(result); result = NULL;
CALLBACK(ud, "doctype", "O", yyvsp[0], finish_doctype);
CHECK_ERROR(ud, finish_doctype);
finish_doctype:
@ -1363,7 +1393,7 @@ finish_doctype:
break;
case 13:
#line 360 "htmlparse.y"
#line 390 "htmlparse.y"
{
/* $1 is a PyUnicode */
UserData* ud = yyget_extra(scanner);
@ -1391,7 +1421,7 @@ finish_script:
break;
case 14:
#line 385 "htmlparse.y"
#line 415 "htmlparse.y"
{
/* $1 is a PyUnicode */
UserData* ud = yyget_extra(scanner);
@ -1419,7 +1449,7 @@ finish_style:
break;
case 15:
#line 410 "htmlparse.y"
#line 440 "htmlparse.y"
{
/* $1 is a PyUnicode */
/* Remember this is also called as a lexer error fallback */
@ -1447,7 +1477,7 @@ finish_characters:
}
/* Line 1010 of yacc.c. */
#line 1451 "htmlparse.c"
#line 1481 "htmlparse.c"
yyvsp -= yylen;
yyssp -= yylen;
@ -1672,7 +1702,7 @@ yyreturn:
}
#line 433 "htmlparse.y"
#line 463 "htmlparse.y"
/* create parser object */
@ -1723,7 +1753,14 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
Py_DECREF(self);
return NULL;
}
self->userData->encoding = self->encoding;
self->doctype = PyString_FromString("HTML");
if (self->doctype == NULL) {
Py_DECREF(self->encoding);
Py_DECREF(self->handler);
Py_DECREF(self);
return NULL;
}
self->userData->parser = (PyObject*)self;
return (PyObject*) self;
}
@ -1768,9 +1805,11 @@ static int parser_clear (parser_object* self) {
static void parser_dealloc (parser_object* self) {
htmllexDestroy(self->scanner);
parser_clear(self);
self->userData->encoding = NULL;
self->userData->parser = NULL;
Py_XDECREF(self->encoding);
self->encoding = NULL;
Py_XDECREF(self->doctype);
self->doctype = NULL;
PyMem_Del(self->userData->buf);
PyMem_Del(self->userData->tmp_buf);
PyMem_Del(self->userData);
@ -1995,7 +2034,28 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
Py_DECREF(self->encoding);
Py_INCREF(value);
self->encoding = value;
self->userData->encoding = value;
return 0;
}
static PyObject* parser_getdoctype (parser_object* self, void* closure) {
Py_INCREF(self->doctype);
return self->doctype;
}
static int parser_setdoctype (parser_object* self, PyObject* value, void* closure) {
if (value == NULL) {
PyErr_SetString(PyExc_TypeError, "Cannot delete doctype");
return -1;
}
if (!PyString_Check(value)) {
PyErr_SetString(PyExc_TypeError, "doctype must be string");
return -1;
}
Py_DECREF(self->doctype);
Py_INCREF(value);
self->doctype = value;
return 0;
}
@ -2011,6 +2071,8 @@ static PyGetSetDef parser_getset[] = {
"handler object", NULL},
{"encoding", (getter)parser_getencoding, (setter)parser_setencoding,
"encoding", NULL},
{"doctype", (getter)parser_getdoctype, (setter)parser_setdoctype,
"doctype", NULL},
{NULL} /* Sentinel */
};
@ -2129,6 +2191,17 @@ PyMODINIT_FUNC inithtmlsax (void) {
Py_DECREF(m);
return;
}
if ((set_encoding = PyObject_GetAttrString(m, "set_encoding"))==NULL) {
Py_DECREF(resolve_entities);
Py_DECREF(m);
return;
}
if ((set_doctype = PyObject_GetAttrString(m, "set_doctype"))==NULL) {
Py_DECREF(resolve_entities);
Py_DECREF(set_encoding);
Py_DECREF(m);
return;
}
Py_DECREF(m);
if ((m = PyImport_ImportModule("linkcheck.containers"))==NULL) {
return;