updated documentation

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2730 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2005-07-11 14:37:52 +00:00
parent e5197d429a
commit 3ce6aadfd6
3 changed files with 129 additions and 88 deletions

View file

@ -57,6 +57,35 @@ Fast HTML parser module written in C with the following features:
4. Character encoding aware
The parser itself is not encoding aware, but all output is
always returned as Python Unicode strings.
USAGE
First create an HTML SAX handler object. The callbacks used by the parser
(a handler does not have to define all of them) are:
comment(data): <!--data-->
start_element(tag, attrs): <tag {attr1:value1, attr2:value2, ..}>
start_end_element(tag, attrs): <tag {attr1:value1, attr2:value2, ..}/>
end_element(tag): </tag>
doctype(data): <!DOCTYPE data?>
pi(name, data=None): <?name data?>
cdata(data): <![CDATA[data]]>
characters(data): data
Additionally, there are error and warning callbacks:
warning(msg)
error(msg)
fatal_error(msg)
Then create a new HTML parser object with the handler as parameter.
EXAMPLE
# prints out the parsed HTML
handler = HtmlParser.htmllib.HtmlPrettyPrinter()
parser = HtmlParser.htmlsax.parser(handler)
parser.feed("<html><body>Blubb</body></html>")
parser.flush()
*/
"""
import re

View file

@ -452,8 +452,8 @@ static const yysigned_char yyrhs[] =
/* YYRLINE[YYN] -- source line where rule number YYN was defined. */
static const unsigned short int yyrline[] =
{
0, 189, 189, 190, 193, 194, 201, 244, 294, 332,
353, 374, 395, 420, 445, 470
0, 189, 189, 192, 197, 201, 208, 252, 303, 342,
364, 385, 407, 433, 460, 487
};
#endif
@ -1160,21 +1160,28 @@ yyreduce:
{
case 2:
#line 189 "htmlparse.y"
{;}
{
/* parse a single element */
;}
break;
case 3:
#line 190 "htmlparse.y"
{;}
#line 192 "htmlparse.y"
{
/* parse a list of elements */
;}
break;
case 4:
#line 193 "htmlparse.y"
{ YYACCEPT; /* wait for more lexer input */ ;}
#line 197 "htmlparse.y"
{
/* wait for more lexer input */
YYACCEPT;
;}
break;
case 5:
#line 195 "htmlparse.y"
#line 202 "htmlparse.y"
{
/* an error occurred in the scanner, the python exception must be set */
UserData* ud = yyget_extra(scanner);
@ -1184,10 +1191,11 @@ yyreduce:
break;
case 6:
#line 202 "htmlparse.y"
#line 209 "htmlparse.y"
{
/* $1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyObject, <attrs> is a PyDict */
/* parsed HTML start tag (eg. <a href="blubb">)
$1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyObject, <attrs> is a ListDict */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1230,10 +1238,11 @@ finish_start:
break;
case 7:
#line 245 "htmlparse.y"
#line 253 "htmlparse.y"
{
/* $1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyObject, <attrs> is a PyDict */
/* parsed HTML start-end tag (eg. <br/>)
$1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyObject, <attrs> is a ListDict */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1283,9 +1292,10 @@ finish_start_end:
break;
case 8:
#line 295 "htmlparse.y"
#line 304 "htmlparse.y"
{
/* $1 is a PyUnicode */
/* parsed HTML end tag (eg. </b>)
$1 is a PyUnicode with the tag name */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1324,9 +1334,10 @@ finish_end:
break;
case 9:
#line 333 "htmlparse.y"
#line 343 "htmlparse.y"
{
/* $1 is a PyUnicode */
/* parsed HTML comment (eg. <!-- bla -->)
$1 is a PyUnicode with the comment content */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1348,7 +1359,7 @@ finish_comment:
break;
case 10:
#line 354 "htmlparse.y"
#line 365 "htmlparse.y"
{
/* $1 is a PyUnicode */
UserData* ud = yyget_extra(scanner);
@ -1372,9 +1383,10 @@ finish_pi:
break;
case 11:
#line 375 "htmlparse.y"
#line 386 "htmlparse.y"
{
/* $1 is a PyUnicode */
/* parsed HTML CDATA (eg. <![CDATA[spam and eggs ...]]>)
$1 is a PyUnicode with the CDATA content */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1396,9 +1408,10 @@ finish_cdata:
break;
case 12:
#line 396 "htmlparse.y"
#line 408 "htmlparse.y"
{
/* $1 is a PyUnicode */
/* parsed HTML doctype (eg. <!DOCTYPE imadoofus system>)
$1 is a PyUnicode with the doctype content */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1424,9 +1437,10 @@ finish_doctype:
break;
case 13:
#line 421 "htmlparse.y"
#line 434 "htmlparse.y"
{
/* $1 is a PyUnicode */
/* parsed HTML script content (plus end tag which is omitted)
$1 is a PyUnicode with the script content */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1434,6 +1448,7 @@ finish_doctype:
PyObject* script = PyUnicode_DecodeASCII("script", 6, "ignore");
CHECK_ERROR((script == NULL), finish_script);
CALLBACK(ud, "characters", "O", yyvsp[0], finish_script);
/* emit the omitted end tag */
CALLBACK(ud, "end_element", "O", script, finish_script);
CHECK_PARSER_ERROR(ud, finish_script);
finish_script:
@ -1452,9 +1467,10 @@ finish_script:
break;
case 14:
#line 446 "htmlparse.y"
#line 461 "htmlparse.y"
{
/* $1 is a PyUnicode */
/* parsed HTML style content (plus end tag which is omitted)
$1 is a PyUnicode with the style content */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1462,6 +1478,7 @@ finish_script:
PyObject* style = PyUnicode_DecodeASCII("style", 5, "ignore");
CHECK_ERROR((style == NULL), finish_style);
CALLBACK(ud, "characters", "O", yyvsp[0], finish_style);
/* emit the omitted end tag */
CALLBACK(ud, "end_element", "O", style, finish_style);
CHECK_PARSER_ERROR(ud, finish_style);
finish_style:
@ -1480,10 +1497,12 @@ finish_style:
break;
case 15:
#line 471 "htmlparse.y"
#line 488 "htmlparse.y"
{
/* $1 is a PyUnicode */
/* Remember this is also called as a lexer error fallback */
/* parsed HTML text data
$1 is a PyUnicode with the text */
/* Remember this is also called as a lexer fallback when no
HTML structure element could be recognized. */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1508,7 +1527,7 @@ finish_characters:
}
/* Line 1010 of yacc.c. */
#line 1512 "htmlparse.c"
#line 1531 "htmlparse.c"
yyvsp -= yylen;
yyssp -= yylen;
@ -1733,7 +1752,7 @@ yyreturn:
}
#line 494 "htmlparse.y"
#line 513 "htmlparse.y"
/* create parser object */
@ -2028,12 +2047,14 @@ static PyObject* parser_debug (parser_object* self, PyObject* args) {
}
/* Getter for the parser's SAX handler object.
   Increments the reference count of self->handler and returns it;
   the closure argument is not used. */
static PyObject* parser_gethandler (parser_object* self, void* closure) {
    PyObject* handler = self->handler;

    Py_INCREF(handler);
    return handler;
}
/* set SAX handler object */
static int parser_sethandler (parser_object* self, PyObject* value, void* closure) {
if (value == NULL) {
PyErr_SetString(PyExc_TypeError, "Cannot delete parser handler");
@ -2047,12 +2068,14 @@ static int parser_sethandler (parser_object* self, PyObject* value, void* closur
}
/* Getter for the parser's encoding object.
   Increments the reference count of self->encoding and returns it;
   the closure argument is not used. */
static PyObject* parser_getencoding (parser_object* self, void* closure) {
    PyObject* encoding = self->encoding;

    Py_INCREF(encoding);
    return encoding;
}
/* set parser encoding */
static int parser_setencoding (parser_object* self, PyObject* value, void* closure) {
if (value == NULL) {
PyErr_SetString(PyExc_TypeError, "Cannot delete encoding");
@ -2069,12 +2092,14 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
}
/* Getter for the parser's doctype object.
   Increments the reference count of self->doctype and returns it;
   the closure argument is not used. */
static PyObject* parser_getdoctype (parser_object* self, void* closure) {
    PyObject* doctype = self->doctype;

    Py_INCREF(doctype);
    return doctype;
}
/* set parser doctype */
static int parser_setdoctype (parser_object* self, PyObject* value, void* closure) {
if (value == NULL) {
PyErr_SetString(PyExc_TypeError, "Cannot delete doctype");
@ -2174,25 +2199,6 @@ static PyTypeObject parser_type = {
};
/* python module interface
"Create a new HTML parser object with handler (which may be None).\n"
"\n"
"Used callbacks (they don't have to be defined) of a handler are:\n"
"comment(data): <!--data-->\n"
"start_element(tag, attrs): <tag {attr1:value1,attr2:value2,..}>\n"
"end_element(tag): </tag>\n"
"doctype(data): <!DOCTYPE data?>\n"
"pi(name, data=None): <?name data?>\n"
"cdata(data): <![CDATA[data]]>\n"
"characters(data): data\n"
"\n"
"Additionally, there are error and warning callbacks:\n"
"error(msg)\n"
"warning(msg)\n"
"fatal_error(msg)\n"},
*/
/* module method table: holds no functions, only the terminating entry */
static PyMethodDef htmlsax_methods[] = {
    {NULL, NULL, 0, NULL} /* Sentinel */
};

View file

@ -169,7 +169,7 @@ finish_html_end_tag:
%output="htmlparse.c"
%pure_parser
/* parser tokens */
/* parser tokens, see below for what they mean */
%token T_WAIT
%token T_ERROR
%token T_TEXT
@ -183,14 +183,21 @@ finish_html_end_tag:
%token T_CDATA
%token T_DOCTYPE
/* the finish_ labels are for error recovery */
/* note: the finish_ labels are for error recovery */
%%
elements: element {}
| elements element {}
;
elements: element {
/* parse a single element */
}
| elements element {
/* parse a list of elements */
}
;
element: T_WAIT { YYACCEPT; /* wait for more lexer input */ }
element: T_WAIT {
/* wait for more lexer input */
YYACCEPT;
}
| T_ERROR
{
/* an error occurred in the scanner, the python exception must be set */
@ -200,8 +207,9 @@ element: T_WAIT { YYACCEPT; /* wait for more lexer input */ }
}
| T_ELEMENT_START
{
/* $1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyObject, <attrs> is a PyDict */
/* parsed HTML start tag (eg. <a href="blubb">)
$1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyObject, <attrs> is a ListDict */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -243,8 +251,9 @@ finish_start:
}
| T_ELEMENT_START_END
{
/* $1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyObject, <attrs> is a PyDict */
/* parsed HTML start-end tag (eg. <br/>)
$1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyObject, <attrs> is a ListDict */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -293,7 +302,8 @@ finish_start_end:
}
| T_ELEMENT_END
{
/* $1 is a PyUnicode */
/* parsed HTML end tag (eg. </b>)
$1 is a PyUnicode with the tag name */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -331,7 +341,8 @@ finish_end:
}
| T_COMMENT
{
/* $1 is a PyUnicode */
/* parsed HTML comment (eg. <!-- bla -->)
$1 is a PyUnicode with the comment content */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -373,7 +384,8 @@ finish_pi:
}
| T_CDATA
{
/* $1 is a PyUnicode */
/* parsed HTML CDATA (eg. <![CDATA[spam and eggs ...]]>)
$1 is a PyUnicode with the CDATA content */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -394,7 +406,8 @@ finish_cdata:
}
| T_DOCTYPE
{
/* $1 is a PyUnicode */
/* parsed HTML doctype (eg. <!DOCTYPE imadoofus system>)
$1 is a PyUnicode with the doctype content */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -419,7 +432,8 @@ finish_doctype:
}
| T_SCRIPT
{
/* $1 is a PyUnicode */
/* parsed HTML script content (plus end tag which is omitted)
$1 is a PyUnicode with the script content */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -427,6 +441,7 @@ finish_doctype:
PyObject* script = PyUnicode_DecodeASCII("script", 6, "ignore");
CHECK_ERROR((script == NULL), finish_script);
CALLBACK(ud, "characters", "O", $1, finish_script);
/* emit the omitted end tag */
CALLBACK(ud, "end_element", "O", script, finish_script);
CHECK_PARSER_ERROR(ud, finish_script);
finish_script:
@ -444,7 +459,8 @@ finish_script:
}
| T_STYLE
{
/* $1 is a PyUnicode */
/* parsed HTML style content (plus end tag which is omitted)
$1 is a PyUnicode with the style content */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -452,6 +468,7 @@ finish_script:
PyObject* style = PyUnicode_DecodeASCII("style", 5, "ignore");
CHECK_ERROR((style == NULL), finish_style);
CALLBACK(ud, "characters", "O", $1, finish_style);
/* emit the omitted end tag */
CALLBACK(ud, "end_element", "O", style, finish_style);
CHECK_PARSER_ERROR(ud, finish_style);
finish_style:
@ -469,8 +486,10 @@ finish_style:
}
| T_TEXT
{
/* $1 is a PyUnicode */
/* Remember this is also called as a lexer error fallback */
/* parsed HTML text data
$1 is a PyUnicode with the text */
/* Remember this is also called as a lexer fallback when no
HTML structure element could be recognized. */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -785,12 +804,14 @@ static PyObject* parser_debug (parser_object* self, PyObject* args) {
}
/* Getter for the parser's SAX handler object.
   Increments the reference count of self->handler and returns it;
   the closure argument is not used. */
static PyObject* parser_gethandler (parser_object* self, void* closure) {
    PyObject* handler = self->handler;

    Py_INCREF(handler);
    return handler;
}
/* set SAX handler object */
static int parser_sethandler (parser_object* self, PyObject* value, void* closure) {
if (value == NULL) {
PyErr_SetString(PyExc_TypeError, "Cannot delete parser handler");
@ -804,12 +825,14 @@ static int parser_sethandler (parser_object* self, PyObject* value, void* closur
}
/* Getter for the parser's encoding object.
   Increments the reference count of self->encoding and returns it;
   the closure argument is not used. */
static PyObject* parser_getencoding (parser_object* self, void* closure) {
    PyObject* encoding = self->encoding;

    Py_INCREF(encoding);
    return encoding;
}
/* set parser encoding */
static int parser_setencoding (parser_object* self, PyObject* value, void* closure) {
if (value == NULL) {
PyErr_SetString(PyExc_TypeError, "Cannot delete encoding");
@ -826,12 +849,14 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
}
/* Getter for the parser's doctype object.
   Increments the reference count of self->doctype and returns it;
   the closure argument is not used. */
static PyObject* parser_getdoctype (parser_object* self, void* closure) {
    PyObject* doctype = self->doctype;

    Py_INCREF(doctype);
    return doctype;
}
/* set parser doctype */
static int parser_setdoctype (parser_object* self, PyObject* value, void* closure) {
if (value == NULL) {
PyErr_SetString(PyExc_TypeError, "Cannot delete doctype");
@ -931,25 +956,6 @@ static PyTypeObject parser_type = {
};
/* python module interface
"Create a new HTML parser object with handler (which may be None).\n"
"\n"
"Used callbacks (they don't have to be defined) of a handler are:\n"
"comment(data): <!--data-->\n"
"start_element(tag, attrs): <tag {attr1:value1,attr2:value2,..}>\n"
"end_element(tag): </tag>\n"
"doctype(data): <!DOCTYPE data?>\n"
"pi(name, data=None): <?name data?>\n"
"cdata(data): <![CDATA[data]]>\n"
"characters(data): data\n"
"\n"
"Additionally, there are error and warning callbacks:\n"
"error(msg)\n"
"warning(msg)\n"
"fatal_error(msg)\n"},
*/
/* module method table: holds no functions, only the terminating entry */
static PyMethodDef htmlsax_methods[] = {
    {NULL, NULL, 0, NULL} /* Sentinel */
};