mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-18 05:11:00 +00:00
updated documentation
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2730 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
e5197d429a
commit
3ce6aadfd6
3 changed files with 129 additions and 88 deletions
|
|
@ -57,6 +57,35 @@ Fast HTML parser module written in C with the following features:
|
|||
4. Character encoding aware
|
||||
The parser itself is not encoding aware, but all output is
|
||||
always Python Unicode strings.
|
||||
|
||||
USAGE
|
||||
First make an HTML SAX handler object. Used callbacks (they don't have to
|
||||
be defined) of a handler are:
|
||||
comment(data): <!--data-->
|
||||
start_element(tag, attrs): <tag {attr1:value1, attr2:value2, ..}>
|
||||
start_end_element(tag, attrs): <tag {attr1:value1, attr2:value2, ..}>
|
||||
end_element(tag): </tag>
|
||||
doctype(data): <!DOCTYPE data?>
|
||||
pi(name, data=None): <?name data?>
|
||||
cdata(data): <![CDATA[data]]>
|
||||
characters(data): data
|
||||
|
||||
Additionally, there are error and warning callbacks:
|
||||
warning(msg)
|
||||
error(msg)
|
||||
fatal_error(msg)
|
||||
|
||||
Then create a new HTML parser object with the handler as parameter.
|
||||
|
||||
EXAMPLE
|
||||
# prints out the parsed HTML
|
||||
handler = HtmlParser.htmllib.HtmlPrettyPrinter()
|
||||
parser = HtmlParser.htmlsax.parser(handler)
|
||||
parser.feed("<html><body>Blubb</body></html>")
|
||||
parser.flush()
|
||||
|
||||
*/
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
|
|
|
|||
|
|
@ -452,8 +452,8 @@ static const yysigned_char yyrhs[] =
|
|||
/* YYRLINE[YYN] -- source line where rule number YYN was defined. */
|
||||
static const unsigned short int yyrline[] =
|
||||
{
|
||||
0, 189, 189, 190, 193, 194, 201, 244, 294, 332,
|
||||
353, 374, 395, 420, 445, 470
|
||||
0, 189, 189, 192, 197, 201, 208, 252, 303, 342,
|
||||
364, 385, 407, 433, 460, 487
|
||||
};
|
||||
#endif
|
||||
|
||||
|
|
@ -1160,21 +1160,28 @@ yyreduce:
|
|||
{
|
||||
case 2:
|
||||
#line 189 "htmlparse.y"
|
||||
{;}
|
||||
{
|
||||
/* parse a single element */
|
||||
;}
|
||||
break;
|
||||
|
||||
case 3:
|
||||
#line 190 "htmlparse.y"
|
||||
{;}
|
||||
#line 192 "htmlparse.y"
|
||||
{
|
||||
/* parse a list of elements */
|
||||
;}
|
||||
break;
|
||||
|
||||
case 4:
|
||||
#line 193 "htmlparse.y"
|
||||
{ YYACCEPT; /* wait for more lexer input */ ;}
|
||||
#line 197 "htmlparse.y"
|
||||
{
|
||||
/* wait for more lexer input */
|
||||
YYACCEPT;
|
||||
;}
|
||||
break;
|
||||
|
||||
case 5:
|
||||
#line 195 "htmlparse.y"
|
||||
#line 202 "htmlparse.y"
|
||||
{
|
||||
/* an error occurred in the scanner; the Python exception must be set */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
|
|
@ -1184,10 +1191,11 @@ yyreduce:
|
|||
break;
|
||||
|
||||
case 6:
|
||||
#line 202 "htmlparse.y"
|
||||
#line 209 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyTuple (<tag>, <attrs>)
|
||||
<tag> is a PyObject, <attrs> is a PyDict */
|
||||
/* parsed HTML start tag (eg. <a href="blubb">)
|
||||
$1 is a PyTuple (<tag>, <attrs>)
|
||||
<tag> is a PyObject, <attrs> is a ListDict */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -1230,10 +1238,11 @@ finish_start:
|
|||
break;
|
||||
|
||||
case 7:
|
||||
#line 245 "htmlparse.y"
|
||||
#line 253 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyTuple (<tag>, <attrs>)
|
||||
<tag> is a PyObject, <attrs> is a PyDict */
|
||||
/* parsed HTML start-end tag (eg. <br/>)
|
||||
$1 is a PyTuple (<tag>, <attrs>)
|
||||
<tag> is a PyObject, <attrs> is a ListDict */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -1283,9 +1292,10 @@ finish_start_end:
|
|||
break;
|
||||
|
||||
case 8:
|
||||
#line 295 "htmlparse.y"
|
||||
#line 304 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyUnicode */
|
||||
/* parsed HTML end tag (eg. </b>)
|
||||
$1 is a PyUnicode with the tag name */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -1324,9 +1334,10 @@ finish_end:
|
|||
break;
|
||||
|
||||
case 9:
|
||||
#line 333 "htmlparse.y"
|
||||
#line 343 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyUnicode */
|
||||
/* parsed HTML comment (eg. <!-- bla -->)
|
||||
$1 is a PyUnicode with the comment content */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -1348,7 +1359,7 @@ finish_comment:
|
|||
break;
|
||||
|
||||
case 10:
|
||||
#line 354 "htmlparse.y"
|
||||
#line 365 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyUnicode */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
|
|
@ -1372,9 +1383,10 @@ finish_pi:
|
|||
break;
|
||||
|
||||
case 11:
|
||||
#line 375 "htmlparse.y"
|
||||
#line 386 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyUnicode */
|
||||
/* parsed HTML CDATA (eg. <![CDATA[spam and eggs ...]]>)
|
||||
$1 is a PyUnicode with the CDATA content */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -1396,9 +1408,10 @@ finish_cdata:
|
|||
break;
|
||||
|
||||
case 12:
|
||||
#line 396 "htmlparse.y"
|
||||
#line 408 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyUnicode */
|
||||
/* parsed HTML doctype (eg. <!DOCTYPE imadoofus system>)
|
||||
$1 is a PyUnicode with the doctype content */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -1424,9 +1437,10 @@ finish_doctype:
|
|||
break;
|
||||
|
||||
case 13:
|
||||
#line 421 "htmlparse.y"
|
||||
#line 434 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyUnicode */
|
||||
/* parsed HTML script content (plus end tag which is omitted)
|
||||
$1 is a PyUnicode with the script content */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -1434,6 +1448,7 @@ finish_doctype:
|
|||
PyObject* script = PyUnicode_DecodeASCII("script", 6, "ignore");
|
||||
CHECK_ERROR((script == NULL), finish_script);
|
||||
CALLBACK(ud, "characters", "O", yyvsp[0], finish_script);
|
||||
/* emit the omitted end tag */
|
||||
CALLBACK(ud, "end_element", "O", script, finish_script);
|
||||
CHECK_PARSER_ERROR(ud, finish_script);
|
||||
finish_script:
|
||||
|
|
@ -1452,9 +1467,10 @@ finish_script:
|
|||
break;
|
||||
|
||||
case 14:
|
||||
#line 446 "htmlparse.y"
|
||||
#line 461 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyUnicode */
|
||||
/* parsed HTML style content (plus end tag which is omitted)
|
||||
$1 is a PyUnicode with the style content */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -1462,6 +1478,7 @@ finish_script:
|
|||
PyObject* style = PyUnicode_DecodeASCII("style", 5, "ignore");
|
||||
CHECK_ERROR((style == NULL), finish_style);
|
||||
CALLBACK(ud, "characters", "O", yyvsp[0], finish_style);
|
||||
/* emit the omitted end tag */
|
||||
CALLBACK(ud, "end_element", "O", style, finish_style);
|
||||
CHECK_PARSER_ERROR(ud, finish_style);
|
||||
finish_style:
|
||||
|
|
@ -1480,10 +1497,12 @@ finish_style:
|
|||
break;
|
||||
|
||||
case 15:
|
||||
#line 471 "htmlparse.y"
|
||||
#line 488 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyUnicode */
|
||||
/* Remember this is also called as a lexer error fallback */
|
||||
/* parsed HTML text data
|
||||
$1 is a PyUnicode with the text */
|
||||
/* Remember this is also called as a lexer fallback when no
|
||||
HTML structure element could be recognized. */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -1508,7 +1527,7 @@ finish_characters:
|
|||
}
|
||||
|
||||
/* Line 1010 of yacc.c. */
|
||||
#line 1512 "htmlparse.c"
|
||||
#line 1531 "htmlparse.c"
|
||||
|
||||
yyvsp -= yylen;
|
||||
yyssp -= yylen;
|
||||
|
|
@ -1733,7 +1752,7 @@ yyreturn:
|
|||
}
|
||||
|
||||
|
||||
#line 494 "htmlparse.y"
|
||||
#line 513 "htmlparse.y"
|
||||
|
||||
|
||||
/* create parser object */
|
||||
|
|
@ -2028,12 +2047,14 @@ static PyObject* parser_debug (parser_object* self, PyObject* args) {
|
|||
}
|
||||
|
||||
|
||||
/* get SAX handler object */
|
||||
static PyObject* parser_gethandler (parser_object* self, void* closure) {
|
||||
Py_INCREF(self->handler);
|
||||
return self->handler;
|
||||
}
|
||||
|
||||
|
||||
/* set SAX handler object */
|
||||
static int parser_sethandler (parser_object* self, PyObject* value, void* closure) {
|
||||
if (value == NULL) {
|
||||
PyErr_SetString(PyExc_TypeError, "Cannot delete parser handler");
|
||||
|
|
@ -2047,12 +2068,14 @@ static int parser_sethandler (parser_object* self, PyObject* value, void* closur
|
|||
}
|
||||
|
||||
|
||||
/* get parser encoding */
|
||||
static PyObject* parser_getencoding (parser_object* self, void* closure) {
|
||||
Py_INCREF(self->encoding);
|
||||
return self->encoding;
|
||||
}
|
||||
|
||||
|
||||
/* set parser encoding */
|
||||
static int parser_setencoding (parser_object* self, PyObject* value, void* closure) {
|
||||
if (value == NULL) {
|
||||
PyErr_SetString(PyExc_TypeError, "Cannot delete encoding");
|
||||
|
|
@ -2069,12 +2092,14 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
|
|||
}
|
||||
|
||||
|
||||
/* get parser doctype */
|
||||
static PyObject* parser_getdoctype (parser_object* self, void* closure) {
|
||||
Py_INCREF(self->doctype);
|
||||
return self->doctype;
|
||||
}
|
||||
|
||||
|
||||
/* set parser doctype */
|
||||
static int parser_setdoctype (parser_object* self, PyObject* value, void* closure) {
|
||||
if (value == NULL) {
|
||||
PyErr_SetString(PyExc_TypeError, "Cannot delete doctype");
|
||||
|
|
@ -2174,25 +2199,6 @@ static PyTypeObject parser_type = {
|
|||
};
|
||||
|
||||
|
||||
/* python module interface
|
||||
"Create a new HTML parser object with handler (which may be None).\n"
|
||||
"\n"
|
||||
"Used callbacks (they don't have to be defined) of a handler are:\n"
|
||||
"comment(data): <!--data-->\n"
|
||||
"start_element(tag, attrs): <tag {attr1:value1,attr2:value2,..}>\n"
|
||||
"end_element(tag): </tag>\n"
|
||||
"doctype(data): <!DOCTYPE data?>\n"
|
||||
"pi(name, data=None): <?name data?>\n"
|
||||
"cdata(data): <![CDATA[data]]>\n"
|
||||
"characters(data): data\n"
|
||||
"\n"
|
||||
"Additionally, there are error and warning callbacks:\n"
|
||||
"error(msg)\n"
|
||||
"warning(msg)\n"
|
||||
"fatal_error(msg)\n"},
|
||||
|
||||
*/
|
||||
|
||||
static PyMethodDef htmlsax_methods[] = {
|
||||
{NULL} /* Sentinel */
|
||||
};
|
||||
|
|
|
|||
|
|
@ -169,7 +169,7 @@ finish_html_end_tag:
|
|||
%output="htmlparse.c"
|
||||
%pure_parser
|
||||
|
||||
/* parser tokens */
|
||||
/* parser tokens, see below for what they mean */
|
||||
%token T_WAIT
|
||||
%token T_ERROR
|
||||
%token T_TEXT
|
||||
|
|
@ -183,14 +183,21 @@ finish_html_end_tag:
|
|||
%token T_CDATA
|
||||
%token T_DOCTYPE
|
||||
|
||||
/* the finish_ labels are for error recovery */
|
||||
/* note: the finish_ labels are for error recovery */
|
||||
%%
|
||||
|
||||
elements: element {}
|
||||
| elements element {}
|
||||
;
|
||||
elements: element {
|
||||
/* parse a single element */
|
||||
}
|
||||
| elements element {
|
||||
/* parse a list of elements */
|
||||
}
|
||||
;
|
||||
|
||||
element: T_WAIT { YYACCEPT; /* wait for more lexer input */ }
|
||||
element: T_WAIT {
|
||||
/* wait for more lexer input */
|
||||
YYACCEPT;
|
||||
}
|
||||
| T_ERROR
|
||||
{
|
||||
/* an error occurred in the scanner; the Python exception must be set */
|
||||
|
|
@ -200,8 +207,9 @@ element: T_WAIT { YYACCEPT; /* wait for more lexer input */ }
|
|||
}
|
||||
| T_ELEMENT_START
|
||||
{
|
||||
/* $1 is a PyTuple (<tag>, <attrs>)
|
||||
<tag> is a PyObject, <attrs> is a PyDict */
|
||||
/* parsed HTML start tag (eg. <a href="blubb">)
|
||||
$1 is a PyTuple (<tag>, <attrs>)
|
||||
<tag> is a PyObject, <attrs> is a ListDict */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -243,8 +251,9 @@ finish_start:
|
|||
}
|
||||
| T_ELEMENT_START_END
|
||||
{
|
||||
/* $1 is a PyTuple (<tag>, <attrs>)
|
||||
<tag> is a PyObject, <attrs> is a PyDict */
|
||||
/* parsed HTML start-end tag (eg. <br/>)
|
||||
$1 is a PyTuple (<tag>, <attrs>)
|
||||
<tag> is a PyObject, <attrs> is a ListDict */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -293,7 +302,8 @@ finish_start_end:
|
|||
}
|
||||
| T_ELEMENT_END
|
||||
{
|
||||
/* $1 is a PyUnicode */
|
||||
/* parsed HTML end tag (eg. </b>)
|
||||
$1 is a PyUnicode with the tag name */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -331,7 +341,8 @@ finish_end:
|
|||
}
|
||||
| T_COMMENT
|
||||
{
|
||||
/* $1 is a PyUnicode */
|
||||
/* parsed HTML comment (eg. <!-- bla -->)
|
||||
$1 is a PyUnicode with the comment content */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -373,7 +384,8 @@ finish_pi:
|
|||
}
|
||||
| T_CDATA
|
||||
{
|
||||
/* $1 is a PyUnicode */
|
||||
/* parsed HTML CDATA (eg. <![CDATA[spam and eggs ...]]>)
|
||||
$1 is a PyUnicode with the CDATA content */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -394,7 +406,8 @@ finish_cdata:
|
|||
}
|
||||
| T_DOCTYPE
|
||||
{
|
||||
/* $1 is a PyUnicode */
|
||||
/* parsed HTML doctype (eg. <!DOCTYPE imadoofus system>)
|
||||
$1 is a PyUnicode with the doctype content */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -419,7 +432,8 @@ finish_doctype:
|
|||
}
|
||||
| T_SCRIPT
|
||||
{
|
||||
/* $1 is a PyUnicode */
|
||||
/* parsed HTML script content (plus end tag which is omitted)
|
||||
$1 is a PyUnicode with the script content */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -427,6 +441,7 @@ finish_doctype:
|
|||
PyObject* script = PyUnicode_DecodeASCII("script", 6, "ignore");
|
||||
CHECK_ERROR((script == NULL), finish_script);
|
||||
CALLBACK(ud, "characters", "O", $1, finish_script);
|
||||
/* emit the omitted end tag */
|
||||
CALLBACK(ud, "end_element", "O", script, finish_script);
|
||||
CHECK_PARSER_ERROR(ud, finish_script);
|
||||
finish_script:
|
||||
|
|
@ -444,7 +459,8 @@ finish_script:
|
|||
}
|
||||
| T_STYLE
|
||||
{
|
||||
/* $1 is a PyUnicode */
|
||||
/* parsed HTML style content (plus end tag which is omitted)
|
||||
$1 is a PyUnicode with the style content */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -452,6 +468,7 @@ finish_script:
|
|||
PyObject* style = PyUnicode_DecodeASCII("style", 5, "ignore");
|
||||
CHECK_ERROR((style == NULL), finish_style);
|
||||
CALLBACK(ud, "characters", "O", $1, finish_style);
|
||||
/* emit the omitted end tag */
|
||||
CALLBACK(ud, "end_element", "O", style, finish_style);
|
||||
CHECK_PARSER_ERROR(ud, finish_style);
|
||||
finish_style:
|
||||
|
|
@ -469,8 +486,10 @@ finish_style:
|
|||
}
|
||||
| T_TEXT
|
||||
{
|
||||
/* $1 is a PyUnicode */
|
||||
/* Remember this is also called as a lexer error fallback */
|
||||
/* parsed HTML text data
|
||||
$1 is a PyUnicode with the text */
|
||||
/* Remember this is also called as a lexer fallback when no
|
||||
HTML structure element could be recognized. */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -785,12 +804,14 @@ static PyObject* parser_debug (parser_object* self, PyObject* args) {
|
|||
}
|
||||
|
||||
|
||||
/* get SAX handler object */
|
||||
static PyObject* parser_gethandler (parser_object* self, void* closure) {
|
||||
Py_INCREF(self->handler);
|
||||
return self->handler;
|
||||
}
|
||||
|
||||
|
||||
/* set SAX handler object */
|
||||
static int parser_sethandler (parser_object* self, PyObject* value, void* closure) {
|
||||
if (value == NULL) {
|
||||
PyErr_SetString(PyExc_TypeError, "Cannot delete parser handler");
|
||||
|
|
@ -804,12 +825,14 @@ static int parser_sethandler (parser_object* self, PyObject* value, void* closur
|
|||
}
|
||||
|
||||
|
||||
/* get parser encoding */
|
||||
static PyObject* parser_getencoding (parser_object* self, void* closure) {
|
||||
Py_INCREF(self->encoding);
|
||||
return self->encoding;
|
||||
}
|
||||
|
||||
|
||||
/* set parser encoding */
|
||||
static int parser_setencoding (parser_object* self, PyObject* value, void* closure) {
|
||||
if (value == NULL) {
|
||||
PyErr_SetString(PyExc_TypeError, "Cannot delete encoding");
|
||||
|
|
@ -826,12 +849,14 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
|
|||
}
|
||||
|
||||
|
||||
/* get parser doctype */
|
||||
static PyObject* parser_getdoctype (parser_object* self, void* closure) {
|
||||
Py_INCREF(self->doctype);
|
||||
return self->doctype;
|
||||
}
|
||||
|
||||
|
||||
/* set parser doctype */
|
||||
static int parser_setdoctype (parser_object* self, PyObject* value, void* closure) {
|
||||
if (value == NULL) {
|
||||
PyErr_SetString(PyExc_TypeError, "Cannot delete doctype");
|
||||
|
|
@ -931,25 +956,6 @@ static PyTypeObject parser_type = {
|
|||
};
|
||||
|
||||
|
||||
/* python module interface
|
||||
"Create a new HTML parser object with handler (which may be None).\n"
|
||||
"\n"
|
||||
"Used callbacks (they don't have to be defined) of a handler are:\n"
|
||||
"comment(data): <!--data-->\n"
|
||||
"start_element(tag, attrs): <tag {attr1:value1,attr2:value2,..}>\n"
|
||||
"end_element(tag): </tag>\n"
|
||||
"doctype(data): <!DOCTYPE data?>\n"
|
||||
"pi(name, data=None): <?name data?>\n"
|
||||
"cdata(data): <![CDATA[data]]>\n"
|
||||
"characters(data): data\n"
|
||||
"\n"
|
||||
"Additionally, there are error and warning callbacks:\n"
|
||||
"error(msg)\n"
|
||||
"warning(msg)\n"
|
||||
"fatal_error(msg)\n"},
|
||||
|
||||
*/
|
||||
|
||||
static PyMethodDef htmlsax_methods[] = {
|
||||
{NULL} /* Sentinel */
|
||||
};
|
||||
|
|
|
|||
Loading…
Reference in a new issue