updated documentation

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2730 e7d03fd6-7b0d-0410-9947-9c21f3af8025
2026-04-18 05:11:00 +00:00 · 2005-07-11 14:37:52 +00:00 · 2005-07-11 14:37:52 +00:00 · 3ce6aadfd6
commit 3ce6aadfd6
parent e5197d429a
3 changed files with 129 additions and 88 deletions
--- a/linkcheck/HtmlParser/__init__.py
+++ b/linkcheck/HtmlParser/__init__.py
@ -57,6 +57,35 @@ Fast HTML parser module written in C with the following features:
 4. Character encoding aware
    The parser itself is not encoding aware, but all the output are
    always Python Unicode strings.
+
+USAGE
+First make an HTML SAX handler object. The callbacks used on a handler
+(none of them has to be defined) are:
+  comment(data): <!--data-->
+  start_element(tag, attrs): <tag {attr1:value1, attr2:value2, ..}>
+  start_end_element(tag, attrs): <tag {attr1:value1, attr2:value2, ..}/>
+  end_element(tag): </tag>
+  doctype(data): <!DOCTYPE data>
+  pi(name, data=None): <?name data?>
+  cdata(data): <![CDATA[data]]>
+  characters(data): data
+
+Additionally, there are error and warning callbacks:
+  warning(msg)
+  error(msg)
+  fatal_error(msg)
+
+Then create a new HTML parser object with the handler as parameter.
+
+EXAMPLE
+  # prints out the parsed HTML
+  handler = HtmlParser.htmllib.HtmlPrettyPrinter()
+  parser = HtmlParser.htmlsax.parser(handler)
+  parser.feed("<html><body>Blubb</body></html>")
+  parser.flush()
+
+
+
 """

 import re
--- a/linkcheck/HtmlParser/htmlparse.c
+++ b/linkcheck/HtmlParser/htmlparse.c
@ -452,8 +452,8 @@ static const yysigned_char yyrhs[] =
 /* YYRLINE[YYN] -- source line where rule number YYN was defined.  */
 static const unsigned short int yyrline[] =
 {
-       0,   189,   189,   190,   193,   194,   201,   244,   294,   332,
-     353,   374,   395,   420,   445,   470
+       0,   189,   189,   192,   197,   201,   208,   252,   303,   342,
+     364,   385,   407,   433,   460,   487
 };
 #endif

@ -1160,21 +1160,28 @@ yyreduce:
    {
        case 2:
 #line 189 "htmlparse.y"
-    {;}
+    {
+    /* parse a single element */
+;}
    break;

  case 3:
-#line 190 "htmlparse.y"
-    {;}
+#line 192 "htmlparse.y"
+    {
+    /* parse a list of elements */
+;}
    break;

  case 4:
-#line 193 "htmlparse.y"
-    { YYACCEPT; /* wait for more lexer input */ ;}
+#line 197 "htmlparse.y"
+    {
+    /* wait for more lexer input */
+    YYACCEPT;
+;}
    break;

  case 5:
-#line 195 "htmlparse.y"
+#line 202 "htmlparse.y"
    {
    /* an error occured in the scanner, the python exception must be set */
    UserData* ud = yyget_extra(scanner);
@ -1184,10 +1191,11 @@ yyreduce:
    break;

  case 6:
-#line 202 "htmlparse.y"
+#line 209 "htmlparse.y"
    {
-    /* $1 is a PyTuple (<tag>, <attrs>)
-       <tag> is a PyObject, <attrs> is a PyDict */
+    /* parsed HTML start tag (eg. <a href="blubb">)
+       $1 is a PyTuple (<tag>, <attrs>)
+       <tag> is a PyObject, <attrs> is a ListDict */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
@ -1230,10 +1238,11 @@ finish_start:
    break;

  case 7:
-#line 245 "htmlparse.y"
+#line 253 "htmlparse.y"
    {
-    /* $1 is a PyTuple (<tag>, <attrs>)
-       <tag> is a PyObject, <attrs> is a PyDict */
+    /* parsed HTML start-end tag (eg. <br/>)
+       $1 is a PyTuple (<tag>, <attrs>)
+       <tag> is a PyObject, <attrs> is a ListDict */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
@ -1283,9 +1292,10 @@ finish_start_end:
    break;

  case 8:
-#line 295 "htmlparse.y"
+#line 304 "htmlparse.y"
    {
-    /* $1 is a PyUnicode */
+    /* parsed HTML end tag (eg. </b>)
+       $1 is a PyUnicode with the tag name */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
@ -1324,9 +1334,10 @@ finish_end:
    break;

  case 9:
-#line 333 "htmlparse.y"
+#line 343 "htmlparse.y"
    {
-    /* $1 is a PyUnicode */
+    /* parsed HTML comment (eg. <!-- bla -->)
+       $1 is a PyUnicode with the comment content */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
@ -1348,7 +1359,7 @@ finish_comment:
    break;

  case 10:
-#line 354 "htmlparse.y"
+#line 365 "htmlparse.y"
    {
    /* $1 is a PyUnicode */
    UserData* ud = yyget_extra(scanner);
@ -1372,9 +1383,10 @@ finish_pi:
    break;

  case 11:
-#line 375 "htmlparse.y"
+#line 386 "htmlparse.y"
    {
-    /* $1 is a PyUnicode */
+    /* parsed HTML CDATA (eg. <![CDATA[spam and eggs ...]]>)
+       $1 is a PyUnicode with the CDATA content */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
@ -1396,9 +1408,10 @@ finish_cdata:
    break;

  case 12:
-#line 396 "htmlparse.y"
+#line 408 "htmlparse.y"
    {
-    /* $1 is a PyUnicode */
+    /* parsed HTML doctype (eg. <!DOCTYPE imadoofus system>)
+       $1 is a PyUnicode with the doctype content */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
@ -1424,9 +1437,10 @@ finish_doctype:
    break;

  case 13:
-#line 421 "htmlparse.y"
+#line 434 "htmlparse.y"
    {
-    /* $1 is a PyUnicode */
+    /* parsed HTML script content (plus end tag which is omitted)
+       $1 is a PyUnicode with the script content */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
@ -1434,6 +1448,7 @@ finish_doctype:
    PyObject* script = PyUnicode_DecodeASCII("script", 6, "ignore");
    CHECK_ERROR((script == NULL), finish_script);
    CALLBACK(ud, "characters", "O", yyvsp[0], finish_script);
+    /* emit the omitted end tag */
    CALLBACK(ud, "end_element", "O", script, finish_script);
    CHECK_PARSER_ERROR(ud, finish_script);
 finish_script:
@ -1452,9 +1467,10 @@ finish_script:
    break;

  case 14:
-#line 446 "htmlparse.y"
+#line 461 "htmlparse.y"
    {
-    /* $1 is a PyUnicode */
+    /* parsed HTML style content (plus end tag which is omitted)
+       $1 is a PyUnicode with the style content */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
@ -1462,6 +1478,7 @@ finish_script:
    PyObject* style = PyUnicode_DecodeASCII("style", 5, "ignore");
    CHECK_ERROR((style == NULL), finish_style);
    CALLBACK(ud, "characters", "O", yyvsp[0], finish_style);
+    /* emit the omitted end tag */
    CALLBACK(ud, "end_element", "O", style, finish_style);
    CHECK_PARSER_ERROR(ud, finish_style);
 finish_style:
@ -1480,10 +1497,12 @@ finish_style:
    break;

  case 15:
-#line 471 "htmlparse.y"
+#line 488 "htmlparse.y"
    {
-    /* $1 is a PyUnicode */
-    /* Remember this is also called as a lexer error fallback */
+    /* parsed HTML text data
+       $1 is a PyUnicode with the text */
+    /* Remember this is also called as a lexer fallback when no
+       HTML structure element could be recognized. */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
@ -1508,7 +1527,7 @@ finish_characters:
    }

 /* Line 1010 of yacc.c.  */
-#line 1512 "htmlparse.c"
+#line 1531 "htmlparse.c"

  yyvsp -= yylen;
  yyssp -= yylen;
@ -1733,7 +1752,7 @@ yyreturn:
 }


-#line 494 "htmlparse.y"
+#line 513 "htmlparse.y"


 /* create parser object */
@ -2028,12 +2047,14 @@ static PyObject* parser_debug (parser_object* self, PyObject* args) {
 }


+/* get SAX handler object */
 static PyObject* parser_gethandler (parser_object* self, void* closure) {
    Py_INCREF(self->handler);
    return self->handler;
 }


+/* set SAX handler object */
 static int parser_sethandler (parser_object* self, PyObject* value, void* closure) {
    if (value == NULL) {
       PyErr_SetString(PyExc_TypeError, "Cannot delete parser handler");
@ -2047,12 +2068,14 @@ static int parser_sethandler (parser_object* self, PyObject* value, void* closur
 }


+/* get parser encoding */
 static PyObject* parser_getencoding (parser_object* self, void* closure) {
    Py_INCREF(self->encoding);
    return self->encoding;
 }


+/* set parser encoding */
 static int parser_setencoding (parser_object* self, PyObject* value, void* closure) {
    if (value == NULL) {
        PyErr_SetString(PyExc_TypeError, "Cannot delete encoding");
@ -2069,12 +2092,14 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
 }


+/* get parser doctype */
 static PyObject* parser_getdoctype (parser_object* self, void* closure) {
    Py_INCREF(self->doctype);
    return self->doctype;
 }


+/* set parser doctype */
 static int parser_setdoctype (parser_object* self, PyObject* value, void* closure) {
    if (value == NULL) {
        PyErr_SetString(PyExc_TypeError, "Cannot delete doctype");
@ -2174,25 +2199,6 @@ static PyTypeObject parser_type = {
 };


-/* python module interface 
-     "Create a new HTML parser object with handler (which may be None).\n"
-     "\n"
-     "Used callbacks (they don't have to be defined) of a handler are:\n"
-     "comment(data): <!--data-->\n"
-     "start_element(tag, attrs): <tag {attr1:value1,attr2:value2,..}>\n"
-     "end_element(tag): </tag>\n"
-     "doctype(data): <!DOCTYPE data?>\n"
-     "pi(name, data=None): <?name data?>\n"
-     "cdata(data): <![CDATA[data]]>\n"
-     "characters(data): data\n"
-     "\n"
-     "Additionally, there are error and warning callbacks:\n"
-     "error(msg)\n"
-     "warning(msg)\n"
-     "fatal_error(msg)\n"},
-
-*/
-
 static PyMethodDef htmlsax_methods[] = {
    {NULL} /* Sentinel */
 };
--- a/linkcheck/HtmlParser/htmlparse.y
+++ b/linkcheck/HtmlParser/htmlparse.y
@ -169,7 +169,7 @@ finish_html_end_tag:
 %output="htmlparse.c"
 %pure_parser

-/* parser tokens */
+/* parser tokens, see below for what they mean */
 %token T_WAIT
 %token T_ERROR
 %token T_TEXT
@ -183,14 +183,21 @@ finish_html_end_tag:
 %token T_CDATA
 %token T_DOCTYPE

-/* the finish_ labels are for error recovery */
+/* note: the finish_ labels are for error recovery */
 %%

-elements: element {}
-    | elements element {}
-    ;
+elements: element {
+    /* parse a single element */
+}
+| elements element {
+    /* parse a list of elements */
+}
+;

-element: T_WAIT { YYACCEPT; /* wait for more lexer input */ }
+element: T_WAIT {
+    /* wait for more lexer input */
+    YYACCEPT;
+}
 | T_ERROR
 {
    /* an error occured in the scanner, the python exception must be set */
@ -200,8 +207,9 @@ element: T_WAIT { YYACCEPT; /* wait for more lexer input */ }
 }
 | T_ELEMENT_START
 {
-    /* $1 is a PyTuple (<tag>, <attrs>)
-       <tag> is a PyObject, <attrs> is a PyDict */
+    /* parsed HTML start tag (eg. <a href="blubb">)
+       $1 is a PyTuple (<tag>, <attrs>)
+       <tag> is a PyObject, <attrs> is a ListDict */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
@ -243,8 +251,9 @@ finish_start:
 }
 | T_ELEMENT_START_END
 {
-    /* $1 is a PyTuple (<tag>, <attrs>)
-       <tag> is a PyObject, <attrs> is a PyDict */
+    /* parsed HTML start-end tag (eg. <br/>)
+       $1 is a PyTuple (<tag>, <attrs>)
+       <tag> is a PyObject, <attrs> is a ListDict */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
@ -293,7 +302,8 @@ finish_start_end:
 }
 | T_ELEMENT_END
 {
-    /* $1 is a PyUnicode */
+    /* parsed HTML end tag (eg. </b>)
+       $1 is a PyUnicode with the tag name */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
@ -331,7 +341,8 @@ finish_end:
 }
 | T_COMMENT
 {
-    /* $1 is a PyUnicode */
+    /* parsed HTML comment (eg. <!-- bla -->)
+       $1 is a PyUnicode with the comment content */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
@ -373,7 +384,8 @@ finish_pi:
 }
 | T_CDATA
 {
-    /* $1 is a PyUnicode */
+    /* parsed HTML CDATA (eg. <![CDATA[spam and eggs ...]]>)
+       $1 is a PyUnicode with the CDATA content */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
@ -394,7 +406,8 @@ finish_cdata:
 }
 | T_DOCTYPE
 {
-    /* $1 is a PyUnicode */
+    /* parsed HTML doctype (eg. <!DOCTYPE imadoofus system>)
+       $1 is a PyUnicode with the doctype content */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
@ -419,7 +432,8 @@ finish_doctype:
 }
 | T_SCRIPT
 {
-    /* $1 is a PyUnicode */
+    /* parsed HTML script content (plus end tag which is omitted)
+       $1 is a PyUnicode with the script content */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
@ -427,6 +441,7 @@ finish_doctype:
    PyObject* script = PyUnicode_DecodeASCII("script", 6, "ignore");
    CHECK_ERROR((script == NULL), finish_script);
    CALLBACK(ud, "characters", "O", $1, finish_script);
+    /* emit the omitted end tag */
    CALLBACK(ud, "end_element", "O", script, finish_script);
    CHECK_PARSER_ERROR(ud, finish_script);
 finish_script:
@ -444,7 +459,8 @@ finish_script:
 }
 | T_STYLE
 {
-    /* $1 is a PyUnicode */
+    /* parsed HTML style content (plus end tag which is omitted)
+       $1 is a PyUnicode with the style content */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
@ -452,6 +468,7 @@ finish_script:
    PyObject* style = PyUnicode_DecodeASCII("style", 5, "ignore");
    CHECK_ERROR((style == NULL), finish_style);
    CALLBACK(ud, "characters", "O", $1, finish_style);
+    /* emit the omitted end tag */
    CALLBACK(ud, "end_element", "O", style, finish_style);
    CHECK_PARSER_ERROR(ud, finish_style);
 finish_style:
@ -469,8 +486,10 @@ finish_style:
 }
 | T_TEXT
 {
-    /* $1 is a PyUnicode */
-    /* Remember this is also called as a lexer error fallback */
+    /* parsed HTML text data
+       $1 is a PyUnicode with the text */
+    /* Remember this is also called as a lexer fallback when no
+       HTML structure element could be recognized. */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
@ -785,12 +804,14 @@ static PyObject* parser_debug (parser_object* self, PyObject* args) {
 }


+/* get SAX handler object */
 static PyObject* parser_gethandler (parser_object* self, void* closure) {
    Py_INCREF(self->handler);
    return self->handler;
 }


+/* set SAX handler object */
 static int parser_sethandler (parser_object* self, PyObject* value, void* closure) {
    if (value == NULL) {
       PyErr_SetString(PyExc_TypeError, "Cannot delete parser handler");
@ -804,12 +825,14 @@ static int parser_sethandler (parser_object* self, PyObject* value, void* closur
 }


+/* get parser encoding */
 static PyObject* parser_getencoding (parser_object* self, void* closure) {
    Py_INCREF(self->encoding);
    return self->encoding;
 }


+/* set parser encoding */
 static int parser_setencoding (parser_object* self, PyObject* value, void* closure) {
    if (value == NULL) {
        PyErr_SetString(PyExc_TypeError, "Cannot delete encoding");
@ -826,12 +849,14 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
 }


+/* get parser doctype */
 static PyObject* parser_getdoctype (parser_object* self, void* closure) {
    Py_INCREF(self->doctype);
    return self->doctype;
 }


+/* set parser doctype */
 static int parser_setdoctype (parser_object* self, PyObject* value, void* closure) {
    if (value == NULL) {
        PyErr_SetString(PyExc_TypeError, "Cannot delete doctype");
@ -931,25 +956,6 @@ static PyTypeObject parser_type = {
 };


-/* python module interface 
-     "Create a new HTML parser object with handler (which may be None).\n"
-     "\n"
-     "Used callbacks (they don't have to be defined) of a handler are:\n"
-     "comment(data): <!--data-->\n"
-     "start_element(tag, attrs): <tag {attr1:value1,attr2:value2,..}>\n"
-     "end_element(tag): </tag>\n"
-     "doctype(data): <!DOCTYPE data?>\n"
-     "pi(name, data=None): <?name data?>\n"
-     "cdata(data): <![CDATA[data]]>\n"
-     "characters(data): data\n"
-     "\n"
-     "Additionally, there are error and warning callbacks:\n"
-     "error(msg)\n"
-     "warning(msg)\n"
-     "fatal_error(msg)\n"},
-
-*/
-
 static PyMethodDef htmlsax_methods[] = {
    {NULL} /* Sentinel */
 };