diff --git a/MANIFEST.in b/MANIFEST.in
index 7ca73b44..e337532c 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -7,11 +7,6 @@ include cgi-bin/lc.wsgi cgi-bin/README
include Makefile
include cgi-bin/lconline/*.html cgi-bin/lconline/*.de cgi-bin/lconline/*.en
include cgi-bin/lconline/*.js cgi-bin/lconline/*.css cgi-bin/lconline/*.ico
-include linkcheck/HtmlParser/Makefile
-include linkcheck/HtmlParser/htmllex.l
-include linkcheck/HtmlParser/htmlparse.y
-include linkcheck/HtmlParser/*.h
-include linkcheck/HtmlParser/fixincludes.awk
include po/*.po po/*.mo po/*.pot po/Makefile
include doc/*.example doc/*.txt
include doc/html/*.ico
diff --git a/Makefile b/Makefile
index 7b98489c..4ddecb91 100644
--- a/Makefile
+++ b/Makefile
@@ -53,7 +53,6 @@ all:
clean:
-$(PYTHON) setup.py clean --all
rm -f $(LAPPNAME)-out.* *-stamp*
- $(MAKE) -C linkcheck/HtmlParser clean
find . -name '*.py[co]' -exec rm -f {} \;
find . -name '*.bak' -exec rm -f {} \;
find . -depth -name '__pycache__' -exec rm -rf {} \;
@@ -75,9 +74,7 @@ locale:
# to build in the current directory
localbuild: MANIFEST locale
- $(MAKE) -C linkcheck/HtmlParser
$(PYTHON) setup.py build
- cp -f build/lib.$(PLATFORM)-$(PYVER)*/linkcheck/HtmlParser/htmlsax*.so linkcheck/HtmlParser
release: distclean releasecheck filescheck
$(MAKE) dist sign register upload homepage tag changelog deb
diff --git a/linkcheck/HtmlParser/Makefile b/linkcheck/HtmlParser/Makefile
deleted file mode 100644
index d7bcf8e9..00000000
--- a/linkcheck/HtmlParser/Makefile
+++ /dev/null
@@ -1,29 +0,0 @@
-# This HTML parser needs flex >= 2.5.xx from http://lex.sf.net/ for
-# reentrant bison parser support and uses features of bison >= 3.0.x
-LEX = flex
-YACC = bison
-PYINCLUDE=-I/usr/include/python2.7
-
-all: htmllex.c htmlparse.c
-
-htmlsax.so: htmllex.o htmlparse.o s_util.o
- gcc -pthread -shared $^ -o htmlsax.so
-
-%.o: %.c
- gcc -g -std=c99 -O3 -Wall -pedantic -Wstrict-prototypes -fPIC -I. $(PYINCLUDE) -c $< -o $@
-
-htmlparse.h htmlparse.c: htmlparse.y htmlsax.h
- $(YACC) --output=htmlparse.c htmlparse.y
-
-htmllex.l: htmlparse.h
-
-htmllex.c: htmllex.l htmlsax.h
- $(LEX) htmllex.l
- awk -f fixincludes.awk htmllex.c > htmllex.c.fixed; mv -f htmllex.c.fixed htmllex.c
-
-clean:
- rm -f *.o *.so *.pyc *.pyo *.output
-
-distclean: clean
- rm -f htmlparse.c htmlparse.h htmllex.c
-
diff --git a/linkcheck/HtmlParser/__init__.py b/linkcheck/HtmlParser/__init__.py
index b3bcbc66..b7d07385 100644
--- a/linkcheck/HtmlParser/__init__.py
+++ b/linkcheck/HtmlParser/__init__.py
@@ -15,64 +15,7 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
-Fast HTML parser module written in C with the following features:
-
-- Reentrant
- As soon as any HTML string data is available, we try to feed it
- to the HTML parser. This means that the parser has to scan possible
- incomplete data, recognizing as much as it can. Incomplete trailing
- data is saved for subsequent calls, or it is just flushed into the
- output buffer with the flush() function.
- A reset() brings the parser back to its initial state, throwing away all
- buffered data.
-
-- Coping with HTML syntax errors
- The parser recognizes as much as it can and passes the rest
- of the data as TEXT tokens.
- The scanner only passes complete recognized HTML syntax elements to
- the parser. Invalid syntax elements are passed as TEXT. This way we do
- not need the bison error recovery.
- Incomplete data is rescanned the next time the parser calls yylex() or
- when it is being flush()ed.
-
- The following syntax errors will be recognized correctly:
-
- - Unquoted attribute values.
- - Missing beginning quote of attribute values.
- - Invalid "" end tags in script modus.
- - Missing ">" in tags.
- - Invalid characters in tag or attribute names.
-
- The following syntax errors will not be recognized:
-
- - Missing end quote of attribute values. On the TODO list.
- - Unknown HTML tag or attribute names.
- - Invalid nesting of tags.
-
- Additionally the parser has the following features:
-
- - NULL bytes are changed into spaces
- - inside a is matched, but not itself */
-case 47:
-/* rule 47 can match eol */
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-YY_LINENO_REWIND_TO(yy_bp + 1);
-yyg->yy_c_buf_p = yy_cp = yy_bp + 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 556 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 48:
-/* rule 48 can match eol */
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-YY_LINENO_REWIND_TO(yy_cp - 1);
-yyg->yy_c_buf_p = yy_cp -= 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 561 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 49:
-/* rule 49 can match eol */
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-YY_LINENO_REWIND_TO(yy_cp - 1);
-yyg->yy_c_buf_p = yy_cp -= 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 566 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 50:
-/* rule 50 can match eol */
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-YY_LINENO_REWIND_TO(yy_cp - 1);
-yyg->yy_c_buf_p = yy_cp -= 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 571 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 51:
-/* rule 51 can match eol */
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-YY_LINENO_REWIND_TO(yy_cp - 1);
-yyg->yy_c_buf_p = yy_cp -= 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 576 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 52:
-/* rule 52 can match eol */
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-YY_LINENO_REWIND_TO(yy_cp - 1);
-yyg->yy_c_buf_p = yy_cp -= 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 581 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 53:
-/* rule 53 can match eol */
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-YY_LINENO_REWIND_TO(yy_cp - 1);
-yyg->yy_c_buf_p = yy_cp -= 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 586 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 54:
-/* rule 54 can match eol */
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-YY_LINENO_REWIND_TO(yy_cp - 1);
-yyg->yy_c_buf_p = yy_cp -= 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 591 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 55:
-/* rule 55 can match eol */
-YY_RULE_SETUP
-#line 596 "htmllex.l"
-{
- return T_WAIT;
-}
- YY_BREAK
-case 56:
-YY_RULE_SETUP
-#line 600 "htmllex.l"
-{
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_SCRIPT_APOS_ESC);
-}
- YY_BREAK
-case 57:
-/* rule 57 can match eol */
-YY_RULE_SETUP
-#line 606 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 58:
-YY_RULE_SETUP
-#line 611 "htmllex.l"
-{
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_SCRIPT);
-}
- YY_BREAK
-case 59:
-/* rule 59 can match eol */
-YY_RULE_SETUP
-#line 617 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_SCRIPT_APOS);
-}
- YY_BREAK
-case 60:
-YY_RULE_SETUP
-#line 623 "htmllex.l"
-{
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_SCRIPT_STRING_ESC);
-}
- YY_BREAK
-case 61:
-/* rule 61 can match eol */
-YY_RULE_SETUP
-#line 629 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 62:
-YY_RULE_SETUP
-#line 634 "htmllex.l"
-{
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_SCRIPT);
-}
- YY_BREAK
-case 63:
-/* rule 63 can match eol */
-YY_RULE_SETUP
-#line 640 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_SCRIPT_STRING);
-}
- YY_BREAK
-case 64:
-YY_RULE_SETUP
-#line 646 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 65:
-/* rule 65 can match eol */
-YY_RULE_SETUP
-#line 651 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_SCRIPT);
-}
- YY_BREAK
-case 66:
-/* rule 66 can match eol */
-YY_RULE_SETUP
-#line 657 "htmllex.l"
-{
- return T_WAIT;
-}
- YY_BREAK
-case 67:
-/* rule 67 can match eol */
-YY_RULE_SETUP
-#line 661 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 68:
-YY_RULE_SETUP
-#line 666 "htmllex.l"
-{
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_SCRIPT);
-}
- YY_BREAK
-/*********************** STYLE ************************/
-case 69:
-/* rule 69 can match eol */
-YY_RULE_SETUP
-#line 673 "htmllex.l"
-{
- UPDATE_LINE;
- SETLVAL_UNICODE;
- BEGIN(INITIAL);
- RETURN(T_STYLE);
-}
- YY_BREAK
-case 70:
-/* rule 70 can match eol */
-YY_RULE_SETUP
-#line 680 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-/* this is so shitty */
-case 71:
-/* rule 71 can match eol */
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-YY_LINENO_REWIND_TO(yy_bp + 1);
-yyg->yy_c_buf_p = yy_cp = yy_bp + 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 686 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 72:
-/* rule 72 can match eol */
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-YY_LINENO_REWIND_TO(yy_cp - 1);
-yyg->yy_c_buf_p = yy_cp -= 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 691 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 73:
-/* rule 73 can match eol */
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-YY_LINENO_REWIND_TO(yy_cp - 1);
-yyg->yy_c_buf_p = yy_cp -= 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 696 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 74:
-/* rule 74 can match eol */
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-YY_LINENO_REWIND_TO(yy_cp - 1);
-yyg->yy_c_buf_p = yy_cp -= 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 701 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 75:
-/* rule 75 can match eol */
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-YY_LINENO_REWIND_TO(yy_cp - 1);
-yyg->yy_c_buf_p = yy_cp -= 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 706 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 76:
-/* rule 76 can match eol */
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-YY_LINENO_REWIND_TO(yy_cp - 1);
-yyg->yy_c_buf_p = yy_cp -= 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 711 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 77:
-/* rule 77 can match eol */
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-YY_LINENO_REWIND_TO(yy_cp - 1);
-yyg->yy_c_buf_p = yy_cp -= 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 716 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 78:
-/* rule 78 can match eol */
-YY_RULE_SETUP
-#line 721 "htmllex.l"
-{
- return T_WAIT;
-}
- YY_BREAK
-/*********************** ATTRS ************************/
-case 79:
-YY_RULE_SETUP
-#line 726 "htmllex.l"
-{
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_ATTR2);
-}
- YY_BREAK
-case 80:
-YY_RULE_SETUP
-#line 732 "htmllex.l"
-{
- UPDATE_COLUMN;
- FLUSH_ATTRS;
- BEGIN(INITIAL);
- SET_ATTR_LVAL;
- RETURN(T_ELEMENT_START_END);
-}
- YY_BREAK
-case 81:
-/* rule 81 can match eol */
-YY_RULE_SETUP
-#line 740 "htmllex.l"
-{
- UPDATE_LINE;
-}
- YY_BREAK
-case 82:
-YY_RULE_SETUP
-#line 744 "htmllex.l"
-{
- return T_WAIT;
-}
- YY_BREAK
-case 83:
-YY_RULE_SETUP
-#line 748 "htmllex.l"
-{
- UPDATE_COLUMN;
- FLUSH_ATTRS;
- SCRIPT_CHECK;
- SET_ATTR_LVAL;
- RETURN(T_ELEMENT_START);
-}
- YY_BREAK
-case 84:
-YY_RULE_SETUP
-#line 756 "htmllex.l"
-{
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 85:
-/* rule 85 can match eol */
-YY_RULE_SETUP
-#line 761 "htmllex.l"
-{
- /* Line continuations */
- UPDATE_LINE;
-}
- YY_BREAK
-case 86:
-YY_RULE_SETUP
-#line 766 "htmllex.l"
-{
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 87:
-YY_RULE_SETUP
-#line 771 "htmllex.l"
-{
- return T_WAIT;
-}
- YY_BREAK
-case 88:
-/* rule 88 can match eol */
-YY_RULE_SETUP
-#line 775 "htmllex.l"
-{
- UPDATE_LINE;
- BEGIN(S_ATTR3);
-}
- YY_BREAK
-case 89:
-/* rule 89 can match eol */
-YY_RULE_SETUP
-#line 780 "htmllex.l"
-{
- UPDATE_LINE;
- LOWER_TMP;
- PYSTRING_TMP_UNICODE(yyextra->tmp_attrname);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- BEGIN(S_ATTR4);
-}
- YY_BREAK
-case 90:
-YY_RULE_SETUP
-#line 788 "htmllex.l"
-{
- UPDATE_COLUMN;
- LOWER_TMP;
- PYSTRING_TMP_UNICODE(yyextra->tmp_attrname);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- if (yyextra->tmp_attrval != NULL) return T_ERROR;
- CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname, Py_None));
- Py_CLEAR(yyextra->tmp_attrname);
- APPEND_TO_TMP(yyleng);
- BEGIN(S_ATTR2);
-}
- YY_BREAK
-case 91:
-/* rule 91 can match eol */
-YY_RULE_SETUP
-#line 800 "htmllex.l"
-{
- /* this also skips whitespace! */
- UPDATE_LINE;
-}
- YY_BREAK
-case 92:
-YY_RULE_SETUP
-#line 805 "htmllex.l"
-{
- /* backslash escapes seen at freecode.com */
- UPDATE_COLUMN;
- BEGIN(S_STRING);
-}
- YY_BREAK
-case 93:
-YY_RULE_SETUP
-#line 811 "htmllex.l"
-{
- UPDATE_COLUMN;
- BEGIN(S_STRING);
-}
- YY_BREAK
-case 94:
-YY_RULE_SETUP
-#line 816 "htmllex.l"
-{
- UPDATE_COLUMN;
- BEGIN(S_APOSSTRING);
-}
- YY_BREAK
-case 95:
-YY_RULE_SETUP
-#line 821 "htmllex.l"
-{
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_ATTR5);
-}
- YY_BREAK
-case 96:
-YY_RULE_SETUP
-#line 827 "htmllex.l"
-{
- UPDATE_COLUMN;
- PYSTRING_TMP_UNICODE(yyextra->tmp_attrval);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
- "O", yyextra->tmp_attrval));
- CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
- yyextra->tmp_attrval));
- Py_CLEAR(yyextra->tmp_attrname);
- Py_CLEAR(yyextra->tmp_attrval);
- SCRIPT_CHECK;
- SET_ATTR_LVAL;
- RETURN(T_ELEMENT_START);
-}
- YY_BREAK
-case 97:
-/* rule 97 can match eol */
-YY_RULE_SETUP
-#line 842 "htmllex.l"
-{
- UPDATE_LINE;
-}
- YY_BREAK
-case 98:
-YY_RULE_SETUP
-#line 846 "htmllex.l"
-{
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 99:
-YY_RULE_SETUP
-#line 851 "htmllex.l"
-{
- UPDATE_COLUMN;
- PYSTRING_TMP_UNICODE(yyextra->tmp_attrval);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
- "O", yyextra->tmp_attrval));
- CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
- yyextra->tmp_attrval));
- Py_CLEAR(yyextra->tmp_attrname);
- Py_CLEAR(yyextra->tmp_attrval);
- SCRIPT_CHECK;
- SET_ATTR_LVAL;
- RETURN(T_ELEMENT_START);
-}
- YY_BREAK
-case 100:
-YY_RULE_SETUP
-#line 866 "htmllex.l"
-{
- UPDATE_COLUMN;
- PYSTRING_TMP_UNICODE(yyextra->tmp_attrval);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
- "O", yyextra->tmp_attrval));
- CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
- yyextra->tmp_attrval));
- Py_CLEAR(yyextra->tmp_attrname);
- Py_CLEAR(yyextra->tmp_attrval);
- BEGIN(INITIAL);
- SET_ATTR_LVAL;
- RETURN(T_ELEMENT_START_END);
-}
- YY_BREAK
-case 101:
-YY_RULE_SETUP
-#line 881 "htmllex.l"
-{
- UPDATE_COLUMN;
- PYSTRING_TMP_UNICODE(yyextra->tmp_attrval);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
- "O", yyextra->tmp_attrval));
- CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
- yyextra->tmp_attrval));
- Py_CLEAR(yyextra->tmp_attrname);
- Py_CLEAR(yyextra->tmp_attrval);
- BEGIN(S_ATTR1);
-}
- YY_BREAK
-case 102:
-/* rule 102 can match eol */
-YY_RULE_SETUP
-#line 894 "htmllex.l"
-{
- UPDATE_LINE;
- PYSTRING_TMP_UNICODE(yyextra->tmp_attrval);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
- "O", yyextra->tmp_attrval));
- CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
- yyextra->tmp_attrval));
- Py_CLEAR(yyextra->tmp_attrname);
- Py_CLEAR(yyextra->tmp_attrval);
- BEGIN(S_ATTR1);
-}
- YY_BREAK
-case 103:
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-yyg->yy_c_buf_p = yy_cp = yy_bp + 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 907 "htmllex.l"
-{
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_APOSSTRING_ESC);
-}
- YY_BREAK
-case 104:
-YY_RULE_SETUP
-#line 913 "htmllex.l"
-{
- return T_WAIT;
-}
- YY_BREAK
-case 105:
-YY_RULE_SETUP
-#line 917 "htmllex.l"
-{
- UPDATE_COLUMN;
- PYSTRING_TMP_UNICODE(yyextra->tmp_attrval);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
- "O", yyextra->tmp_attrval));
- CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
- yyextra->tmp_attrval));
- Py_CLEAR(yyextra->tmp_attrname);
- Py_CLEAR(yyextra->tmp_attrval);
- BEGIN(S_ATTR1);
-}
- YY_BREAK
-case 106:
-/* rule 106 can match eol */
-YY_RULE_SETUP
-#line 930 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 107:
-/* rule 107 can match eol */
-YY_RULE_SETUP
-#line 936 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_APOSSTRING);
-}
- YY_BREAK
-case 108:
-YY_RULE_SETUP
-#line 942 "htmllex.l"
-{
- UPDATE_COLUMN;
- PYSTRING_TMP_UNICODE(yyextra->tmp_attrval);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
- "O", yyextra->tmp_attrval));
- CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
- yyextra->tmp_attrval));
- Py_CLEAR(yyextra->tmp_attrname);
- Py_CLEAR(yyextra->tmp_attrval);
- BEGIN(S_ATTR1);
-}
- YY_BREAK
-case 109:
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-yyg->yy_c_buf_p = yy_cp = yy_bp + 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 955 "htmllex.l"
-{
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_STRING_ESC);
-}
- YY_BREAK
-case 110:
-/* rule 110 can match eol */
-YY_RULE_SETUP
-#line 961 "htmllex.l"
-{
- UPDATE_LINE;
-}
- YY_BREAK
-case 111:
-YY_RULE_SETUP
-#line 965 "htmllex.l"
-{
- return T_WAIT;
-}
- YY_BREAK
-case 112:
-/* rule 112 can match eol */
-YY_RULE_SETUP
-#line 969 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 113:
-/* rule 113 can match eol */
-YY_RULE_SETUP
-#line 974 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_STRING);
-}
- YY_BREAK
-/*********************** TAGEND ************************/
-case 114:
-/* rule 114 can match eol */
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-YY_LINENO_REWIND_TO(yy_cp - 1);
-yyg->yy_c_buf_p = yy_cp -= 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 982 "htmllex.l"
-{
- UPDATE_LINE;
- BEGIN(S_TAGEND);
-}
- YY_BREAK
-case 115:
-YY_RULE_SETUP
-#line 987 "htmllex.l"
-{
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
-}
- YY_BREAK
-case 116:
-/* rule 116 can match eol */
-YY_RULE_SETUP
-#line 992 "htmllex.l"
-{
- UPDATE_LINE;
- LOWER_TMP;
- SETLVAL_ASCII;
- BEGIN(INITIAL);
- RETURN(T_ELEMENT_END);
-}
- YY_BREAK
-case 117:
-/* rule 117 can match eol */
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-YY_LINENO_REWIND_TO(yy_cp - 1);
-yyg->yy_c_buf_p = yy_cp -= 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 1000 "htmllex.l"
-{
- UPDATE_LINE;
- LOWER_TMP;
- SETLVAL_ASCII;
- BEGIN(S_TAGEND);
- RETURN(T_ELEMENT_END);
-}
- YY_BREAK
-case 118:
-/* rule 118 can match eol */
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-YY_LINENO_REWIND_TO(yy_cp - 1);
-yyg->yy_c_buf_p = yy_cp -= 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 1008 "htmllex.l"
-{
- UPDATE_LINE;
- LOWER_TMP;
- SETLVAL_ASCII;
- CHECK_NULL(yyextra->tmp_attrs = PyObject_CallObject(yyextra->list_dict, NULL));
- BEGIN(S_TAGSTART);
- RETURN(T_ELEMENT_END);
-}
- YY_BREAK
-case 119:
-/* rule 119 can match eol */
-YY_RULE_SETUP
-#line 1017 "htmllex.l"
-{
- UPDATE_LINE;
- /* ignore any trailing garbage of this end tag */
- BEGIN(S_TAGEND2);
-}
- YY_BREAK
-case 120:
-/* rule 120 can match eol */
-YY_RULE_SETUP
-#line 1023 "htmllex.l"
-{
- return T_WAIT;
-}
- YY_BREAK
-case 121:
-YY_RULE_SETUP
-#line 1027 "htmllex.l"
-{
- UPDATE_COLUMN;
- LOWER_TMP;
- SETLVAL_ASCII;
- BEGIN(INITIAL);
- RETURN(T_ELEMENT_END);
-}
- YY_BREAK
-case 122:
-/* rule 122 can match eol */
-YY_RULE_SETUP
-#line 1035 "htmllex.l"
-{
- UPDATE_LINE;
-}
- YY_BREAK
-case 123:
-/* rule 123 can match eol */
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-YY_LINENO_REWIND_TO(yy_cp - 1);
-yyg->yy_c_buf_p = yy_cp -= 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 1039 "htmllex.l"
-{
- UPDATE_LINE;
- LOWER_TMP;
- SETLVAL_ASCII;
- BEGIN(S_TAGEND);
- RETURN(T_ELEMENT_END);
-}
- YY_BREAK
-case 124:
-/* rule 124 can match eol */
-*yy_cp = yyg->yy_hold_char; /* undo effects of setting up yytext */
-YY_LINENO_REWIND_TO(yy_cp - 1);
-yyg->yy_c_buf_p = yy_cp -= 1;
-YY_DO_BEFORE_ACTION; /* set up yytext again */
-YY_RULE_SETUP
-#line 1047 "htmllex.l"
-{
- UPDATE_LINE;
- LOWER_TMP;
- SETLVAL_ASCII;
- CHECK_NULL(yyextra->tmp_attrs = PyObject_CallObject(yyextra->list_dict, NULL));
- BEGIN(S_TAGSTART);
- RETURN(T_ELEMENT_END);
-}
- YY_BREAK
-case 125:
-/* rule 125 can match eol */
-YY_RULE_SETUP
-#line 1056 "htmllex.l"
-{
- return T_WAIT;
-}
- YY_BREAK
-/*********************** TEXT ************************/
-case 126:
-/* rule 126 can match eol */
-YY_RULE_SETUP
-#line 1060 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
- SETLVAL_UNICODE;
- RETURN(T_TEXT);
-}
- YY_BREAK
-case 127:
-YY_RULE_SETUP
-#line 1067 "htmllex.l"
-{
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- SETLVAL_UNICODE;
- RETURN(T_TEXT);
-}
- YY_BREAK
-case 128:
-/* rule 128 can match eol */
-YY_RULE_SETUP
-#line 1074 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
- SETLVAL_UNICODE;
- RETURN(T_TEXT);
-}
- YY_BREAK
-case 129:
-/* rule 129 can match eol */
-YY_RULE_SETUP
-#line 1080 "htmllex.l"
-{
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
- SETLVAL_UNICODE;
- RETURN(T_TEXT);
-}
- YY_BREAK
-case 130:
-/* rule 130 can match eol */
-YY_RULE_SETUP
-#line 1087 "htmllex.l"
-{
- return T_WAIT;
-}
- YY_BREAK
-case 131:
-YY_RULE_SETUP
-#line 1091 "htmllex.l"
-ECHO;
- YY_BREAK
-#line 4711 "htmllex.c"
-
- case YY_END_OF_BUFFER:
- {
- /* Amount of text matched not including the EOB char. */
- int yy_amount_of_matched_text = (int) (yy_cp - yyg->yytext_ptr) - 1;
-
- /* Undo the effects of YY_DO_BEFORE_ACTION. */
- *yy_cp = yyg->yy_hold_char;
- YY_RESTORE_YY_MORE_OFFSET
-
- if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW )
- {
- /* We're scanning a new file or input source. It's
- * possible that this happened because the user
- * just pointed yyin at a new source and called
- * yylex(). If so, then we have to assure
- * consistency between YY_CURRENT_BUFFER and our
- * globals. Here is the right place to do so, because
- * this is the first action (other than possibly a
- * back-up) that will match for the new input source.
- */
- yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
-/* %if-c-only */
- YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin;
-/* %endif */
-/* %if-c++-only */
-/* %endif */
- YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL;
- }
-
- /* Note that here we test for yy_c_buf_p "<=" to the position
- * of the first EOB in the buffer, since yy_c_buf_p will
- * already have been incremented past the NUL character
- * (since all states make transitions on EOB to the
- * end-of-buffer state). Contrast this with the test
- * in input().
- */
- if ( yyg->yy_c_buf_p <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] )
- { /* This was really a NUL. */
- yy_state_type yy_next_state;
-
- yyg->yy_c_buf_p = yyg->yytext_ptr + yy_amount_of_matched_text;
-
- yy_current_state = yy_get_previous_state( yyscanner );
-
- /* Okay, we're now positioned to make the NUL
- * transition. We couldn't have
- * yy_get_previous_state() go ahead and do it
- * for us because it doesn't know how to deal
- * with the possibility of jamming (and we don't
- * want to build jamming into it because then it
- * will run more slowly).
- */
-
- yy_next_state = yy_try_NUL_trans( yy_current_state , yyscanner);
-
- yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
-
- if ( yy_next_state )
- {
- /* Consume the NUL. */
- yy_cp = ++yyg->yy_c_buf_p;
- yy_current_state = yy_next_state;
- goto yy_match;
- }
-
- else
- {
-/* %% [14.0] code to do back-up for compressed tables and set up yy_cp goes here */
- yy_cp = yyg->yy_c_buf_p;
- goto yy_find_action;
- }
- }
-
- else switch ( yy_get_next_buffer( yyscanner ) )
- {
- case EOB_ACT_END_OF_FILE:
- {
- yyg->yy_did_buffer_switch_on_eof = 0;
-
- if ( yywrap(yyscanner ) )
- {
- /* Note: because we've taken care in
- * yy_get_next_buffer() to have set up
- * yytext, we can now set up
- * yy_c_buf_p so that if some total
- * hoser (like flex itself) wants to
- * call the scanner after we return the
- * YY_NULL, it'll still work - another
- * YY_NULL will get returned.
- */
- yyg->yy_c_buf_p = yyg->yytext_ptr + YY_MORE_ADJ;
-
- yy_act = YY_STATE_EOF(YY_START);
- goto do_action;
- }
-
- else
- {
- if ( ! yyg->yy_did_buffer_switch_on_eof )
- YY_NEW_FILE;
- }
- break;
- }
-
- case EOB_ACT_CONTINUE_SCAN:
- yyg->yy_c_buf_p =
- yyg->yytext_ptr + yy_amount_of_matched_text;
-
- yy_current_state = yy_get_previous_state( yyscanner );
-
- yy_cp = yyg->yy_c_buf_p;
- yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
- goto yy_match;
-
- case EOB_ACT_LAST_MATCH:
- yyg->yy_c_buf_p =
- &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars];
-
- yy_current_state = yy_get_previous_state( yyscanner );
-
- yy_cp = yyg->yy_c_buf_p;
- yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
- goto yy_find_action;
- }
- break;
- }
-
- default:
- YY_FATAL_ERROR(
- "fatal flex scanner internal error--no action found" );
- } /* end of action switch */
- } /* end of scanning one token */
- } /* end of user's declarations */
-} /* end of yylex */
-/* %ok-for-header */
-
-/* %if-c++-only */
-/* %not-for-header */
-
-/* %ok-for-header */
-
-/* %endif */
-
-/* yy_get_next_buffer - try to read in a new buffer
- *
- * Returns a code representing an action:
- * EOB_ACT_LAST_MATCH -
- * EOB_ACT_CONTINUE_SCAN - continue scanning from current position
- * EOB_ACT_END_OF_FILE - end of file
- */
-/* %if-c-only */
-static int yy_get_next_buffer (yyscan_t yyscanner)
-/* %endif */
-/* %if-c++-only */
-/* %endif */
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
- char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf;
- char *source = yyg->yytext_ptr;
- yy_size_t number_to_move, i;
- int ret_val;
-
- if ( yyg->yy_c_buf_p > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] )
- YY_FATAL_ERROR(
- "fatal flex scanner internal error--end of buffer missed" );
-
- if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 )
- { /* Don't try to fill the buffer, so this is an EOF. */
- if ( yyg->yy_c_buf_p - yyg->yytext_ptr - YY_MORE_ADJ == 1 )
- {
- /* We matched a single character, the EOB, so
- * treat this as a final EOF.
- */
- return EOB_ACT_END_OF_FILE;
- }
-
- else
- {
- /* We matched some text prior to the EOB, first
- * process it.
- */
- return EOB_ACT_LAST_MATCH;
- }
- }
-
- /* Try to read more data. */
-
- /* First move last chars to start of buffer. */
- number_to_move = (yy_size_t) (yyg->yy_c_buf_p - yyg->yytext_ptr) - 1;
-
- for ( i = 0; i < number_to_move; ++i )
- *(dest++) = *(source++);
-
- if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING )
- /* don't do the read, it's not guaranteed to return an EOF,
- * just force an EOF
- */
- YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars = 0;
-
- else
- {
- int num_to_read =
- YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1;
-
- while ( num_to_read <= 0 )
- { /* Not enough room in the buffer - grow it. */
-
- /* just a shorter name for the current buffer */
- YY_BUFFER_STATE b = YY_CURRENT_BUFFER_LVALUE;
-
- int yy_c_buf_p_offset =
- (int) (yyg->yy_c_buf_p - b->yy_ch_buf);
-
- if ( b->yy_is_our_buffer )
- {
- int new_size = b->yy_buf_size * 2;
-
- if ( new_size <= 0 )
- b->yy_buf_size += b->yy_buf_size / 8;
- else
- b->yy_buf_size *= 2;
-
- b->yy_ch_buf = (char *)
- /* Include room in for 2 EOB chars. */
- yyrealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 ,yyscanner );
- }
- else
- /* Can't grow it, we don't own it. */
- b->yy_ch_buf = NULL;
-
- if ( ! b->yy_ch_buf )
- YY_FATAL_ERROR(
- "fatal error - scanner input buffer overflow" );
-
- yyg->yy_c_buf_p = &b->yy_ch_buf[yy_c_buf_p_offset];
-
- num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size -
- number_to_move - 1;
-
- }
-
- if ( num_to_read > YY_READ_BUF_SIZE )
- num_to_read = YY_READ_BUF_SIZE;
-
- /* Read in more data. */
- YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]),
- yyg->yy_n_chars, num_to_read );
-
- YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
- }
-
- if ( yyg->yy_n_chars == 0 )
- {
- if ( number_to_move == YY_MORE_ADJ )
- {
- ret_val = EOB_ACT_END_OF_FILE;
- yyrestart(yyin ,yyscanner);
- }
-
- else
- {
- ret_val = EOB_ACT_LAST_MATCH;
- YY_CURRENT_BUFFER_LVALUE->yy_buffer_status =
- YY_BUFFER_EOF_PENDING;
- }
- }
-
- else
- ret_val = EOB_ACT_CONTINUE_SCAN;
-
- if ((int) (yyg->yy_n_chars + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) {
- /* Extend the array by 50%, plus the number we really need. */
- int new_size = yyg->yy_n_chars + number_to_move + (yyg->yy_n_chars >> 1);
- YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) yyrealloc((void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ,yyscanner );
- if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf )
- YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" );
- }
-
- yyg->yy_n_chars += number_to_move;
- YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] = YY_END_OF_BUFFER_CHAR;
- YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] = YY_END_OF_BUFFER_CHAR;
-
- yyg->yytext_ptr = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0];
-
- return ret_val;
-}
-
-/* yy_get_previous_state - get the state just before the EOB char was reached */
-
-/* %if-c-only */
-/* %not-for-header */
-
- static yy_state_type yy_get_previous_state (yyscan_t yyscanner)
-/* %endif */
-/* %if-c++-only */
-/* %endif */
-{
- yy_state_type yy_current_state;
- char *yy_cp;
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
-
-/* %% [15.0] code to get the start state into yy_current_state goes here */
- yy_current_state = yyg->yy_start;
-
- for ( yy_cp = yyg->yytext_ptr + YY_MORE_ADJ; yy_cp < yyg->yy_c_buf_p; ++yy_cp )
- {
-/* %% [16.0] code to find the next state goes here */
- yy_current_state = yy_nxt[yy_current_state][(*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1)];
- if ( yy_accept[yy_current_state] )
- {
- yyg->yy_last_accepting_state = yy_current_state;
- yyg->yy_last_accepting_cpos = yy_cp;
- }
- }
-
- return yy_current_state;
-}
-
-/* yy_try_NUL_trans - try to make a transition on the NUL character
- *
- * synopsis
- * next_state = yy_try_NUL_trans( current_state );
- */
-/* %if-c-only */
- static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state , yyscan_t yyscanner)
-/* %endif */
-/* %if-c++-only */
-/* %endif */
-{
- int yy_is_jam;
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* This var may be unused depending upon options. */
-/* %% [17.0] code to find the next state, and perhaps do backing up, goes here */
- char *yy_cp = yyg->yy_c_buf_p;
-
- yy_current_state = yy_nxt[yy_current_state][1];
- yy_is_jam = (yy_current_state <= 0);
-
- if ( ! yy_is_jam )
- {
- if ( yy_accept[yy_current_state] )
- {
- yyg->yy_last_accepting_state = yy_current_state;
- yyg->yy_last_accepting_cpos = yy_cp;
- }
- }
-
- (void)yyg;
- return yy_is_jam ? 0 : yy_current_state;
-}
-
-#ifndef YY_NO_UNPUT
-/* %if-c-only */
-
-/* %endif */
-#endif
-
-/* %if-c-only */
-#ifndef YY_NO_INPUT
-#ifdef __cplusplus
- static int yyinput (yyscan_t yyscanner)
-#else
- static int input (yyscan_t yyscanner)
-#endif
-
-/* %endif */
-/* %if-c++-only */
-/* %endif */
-{
- int c;
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
-
- *yyg->yy_c_buf_p = yyg->yy_hold_char;
-
- if ( *yyg->yy_c_buf_p == YY_END_OF_BUFFER_CHAR )
- {
- /* yy_c_buf_p now points to the character we want to return.
- * If this occurs *before* the EOB characters, then it's a
- * valid NUL; if not, then we've hit the end of the buffer.
- */
- if ( yyg->yy_c_buf_p < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] )
- /* This was really a NUL. */
- *yyg->yy_c_buf_p = '\0';
-
- else
- { /* need more input */
- int offset = yyg->yy_c_buf_p - yyg->yytext_ptr;
- ++yyg->yy_c_buf_p;
-
- switch ( yy_get_next_buffer( yyscanner ) )
- {
- case EOB_ACT_LAST_MATCH:
- /* This happens because yy_g_n_b()
- * sees that we've accumulated a
- * token and flags that we need to
- * try matching the token before
- * proceeding. But for input(),
- * there's no matching to consider.
- * So convert the EOB_ACT_LAST_MATCH
- * to EOB_ACT_END_OF_FILE.
- */
-
- /* Reset buffer status. */
- yyrestart(yyin ,yyscanner);
-
- /*FALLTHROUGH*/
-
- case EOB_ACT_END_OF_FILE:
- {
- if ( yywrap(yyscanner ) )
- return 0;
-
- if ( ! yyg->yy_did_buffer_switch_on_eof )
- YY_NEW_FILE;
-#ifdef __cplusplus
- return yyinput(yyscanner);
-#else
- return input(yyscanner);
-#endif
- }
-
- case EOB_ACT_CONTINUE_SCAN:
- yyg->yy_c_buf_p = yyg->yytext_ptr + offset;
- break;
- }
- }
- }
-
- c = *(unsigned char *) yyg->yy_c_buf_p; /* cast for 8-bit char's */
- *yyg->yy_c_buf_p = '\0'; /* preserve yytext */
- yyg->yy_hold_char = *++yyg->yy_c_buf_p;
-
-/* %% [19.0] update BOL and yylineno */
-
- return c;
-}
-/* %if-c-only */
-#endif /* ifndef YY_NO_INPUT */
-/* %endif */
-
-/** Immediately switch to a different input stream.
- * @param input_file A readable stream.
- * @param yyscanner The scanner object.
- * @note This function does not reset the start condition to @c INITIAL .
- */
-/* %if-c-only */
- void yyrestart (FILE * input_file , yyscan_t yyscanner)
-/* %endif */
-/* %if-c++-only */
-/* %endif */
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
-
- if ( ! YY_CURRENT_BUFFER ){
- yyensure_buffer_stack (yyscanner);
- YY_CURRENT_BUFFER_LVALUE =
- yy_create_buffer(yyin,YY_BUF_SIZE ,yyscanner);
- }
-
- yy_init_buffer(YY_CURRENT_BUFFER,input_file ,yyscanner);
- yy_load_buffer_state(yyscanner );
-}
-
-/* %if-c++-only */
-/* %endif */
-
-/** Switch to a different input buffer.
- * @param new_buffer The new input buffer.
- * @param yyscanner The scanner object.
- */
-/* %if-c-only */
- void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner)
-/* %endif */
-/* %if-c++-only */
-/* %endif */
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
-
- /* TODO. We should be able to replace this entire function body
- * with
- * yypop_buffer_state();
- * yypush_buffer_state(new_buffer);
- */
- yyensure_buffer_stack (yyscanner);
- if ( YY_CURRENT_BUFFER == new_buffer )
- return;
-
- if ( YY_CURRENT_BUFFER )
- {
- /* Flush out information for old buffer. */
- *yyg->yy_c_buf_p = yyg->yy_hold_char;
- YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p;
- YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
- }
-
- YY_CURRENT_BUFFER_LVALUE = new_buffer;
- yy_load_buffer_state(yyscanner );
-
- /* We don't actually know whether we did this switch during
- * EOF (yywrap()) processing, but the only time this flag
- * is looked at is after yywrap() is called, so it's safe
- * to go ahead and always set it.
- */
- yyg->yy_did_buffer_switch_on_eof = 1;
-}
-
-/* %if-c-only */
-static void yy_load_buffer_state (yyscan_t yyscanner)
-/* %endif */
-/* %if-c++-only */
-/* %endif */
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
- yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
- yyg->yytext_ptr = yyg->yy_c_buf_p = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos;
-/* %if-c-only */
- yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file;
-/* %endif */
-/* %if-c++-only */
-/* %endif */
- yyg->yy_hold_char = *yyg->yy_c_buf_p;
-}
-
-/** Allocate and initialize an input buffer state.
- * @param file A readable stream.
- * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE.
- * @param yyscanner The scanner object.
- * @return the allocated buffer state.
- */
-/* %if-c-only */
- YY_BUFFER_STATE yy_create_buffer (FILE * file, int size , yyscan_t yyscanner)
-/* %endif */
-/* %if-c++-only */
-/* %endif */
-{
- YY_BUFFER_STATE b;
-
- b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state ) ,yyscanner );
- if ( ! b )
- YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" );
-
- b->yy_buf_size = (yy_size_t)size;
-
- /* yy_ch_buf has to be 2 characters longer than the size given because
- * we need to put in 2 end-of-buffer characters.
- */
- b->yy_ch_buf = (char *) yyalloc(b->yy_buf_size + 2 ,yyscanner );
- if ( ! b->yy_ch_buf )
- YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" );
-
- b->yy_is_our_buffer = 1;
-
- yy_init_buffer(b,file ,yyscanner);
-
- return b;
-}
-
-/* %if-c++-only */
-/* %endif */
-
-/** Destroy the buffer.
- * @param b a buffer created with yy_create_buffer()
- * @param yyscanner The scanner object.
- */
-/* %if-c-only */
- void yy_delete_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner)
-/* %endif */
-/* %if-c++-only */
-/* %endif */
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
-
- if ( ! b )
- return;
-
- if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */
- YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0;
-
- if ( b->yy_is_our_buffer )
- yyfree((void *) b->yy_ch_buf ,yyscanner );
-
- yyfree((void *) b ,yyscanner );
-}
-
-/* Initializes or reinitializes a buffer.
- * This function is sometimes called more than once on the same buffer,
- * such as during a yyrestart() or at EOF.
- */
-/* %if-c-only */
- static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file , yyscan_t yyscanner)
-/* %endif */
-/* %if-c++-only */
-/* %endif */
-
-{
- int oerrno = errno;
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
-
- yy_flush_buffer(b ,yyscanner);
-
-/* %if-c-only */
- b->yy_input_file = file;
-/* %endif */
-/* %if-c++-only */
-/* %endif */
- b->yy_fill_buffer = 1;
-
- /* If b is the current buffer, then yy_init_buffer was _probably_
- * called from yyrestart() or through yy_get_next_buffer.
- * In that case, we don't want to reset the lineno or column.
- */
- if (b != YY_CURRENT_BUFFER){
- b->yy_bs_lineno = 1;
- b->yy_bs_column = 0;
- }
-
-/* %if-c-only */
-
- b->yy_is_interactive = 0;
-
-/* %endif */
-/* %if-c++-only */
-/* %endif */
- errno = oerrno;
-}
-
-/** Discard all buffered characters. On the next scan, YY_INPUT will be called.
- * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER.
- * @param yyscanner The scanner object.
- */
-/* %if-c-only */
- void yy_flush_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner)
-/* %endif */
-/* %if-c++-only */
-/* %endif */
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
- if ( ! b )
- return;
-
- b->yy_n_chars = 0;
-
- /* We always need two end-of-buffer characters. The first causes
- * a transition to the end-of-buffer state. The second causes
- * a jam in that state.
- */
- b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR;
- b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR;
-
- b->yy_buf_pos = &b->yy_ch_buf[0];
-
- b->yy_at_bol = 1;
- b->yy_buffer_status = YY_BUFFER_NEW;
-
- if ( b == YY_CURRENT_BUFFER )
- yy_load_buffer_state(yyscanner );
-}
-
-/* %if-c-or-c++ */
-/** Pushes the new state onto the stack. The new state becomes
- * the current state. This function will allocate the stack
- * if necessary.
- * @param new_buffer The new state.
- * @param yyscanner The scanner object.
- */
-/* %if-c-only */
-void yypush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner)
-/* %endif */
-/* %if-c++-only */
-/* %endif */
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
- if (new_buffer == NULL)
- return;
-
- yyensure_buffer_stack(yyscanner);
-
- /* This block is copied from yy_switch_to_buffer. */
- if ( YY_CURRENT_BUFFER )
- {
- /* Flush out information for old buffer. */
- *yyg->yy_c_buf_p = yyg->yy_hold_char;
- YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p;
- YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
- }
-
- /* Only push if top exists. Otherwise, replace top. */
- if (YY_CURRENT_BUFFER)
- yyg->yy_buffer_stack_top++;
- YY_CURRENT_BUFFER_LVALUE = new_buffer;
-
- /* copied from yy_switch_to_buffer. */
- yy_load_buffer_state(yyscanner );
- yyg->yy_did_buffer_switch_on_eof = 1;
-}
-/* %endif */
-
-/* %if-c-or-c++ */
-/** Removes and deletes the top of the stack, if present.
- * The next element becomes the new top.
- * @param yyscanner The scanner object.
- */
-/* %if-c-only */
-void yypop_buffer_state (yyscan_t yyscanner)
-/* %endif */
-/* %if-c++-only */
-/* %endif */
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
- if (!YY_CURRENT_BUFFER)
- return;
-
- yy_delete_buffer(YY_CURRENT_BUFFER ,yyscanner);
- YY_CURRENT_BUFFER_LVALUE = NULL;
- if (yyg->yy_buffer_stack_top > 0)
- --yyg->yy_buffer_stack_top;
-
- if (YY_CURRENT_BUFFER) {
- yy_load_buffer_state(yyscanner );
- yyg->yy_did_buffer_switch_on_eof = 1;
- }
-}
-/* %endif */
-
-/* %if-c-or-c++ */
-/* Allocates the stack if it does not exist.
- * Guarantees space for at least one push.
- */
-/* %if-c-only */
-static void yyensure_buffer_stack (yyscan_t yyscanner)
-/* %endif */
-/* %if-c++-only */
-/* %endif */
-{
- int num_to_alloc;
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
-
- if (!yyg->yy_buffer_stack) {
-
- /* First allocation is just for 2 elements, since we don't know if this
- * scanner will even need a stack. We use 2 instead of 1 to avoid an
- * immediate realloc on the next call.
- */
- num_to_alloc = 1; /* After all that talk, this was set to 1 anyways... */
- yyg->yy_buffer_stack = (struct yy_buffer_state**)yyalloc
- (num_to_alloc * sizeof(struct yy_buffer_state*)
- , yyscanner);
- if ( ! yyg->yy_buffer_stack )
- YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" );
-
- memset(yyg->yy_buffer_stack, 0, num_to_alloc * sizeof(struct yy_buffer_state*));
-
- yyg->yy_buffer_stack_max = num_to_alloc;
- yyg->yy_buffer_stack_top = 0;
- return;
- }
-
- if (yyg->yy_buffer_stack_top >= (yyg->yy_buffer_stack_max) - 1){
-
- /* Increase the buffer to prepare for a possible push. */
- yy_size_t grow_size = 8 /* arbitrary grow size */;
-
- num_to_alloc = yyg->yy_buffer_stack_max + grow_size;
- yyg->yy_buffer_stack = (struct yy_buffer_state**)yyrealloc
- (yyg->yy_buffer_stack,
- num_to_alloc * sizeof(struct yy_buffer_state*)
- , yyscanner);
- if ( ! yyg->yy_buffer_stack )
- YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" );
-
- /* zero only the new slots.*/
- memset(yyg->yy_buffer_stack + yyg->yy_buffer_stack_max, 0, grow_size * sizeof(struct yy_buffer_state*));
- yyg->yy_buffer_stack_max = num_to_alloc;
- }
-}
-/* %endif */
-
-/* %if-c-only */
-/** Setup the input buffer state to scan directly from a user-specified character buffer.
- * @param base the character buffer
- * @param size the size in bytes of the character buffer
- * @param yyscanner The scanner object.
- * @return the newly allocated buffer state object.
- */
-YY_BUFFER_STATE yy_scan_buffer (char * base, yy_size_t size , yyscan_t yyscanner)
-{
- YY_BUFFER_STATE b;
-
- if ( size < 2 ||
- base[size-2] != YY_END_OF_BUFFER_CHAR ||
- base[size-1] != YY_END_OF_BUFFER_CHAR )
- /* They forgot to leave room for the EOB's. */
- return NULL;
-
- b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state ) ,yyscanner );
- if ( ! b )
- YY_FATAL_ERROR( "out of dynamic memory in yy_scan_buffer()" );
-
- b->yy_buf_size = size - 2; /* "- 2" to take care of EOB's */
- b->yy_buf_pos = b->yy_ch_buf = base;
- b->yy_is_our_buffer = 0;
- b->yy_input_file = NULL;
- b->yy_n_chars = b->yy_buf_size;
- b->yy_is_interactive = 0;
- b->yy_at_bol = 1;
- b->yy_fill_buffer = 0;
- b->yy_buffer_status = YY_BUFFER_NEW;
-
- yy_switch_to_buffer(b ,yyscanner );
-
- return b;
-}
-/* %endif */
-
-/* %if-c-only */
-/** Setup the input buffer state to scan a string. The next call to yylex() will
- * scan from a @e copy of @a str.
- * @param yystr a NUL-terminated string to scan
- * @param yyscanner The scanner object.
- * @return the newly allocated buffer state object.
- * @note If you want to scan bytes that may contain NUL values, then use
- * yy_scan_bytes() instead.
- */
-YY_BUFFER_STATE yy_scan_string (yyconst char * yystr , yyscan_t yyscanner)
-{
-
- return yy_scan_bytes(yystr,(int) strlen(yystr) ,yyscanner);
-}
-/* %endif */
-
-/* %if-c-only */
-/** Setup the input buffer state to scan the given bytes. The next call to yylex() will
- * scan from a @e copy of @a bytes.
- * @param yybytes the byte buffer to scan
- * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes.
- * @param yyscanner The scanner object.
- * @return the newly allocated buffer state object.
- */
-YY_BUFFER_STATE yy_scan_bytes (yyconst char * yybytes, int _yybytes_len , yyscan_t yyscanner)
-{
- YY_BUFFER_STATE b;
- char *buf;
- yy_size_t n;
- yy_size_t i;
-
- /* Get memory for full buffer, including space for trailing EOB's. */
- n = (yy_size_t) _yybytes_len + 2;
- buf = (char *) yyalloc(n ,yyscanner );
- if ( ! buf )
- YY_FATAL_ERROR( "out of dynamic memory in yy_scan_bytes()" );
-
- for ( i = 0; i < _yybytes_len; ++i )
- buf[i] = yybytes[i];
-
- buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR;
-
- b = yy_scan_buffer(buf,n ,yyscanner);
- if ( ! b )
- YY_FATAL_ERROR( "bad buffer in yy_scan_bytes()" );
-
- /* It's okay to grow etc. this buffer, and we should throw it
- * away when we're done.
- */
- b->yy_is_our_buffer = 1;
-
- return b;
-}
-/* %endif */
-
-#ifndef YY_EXIT_FAILURE
-#define YY_EXIT_FAILURE 2
-#endif
-
-/* %if-c-only */
-static void yynoreturn yy_fatal_error (yyconst char* msg , yyscan_t yyscanner)
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
- (void)yyg;
- (void) fprintf( stderr, "%s\n", msg );
- exit( YY_EXIT_FAILURE );
-}
-/* %endif */
-/* %if-c++-only */
-/* %endif */
-
-/* Redefine yyless() so it works in section 3 code. */
-
-#undef yyless
-#define yyless(n) \
- do \
- { \
- /* Undo effects of setting up yytext. */ \
- int yyless_macro_arg = (n); \
- YY_LESS_LINENO(yyless_macro_arg);\
- yytext[yyleng] = yyg->yy_hold_char; \
- yyg->yy_c_buf_p = yytext + yyless_macro_arg; \
- yyg->yy_hold_char = *yyg->yy_c_buf_p; \
- *yyg->yy_c_buf_p = '\0'; \
- yyleng = yyless_macro_arg; \
- } \
- while ( 0 )
-
-/* Accessor methods (get/set functions) to struct members. */
-
-/* %if-c-only */
-/* %if-reentrant */
-
-/** Get the user-defined data for this scanner.
- * @param yyscanner The scanner object.
- */
-YY_EXTRA_TYPE yyget_extra (yyscan_t yyscanner)
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
- return yyextra;
-}
-
-/* %endif */
-
-/** Get the current line number.
- * @param yyscanner The scanner object.
- */
-int yyget_lineno (yyscan_t yyscanner)
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
-
- if (! YY_CURRENT_BUFFER)
- return 0;
-
- return yylineno;
-}
-
-/** Get the current column number.
- * @param yyscanner The scanner object.
- */
-int yyget_column (yyscan_t yyscanner)
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
-
- if (! YY_CURRENT_BUFFER)
- return 0;
-
- return yycolumn;
-}
-
-/** Get the input stream.
- * @param yyscanner The scanner object.
- */
-FILE *yyget_in (yyscan_t yyscanner)
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
- return yyin;
-}
-
-/** Get the output stream.
- * @param yyscanner The scanner object.
- */
-FILE *yyget_out (yyscan_t yyscanner)
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
- return yyout;
-}
-
-/** Get the length of the current token.
- * @param yyscanner The scanner object.
- */
-int yyget_leng (yyscan_t yyscanner)
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
- return yyleng;
-}
-
-/** Get the current token.
- * @param yyscanner The scanner object.
- */
-
-char *yyget_text (yyscan_t yyscanner)
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
- return yytext;
-}
-
-/* %if-reentrant */
-
-/** Set the user-defined data. This data is never touched by the scanner.
- * @param user_defined The data to be associated with this scanner.
- * @param yyscanner The scanner object.
- */
-void yyset_extra (YY_EXTRA_TYPE user_defined , yyscan_t yyscanner)
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
- yyextra = user_defined ;
-}
-
-/* %endif */
-
-/** Set the current line number.
- * @param _line_number line number
- * @param yyscanner The scanner object.
- */
-void yyset_lineno (int _line_number , yyscan_t yyscanner)
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
-
- /* lineno is only valid if an input buffer exists. */
- if (! YY_CURRENT_BUFFER )
- YY_FATAL_ERROR( "yyset_lineno called with no buffer" );
-
- yylineno = _line_number;
-}
-
-/** Set the current column.
- * @param _column_no column number
- * @param yyscanner The scanner object.
- */
-void yyset_column (int _column_no , yyscan_t yyscanner)
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
-
- /* column is only valid if an input buffer exists. */
- if (! YY_CURRENT_BUFFER )
- YY_FATAL_ERROR( "yyset_column called with no buffer" );
-
- yycolumn = _column_no;
-}
-
-/** Set the input stream. This does not discard the current
- * input buffer.
- * @param _in_str A readable stream.
- * @param yyscanner The scanner object.
- * @see yy_switch_to_buffer
- */
-void yyset_in (FILE * _in_str , yyscan_t yyscanner)
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
- yyin = _in_str ;
-}
-
-void yyset_out (FILE * _out_str , yyscan_t yyscanner)
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
- yyout = _out_str ;
-}
-
-int yyget_debug (yyscan_t yyscanner)
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
- return yy_flex_debug;
-}
-
-void yyset_debug (int _bdebug , yyscan_t yyscanner)
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
- yy_flex_debug = _bdebug ;
-}
-
-/* %endif */
-
-/* %if-reentrant */
-/* Accessor methods for yylval and yylloc */
-
-/* %if-bison-bridge */
-
-YYSTYPE * yyget_lval (yyscan_t yyscanner)
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
- return yylval;
-}
-
-void yyset_lval (YYSTYPE * yylval_param , yyscan_t yyscanner)
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
- yylval = yylval_param;
-}
-
-/* %endif */
-
-/* User-visible API */
-
-/* yylex_init is special because it creates the scanner itself, so it is
- * the ONLY reentrant function that doesn't take the scanner as the last argument.
- * That's why we explicitly handle the declaration, instead of using our macros.
- */
-
-int yylex_init(yyscan_t* ptr_yy_globals)
-
-{
- if (ptr_yy_globals == NULL){
- errno = EINVAL;
- return 1;
- }
-
- *ptr_yy_globals = (yyscan_t) yyalloc ( sizeof( struct yyguts_t ), NULL );
-
- if (*ptr_yy_globals == NULL){
- errno = ENOMEM;
- return 1;
- }
-
- /* By setting to 0xAA, we expose bugs in yy_init_globals. Leave at 0x00 for releases. */
- memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t));
-
- return yy_init_globals ( *ptr_yy_globals );
-}
-
-/* yylex_init_extra has the same functionality as yylex_init, but follows the
- * convention of taking the scanner as the last argument. Note however, that
- * this is a *pointer* to a scanner, as it will be allocated by this call (and
- * is the reason, too, why this function also must handle its own declaration).
- * The user defined value in the first argument will be available to yyalloc in
- * the yyextra field.
- */
-
-int yylex_init_extra(YY_EXTRA_TYPE yy_user_defined,yyscan_t* ptr_yy_globals )
-
-{
- struct yyguts_t dummy_yyguts;
-
- yyset_extra (yy_user_defined, &dummy_yyguts);
-
- if (ptr_yy_globals == NULL){
- errno = EINVAL;
- return 1;
- }
-
- *ptr_yy_globals = (yyscan_t) yyalloc ( sizeof( struct yyguts_t ), &dummy_yyguts );
-
- if (*ptr_yy_globals == NULL){
- errno = ENOMEM;
- return 1;
- }
-
- /* By setting to 0xAA, we expose bugs in
- yy_init_globals. Leave at 0x00 for releases. */
- memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t));
-
- yyset_extra (yy_user_defined, *ptr_yy_globals);
-
- return yy_init_globals ( *ptr_yy_globals );
-}
-
-/* %endif if-c-only */
-
-/* %if-c-only */
-static int yy_init_globals (yyscan_t yyscanner)
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
- /* Initialization is the same as for the non-reentrant scanner.
- * This function is called from yylex_destroy(), so don't allocate here.
- */
-
- yyg->yy_buffer_stack = NULL;
- yyg->yy_buffer_stack_top = 0;
- yyg->yy_buffer_stack_max = 0;
- yyg->yy_c_buf_p = NULL;
- yyg->yy_init = 0;
- yyg->yy_start = 0;
-
- yyg->yy_start_stack_ptr = 0;
- yyg->yy_start_stack_depth = 0;
- yyg->yy_start_stack = NULL;
-
-/* Defined in main.c */
-#ifdef YY_STDINIT
- yyin = stdin;
- yyout = stdout;
-#else
- yyin = NULL;
- yyout = NULL;
-#endif
-
- /* For future reference: Set errno on error, since we are called by
- * yylex_init()
- */
- return 0;
-}
-/* %endif */
-
-/* %if-c-only SNIP! this currently causes conflicts with the c++ scanner */
-/* yylex_destroy is for both reentrant and non-reentrant scanners. */
-int yylex_destroy (yyscan_t yyscanner)
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
-
- /* Pop the buffer stack, destroying each element. */
- while(YY_CURRENT_BUFFER){
- yy_delete_buffer(YY_CURRENT_BUFFER ,yyscanner );
- YY_CURRENT_BUFFER_LVALUE = NULL;
- yypop_buffer_state(yyscanner);
- }
-
- /* Destroy the stack itself. */
- yyfree(yyg->yy_buffer_stack ,yyscanner);
- yyg->yy_buffer_stack = NULL;
-
- /* Destroy the start condition stack. */
- yyfree(yyg->yy_start_stack ,yyscanner );
- yyg->yy_start_stack = NULL;
-
- /* Reset the globals. This is important in a non-reentrant scanner so the next time
- * yylex() is called, initialization will occur. */
- yy_init_globals( yyscanner);
-
-/* %if-reentrant */
- /* Destroy the main struct (reentrant only). */
- yyfree ( yyscanner , yyscanner );
- yyscanner = NULL;
-/* %endif */
- return 0;
-}
-/* %endif */
-
-/*
- * Internal utility routines.
- */
-
-#ifndef yytext_ptr
-static void yy_flex_strncpy (char* s1, yyconst char * s2, int n , yyscan_t yyscanner)
-{
- struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
- (void)yyg;
-
- int i;
- for ( i = 0; i < n; ++i )
- s1[i] = s2[i];
-}
-#endif
-
-#ifdef YY_NEED_STRLEN
-static int yy_flex_strlen (yyconst char * s , yyscan_t yyscanner)
-{
- int n;
- for ( n = 0; s[n]; ++n )
- ;
-
- return n;
-}
-#endif
-
-/* %if-tables-serialization definitions */
-/* %define-yytables The name for this specific scanner's tables. */
-#define YYTABLES_NAME "yytables"
-/* %endif */
-
-/* %ok-for-header */
-
-#line 1091 "htmllex.l"
-
-
-
-/* initialize the scanner */
-int htmllexInit (void** scanner, UserData* data) {
- int res;
- res = yylex_init(scanner);
- if (res) {
- return res;
- }
- yyset_extra(data,*scanner);
- return 0;
-}
-
-/* set debug level; a level > 0 enables debugging */
-int htmllexDebug (void** scanner, int debug) {
- int old = yyget_debug(*scanner);
- yyset_debug(debug,*scanner);
- return old;
-}
-
-/* prepare scanner for calls to yylex() */
-int htmllexStart (void* scanner, UserData* data, const char* s, int slen) {
- /* append s to data buffer and scan those bytes.
- As Flex does not distinguish between NUL and EOF characters,
- replace NUL with ' '. */
- size_t len = strlen(data->buf);
- int i;
- RESIZE_BUF(data->buf, len + slen + 1);
- for (i=0; i < slen; i++) {
- data->buf[len+i] = (s[i]=='\0' ? ' ' : s[i]);
- }
- data->buf[len+slen] = '\0';
- if (yyget_debug(scanner)) {
- fprintf(stderr, "SCANBUF %d `%s'\n", data->bufpos, data->buf);
- }
- if (len > data->bufpos) {
- int rewind = len - data->bufpos;
- if (yyget_debug(scanner)) {
- fprintf(stderr, "REWIND %d\n", rewind);
- }
- slen += rewind;
- len -= rewind;
- }
- /* reset userdata */
- data->bufpos = len;
- data->exc_type = NULL;
- data->exc_val = NULL;
- data->exc_tb = NULL;
- if (yyget_debug(scanner)) {
- fprintf(stderr, "SCANNING `%s'\n", data->buf + len);
- }
- data->lexbuf = yy_scan_bytes(data->buf + len,slen,scanner);
- return 0;
-}
-
-/* delete scanned buffer data */
-int htmllexStop (void* scanner, UserData* data) {
- yy_delete_buffer(data->lexbuf,scanner);
- if (data->nextpos > 0) {
- size_t len = strlen(data->buf);
- int i, j;
- for (i=data->nextpos, j=0; ibuf[j] = data->buf[i];
- }
- data->buf[j] = '\0';
- /* Can return T_ERROR, which is guaranteed to be non-zero. */
- RESIZE_BUF(data->buf, len-data->nextpos + 1);
- data->bufpos -= data->nextpos;
- data->nextpos = 0;
- }
- return 0;
-}
-
-/* destroy scanner when not needed any more */
-int htmllexDestroy (void* scanner) {
- return yylex_destroy(scanner);
-}
-
diff --git a/linkcheck/HtmlParser/htmllex.l b/linkcheck/HtmlParser/htmllex.l
deleted file mode 100644
index 1676a60d..00000000
--- a/linkcheck/HtmlParser/htmllex.l
+++ /dev/null
@@ -1,1167 +0,0 @@
-/* Copyright (C) 2000-2014 Bastian Kleineidam
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-*/
-
-/* Lexical analyzer for finding recognizable tokens in (probably
- * bad formatted) HTML streams.
- * Unrecognizable character data is passed on as a TEXT token.
- *
- * Note that you cannot rely on the "longest match" preference of
- * flex here since input data might be truncated at any given position.
- * This explains some of the more complicated lookahead rules below.
- */
-
-%{
-#include "htmlsax.h"
-#include "s_util.h"
-#include
-#include
-
-
-/* token type */
-#define YYSTYPE PyObject*
-/* type of user-specified data */
-#define YY_EXTRA_TYPE UserData*
-
-/* Returning T_ERROR is the standard error-out reaction for this lexer. */
-/* Return T_ERROR if argument is NULL. */
-#define CHECK_NULL(a) \
- if ((a) == NULL) return T_ERROR
-
-/* Return T_ERROR if argument is -1 (minus one). */
-#define CHECK_MINUSONE(a) \
- if ((a) == -1) return T_ERROR
-
-/* resize buffer b, returning T_ERROR on error */
-#define RESIZE_BUF(b, n) \
- CHECK_NULL(PyMem_Resize((b), char, (n))); \
- (b)[(n)-1] = '\0'
-
-/* make python unicode string from tmp_buf and assign it to a */
-#define PYSTRING_TMP_UNICODE(a) { \
- PyObject* pencoding; \
- char* encoding; \
- CHECK_NULL(pencoding = PyObject_GetAttrString(yyextra->parser, "encoding")); \
- encoding = PyBytes_AsString(pencoding); \
- if (encoding==NULL) { Py_DECREF(pencoding); return T_ERROR; } \
- (a) = PyUnicode_Decode(yyextra->tmp_buf, \
- (Py_ssize_t)strlen(yyextra->tmp_buf), \
- encoding, "ignore"); \
- Py_DECREF(pencoding); \
- CHECK_NULL(a); \
-}
-
-#define PYSTRING_TMP_ASCII(a) \
- CHECK_NULL((a) = PyUnicode_Decode(yyextra->tmp_buf, \
- (Py_ssize_t)strlen(yyextra->tmp_buf), "ascii", "ignore"))
-
-/* set return value from tmp_buf */
-#define SETLVAL_UNICODE { \
- PyObject* s; \
- PYSTRING_TMP_UNICODE(s); \
- RESIZE_BUF(yyextra->tmp_buf, 1); \
- *yylval = s; \
- }
-
-/* set return value from tmp_buf */
-#define SETLVAL_ASCII { \
- PyObject* s; \
- PYSTRING_TMP_ASCII(s); \
- RESIZE_BUF(yyextra->tmp_buf, 1); \
- *yylval = s; \
- }
-
-/* append yytext to tmp_buf */
-#define APPEND_TO_TMP(n) { \
- size_t len = strlen(yyextra->tmp_buf) + (n) + 1; \
- RESIZE_BUF(yyextra->tmp_buf, len); \
- strlcat(yyextra->tmp_buf, yytext, len); \
- }
-
-/* lowercase the tmp_buf */
-#define LOWER_TMP { \
- char* p = yyextra->tmp_buf; \
- while (*p) { *p = tolower(*p); p++; } \
- }
-
-/* check for JavaScript or CSS tags; must be before SET_ATTR_LVAL */
-#define SCRIPT_CHECK { \
- PyObject* tagname; \
- CHECK_NULL(tagname = PyUnicode_AsEncodedString(yyextra->tmp_tag, "ascii", "ignore")); \
- if (strcmp("script", PyBytes_AsString(tagname))==0) \
- BEGIN(S_SCRIPT); \
- else if (strcmp("style", PyBytes_AsString(tagname))==0) \
- BEGIN(S_STYLE); \
- else \
- BEGIN(INITIAL); \
- Py_DECREF(tagname); \
- }
-
-/* set return value from tag with attributes */
-#define SET_ATTR_LVAL \
- if (yyextra->tmp_tag==NULL || yyextra->tmp_attrs==NULL) { \
- PyErr_SetString(PyExc_TypeError, "tmp_tag or tmp_attrs is NULL"); \
- return T_ERROR; \
- } \
- CHECK_NULL(*yylval = Py_BuildValue("(OO)", yyextra->tmp_tag, yyextra->tmp_attrs)); \
- yyextra->tmp_tag = yyextra->tmp_attrs = NULL
-
-/* store collected name as attribute in dictionary
- * tmp_attrname and tmp_attrval must be NULL
- */
-#define FLUSH_ATTRS \
- if (strlen(yyextra->tmp_buf) > 0) { \
- PYSTRING_TMP_UNICODE(yyextra->tmp_attrname); \
- RESIZE_BUF(yyextra->tmp_buf, 1); \
- CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname, Py_None)); \
- Py_CLEAR(yyextra->tmp_attrname); \
- }
-
-/* update the buffer and scanner positions */
-#define UPDATE_BUFPOS yyextra->bufpos += yyleng; yyextra->pos += yyleng
-
-/* update the column position; use this *only* in rules that cannot match
- the newline char '\n'!
- */
-#define UPDATE_COLUMN UPDATE_BUFPOS; yyextra->column += yyleng
-
-/* update the line and column position; use this in rules that can match the
- newline char '\n'.
- */
-#define UPDATE_LINE UPDATE_BUFPOS; { \
- int i; \
- for (i=0; ilineno); \
- yyextra->column = 1; \
- } \
- else ++(yyextra->column); \
- } \
-}
-
-/* return a token, setting the nextpos value back to the bufpos */
-#define RETURN(tok) yyextra->nextpos = yyextra->bufpos; return tok
-
-/* use Pythons memory management */
-void* yyalloc (yy_size_t bytes, void* yyscanner) {
- return PyMem_Malloc((size_t)bytes);
-}
-void* yyrealloc (void* ptr, yy_size_t bytes, void* yyscanner) {
- return PyMem_Realloc(ptr, (size_t)bytes);
-}
-void yyfree (void* ptr, void* yyscanner) {
- PyMem_Free(ptr);
-}
-
-/* include bison-generated token definitions */
-#include "htmlparse.h"
-%}
-
-/* use our own memory management functions (see above) */
-%option noyyalloc noyyrealloc noyyfree
-/* handle 8bit characters */
-%option 8bit
-/* define output file */
-%option outfile="htmllex.c"
-/* optimize for speed.. */
-%option align full
-/* ..but still construct equivalence classes */
-%option ecs
-/* add debugging ability */
-%option debug
-/* don't use unneeded functions */
-%option nounput nomain noyywrap noyymore noreject
-/* make it reentrant and bison compatible */
-%option bison-bridge reentrant never-interactive
-/* print warnings on compiling */
-%option warn
-
-/* scanner states */
-%x S_PI
-%x S_COMMENT
-%x S_COMMENT1
-%x S_COMMENT2
-%x S_DOCTYPE
-%x S_CDATA
-%x S_TAGSTART
-%x S_TAGEND
-%x S_TAGEND2
-%x S_SCRIPT
-%x S_SCRIPT_APOS
-%x S_SCRIPT_APOS_ESC
-%x S_SCRIPT_STRING
-%x S_SCRIPT_STRING_ESC
-%x S_SCRIPT_COMMENT
-%x S_SCRIPT_MCOMMENT
-%x S_STYLE
-%x S_ATTR1
-%x S_ATTR2
-%x S_ATTR3
-%x S_ATTR4
-%x S_ATTR5
-%x S_APOSSTRING
-%x S_APOSSTRING_ESC
-%x S_STRING
-%x S_STRING_ESC
-
-/* regular expression definitions used below */
-RX_WHITE_SPACE [\n\r\ \t\b\012]
-RX_EQUAL =
-RX_NAME [a-zA-Z]([-a-zA-Z0-9_])*
-RX_DATA [-a-zA-Z0-9_:]+
-
-%%
-
- /*********************** EOF ************************/
-<> {
- /* hit end-of-file, wait for more data */
- return T_WAIT;
-}
-
- /*********************** COMMENT ************************/
- {
- UPDATE_COLUMN;
- SETLVAL_UNICODE;
- RETURN(T_COMMENT);
-}
-
- /* Note: also accept " {
- UPDATE_COLUMN;
- SETLVAL_UNICODE;
- BEGIN(INITIAL);
- RETURN(T_COMMENT);
-}
-
---[ ]+> {
- UPDATE_COLUMN;
- SETLVAL_UNICODE;
- BEGIN(INITIAL);
- RETURN(T_COMMENT);
-}
-
--> {
- UPDATE_COLUMN;
- SETLVAL_UNICODE;
- BEGIN(INITIAL);
- RETURN(T_COMMENT);
-}
-
--/-- {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
-}
-
--/[^-] {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
-}
-
---/[^- >] {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
-}
-
---[ ]+/[^ >] {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
-}
-
-[^-]+ {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-.|\n {
- return T_WAIT;
-}
-
- /* Note: www.nba.com had some comment */
-> {
- UPDATE_COLUMN;
- SETLVAL_UNICODE;
- BEGIN(INITIAL);
- RETURN(T_COMMENT);
-}
-
-[^>] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-
- /*********************** DOCTYPE ************************/
-> {
- UPDATE_COLUMN;
- SETLVAL_UNICODE;
- BEGIN(INITIAL);
- RETURN(T_DOCTYPE);
-}
-
-[^>]+ {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
- /*********************** CDATA ************************/
-\]\]> {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng-3);
- SETLVAL_UNICODE;
- BEGIN(INITIAL);
- RETURN(T_CDATA);
-}
-
-[^\]]+ {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-\][^\]] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-\]\][^>] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-.|\n {
- return T_WAIT;
-}
-
- /*********************** PI ************************/
-<\? {
- UPDATE_COLUMN;
- BEGIN(S_PI);
-}
-
-[^?>]+ {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-\?+> {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng-2);
- SETLVAL_UNICODE;
- BEGIN(INITIAL);
- RETURN(T_PI);
-}
-
-\?+[^?>]+ {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-> {
- UPDATE_COLUMN;
- SETLVAL_UNICODE;
- BEGIN(INITIAL);
- RETURN(T_PI);
-}
-
-.|\n {
- return T_WAIT;
-}
-
-
- /*********************** TAGSTART ************************/
-<{RX_WHITE_SPACE}*/[A-Za-z0-9] {
- UPDATE_LINE;
- CHECK_NULL(yyextra->tmp_attrs = PyObject_CallObject(yyextra->list_dict, NULL));
- BEGIN(S_TAGSTART);
-}
-
-[^ \t\r\n\b\012/<>]+ {
- /* actually accept a lot of tag chars, which may be illegal,
- but we dont care, it's the browsers job */
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
-}
-
-{RX_WHITE_SPACE}+ {
- UPDATE_LINE;
- LOWER_TMP;
- PYSTRING_TMP_ASCII(yyextra->tmp_tag);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- BEGIN(S_ATTR1);
-}
-
-\/> {
- UPDATE_COLUMN;
- BEGIN(INITIAL);
- if (!strlen(yyextra->tmp_buf)) {
- /* the tag name was empty, assume a stray ">" */
- RESIZE_BUF(yyextra->tmp_buf, 4);
- strcpy(yyextra->tmp_buf, ">");
- yyextra->tmp_attrs = NULL;
- SETLVAL_UNICODE;
- RETURN(T_TEXT);
- }
- LOWER_TMP;
- PYSTRING_TMP_ASCII(yyextra->tmp_tag);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- SET_ATTR_LVAL;
- RETURN(T_ELEMENT_START_END);
-}
-
-> {
- UPDATE_COLUMN;
- BEGIN(INITIAL);
- if (!strlen(yyextra->tmp_buf)) {
- /* the tag name was empty, assume a stray "<>" */
- RESIZE_BUF(yyextra->tmp_buf, 3);
- strcpy(yyextra->tmp_buf, "<>");
- yyextra->tmp_attrs = NULL;
- SETLVAL_UNICODE;
- RETURN(T_TEXT);
- }
- LOWER_TMP;
- PYSTRING_TMP_ASCII(yyextra->tmp_tag);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- SCRIPT_CHECK;
- SET_ATTR_LVAL;
- RETURN(T_ELEMENT_START);
-}
-
-<\/ {
- /* Abort parsing this start tag and begin an endtag. Assume
- the last "<" was a stray unquoted character. */
- char* tmp = NULL;
- UPDATE_COLUMN;
- BEGIN(S_TAGEND);
- /* Add missing "<" at beginning of buffer. */
- RESIZE_BUF(tmp, strlen(yyextra->tmp_buf)+2);
- tmp[0] = '<';
- tmp[1] = '\0';
- strlcat(tmp, yyextra->tmp_buf, sizeof(tmp));
- RESIZE_BUF(yyextra->tmp_buf, strlen(tmp)+1);
- yyextra->tmp_buf[0] = '\0';
- strlcat(yyextra->tmp_buf, tmp, sizeof(yyextra->tmp_buf));
- free(tmp);
- SETLVAL_UNICODE;
- RETURN(T_TEXT);
-}
-
-[^/] {
- /* Abort parsing this start tag and begin a new one. Assume
- the last "<" was a stray unquoted character. */
- char* tmp = NULL;
- UPDATE_COLUMN;
- /* Add missing "<" at beginning of buffer. */
- RESIZE_BUF(tmp, strlen(yyextra->tmp_buf)+2);
- tmp[0] = '<';
- tmp[1] = '\0';
- strlcat(tmp, yyextra->tmp_buf, sizeof(tmp));
- RESIZE_BUF(yyextra->tmp_buf, strlen(tmp)+1);
- yyextra->tmp_buf[0] = '\0';
- strlcat(yyextra->tmp_buf, tmp, sizeof(yyextra->tmp_buf));
- free(tmp);
- SETLVAL_UNICODE;
- RETURN(T_TEXT);
-}
-
-.|\n {
- return T_WAIT;
-}
-
- /*********************** SCRIPT ************************/
-<\/{RX_WHITE_SPACE}*[Ss][Cc][Rr][Ii][Pp][Tt]{RX_WHITE_SPACE}*> {
- UPDATE_LINE;
- SETLVAL_UNICODE;
- BEGIN(INITIAL);
- RETURN(T_SCRIPT);
-}
-
-[^/'"<]+ {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-\' {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_SCRIPT_APOS);
-}
-
-\" {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_SCRIPT_STRING);
-}
-
-\/\/ {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_SCRIPT_COMMENT);
-}
-
-\/\* {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_SCRIPT_MCOMMENT);
-}
-
-\/[^*/] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
- /* ensure any prefix of is matched, but not itself */
-[^/] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-<\/{RX_WHITE_SPACE}*/[^Ss\n\r\ \t\b\012] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-<\/{RX_WHITE_SPACE}*[Ss]/[^Cc] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-<\/{RX_WHITE_SPACE}*[Ss][Cc]/[^Rr] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-<\/{RX_WHITE_SPACE}*[Ss][Cc][Rr]/[^Ii] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-<\/{RX_WHITE_SPACE}*[Ss][Cc][Rr][Ii]/[^Pp] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-<\/{RX_WHITE_SPACE}*[Ss][Cc][Rr][Ii][Pp]/[^Tt] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-<\/{RX_WHITE_SPACE}*[Ss][Cc][Rr][Ii][Pp][Tt]{RX_WHITE_SPACE}*/[^>\n\r\ \t\b\012] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-.|\n {
- return T_WAIT;
-}
-
-\\ {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_SCRIPT_APOS_ESC);
-}
-
-[^\\']+ {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-\' {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_SCRIPT);
-}
-
-.|\n {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_SCRIPT_APOS);
-}
-
-\\ {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_SCRIPT_STRING_ESC);
-}
-
-[^\\"]+ {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-\" {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_SCRIPT);
-}
-
-.|\n {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_SCRIPT_STRING);
-}
-
-[^\r\n<]+ {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-[\r\n] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_SCRIPT);
-}
-
-.|\n {
- return T_WAIT;
-}
-
-[^*]+|\* {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-\*\/ {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_SCRIPT);
-}
-
- /*********************** STYLE ************************/
-<\/{RX_WHITE_SPACE}*[Ss][Tt][Yy][Ll][Ee]{RX_WHITE_SPACE}*> {
- UPDATE_LINE;
- SETLVAL_UNICODE;
- BEGIN(INITIAL);
- RETURN(T_STYLE);
-}
-
-[^<]+ {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
- /* this is so shitty */
-[^/] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-<\/{RX_WHITE_SPACE}*/[^Ss\n\r\ \t\b\012] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-<\/{RX_WHITE_SPACE}*[Ss]/[^Tt] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-<\/{RX_WHITE_SPACE}*[Ss][Tt]/[^Yy] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-<\/{RX_WHITE_SPACE}*[Ss][Tt][Yy]/[^Ll] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-<\/{RX_WHITE_SPACE}*[Ss][Tt][Yy][Ll]/[^Ee] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-<\/{RX_WHITE_SPACE}*[Ss][Tt][Yy][Ll][Ee]{RX_WHITE_SPACE}*/[^>\n\r\ \t\b\012] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-.|\n {
- return T_WAIT;
-}
-
- /*********************** ATTRS ************************/
-{RX_NAME} {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_ATTR2);
-}
-
-\/> {
- UPDATE_COLUMN;
- FLUSH_ATTRS;
- BEGIN(INITIAL);
- SET_ATTR_LVAL;
- RETURN(T_ELEMENT_START_END);
-}
-
-\/[^>] {
- UPDATE_LINE;
-}
-
-\/ {
- return T_WAIT;
-}
-
-> {
- UPDATE_COLUMN;
- FLUSH_ATTRS;
- SCRIPT_CHECK;
- SET_ATTR_LVAL;
- RETURN(T_ELEMENT_START);
-}
-
-{RX_DATA} {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
-}
-
-\\\r?\n {
- /* Line continuations */
- UPDATE_LINE;
-}
-
-\\\r?[^\n] {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
-}
-
-\\\r? {
- return T_WAIT;
-}
-
-{RX_WHITE_SPACE}+ {
- UPDATE_LINE;
- BEGIN(S_ATTR3);
-}
-
-{RX_WHITE_SPACE}*{RX_EQUAL}{RX_WHITE_SPACE}* {
- UPDATE_LINE;
- LOWER_TMP;
- PYSTRING_TMP_UNICODE(yyextra->tmp_attrname);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- BEGIN(S_ATTR4);
-}
-
-{RX_NAME} {
- UPDATE_COLUMN;
- LOWER_TMP;
- PYSTRING_TMP_UNICODE(yyextra->tmp_attrname);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- if (yyextra->tmp_attrval != NULL) return T_ERROR;
- CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname, Py_None));
- Py_CLEAR(yyextra->tmp_attrname);
- APPEND_TO_TMP(yyleng);
- BEGIN(S_ATTR2);
-}
-
-.|\n {
- /* this also skips whitespace! */
- UPDATE_LINE;
-}
-
-\\\" {
- /* backslash escapes seen at freecode.com */
- UPDATE_COLUMN;
- BEGIN(S_STRING);
-}
-
-\" {
- UPDATE_COLUMN;
- BEGIN(S_STRING);
-}
-
-\' {
- UPDATE_COLUMN;
- BEGIN(S_APOSSTRING);
-}
-
-[^\012 \t\b\r\n>\'\"]+ {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_ATTR5);
-}
-
-> {
- UPDATE_COLUMN;
- PYSTRING_TMP_UNICODE(yyextra->tmp_attrval);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
- "O", yyextra->tmp_attrval));
- CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
- yyextra->tmp_attrval));
- Py_CLEAR(yyextra->tmp_attrname);
- Py_CLEAR(yyextra->tmp_attrval);
- SCRIPT_CHECK;
- SET_ATTR_LVAL;
- RETURN(T_ELEMENT_START);
-}
-
-{RX_WHITE_SPACE}+ {
- UPDATE_LINE;
-}
-
-[^\012 \t\b\r\n>\"]+ {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
-}
-
-> {
- UPDATE_COLUMN;
- PYSTRING_TMP_UNICODE(yyextra->tmp_attrval);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
- "O", yyextra->tmp_attrval));
- CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
- yyextra->tmp_attrval));
- Py_CLEAR(yyextra->tmp_attrname);
- Py_CLEAR(yyextra->tmp_attrval);
- SCRIPT_CHECK;
- SET_ATTR_LVAL;
- RETURN(T_ELEMENT_START);
-}
-
-\/> {
- UPDATE_COLUMN;
- PYSTRING_TMP_UNICODE(yyextra->tmp_attrval);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
- "O", yyextra->tmp_attrval));
- CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
- yyextra->tmp_attrval));
- Py_CLEAR(yyextra->tmp_attrname);
- Py_CLEAR(yyextra->tmp_attrval);
- BEGIN(INITIAL);
- SET_ATTR_LVAL;
- RETURN(T_ELEMENT_START_END);
-}
-
-[\"] {
- UPDATE_COLUMN;
- PYSTRING_TMP_UNICODE(yyextra->tmp_attrval);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
- "O", yyextra->tmp_attrval));
- CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
- yyextra->tmp_attrval));
- Py_CLEAR(yyextra->tmp_attrname);
- Py_CLEAR(yyextra->tmp_attrval);
- BEGIN(S_ATTR1);
-}
-
-{RX_WHITE_SPACE}+ {
- UPDATE_LINE;
- PYSTRING_TMP_UNICODE(yyextra->tmp_attrval);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
- "O", yyextra->tmp_attrval));
- CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
- yyextra->tmp_attrval));
- Py_CLEAR(yyextra->tmp_attrname);
- Py_CLEAR(yyextra->tmp_attrval);
- BEGIN(S_ATTR1);
-}
-
-\\/\r?[^\n] {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_APOSSTRING_ESC);
-}
-
-\\ {
- return T_WAIT;
-}
-
-\' {
- UPDATE_COLUMN;
- PYSTRING_TMP_UNICODE(yyextra->tmp_attrval);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
- "O", yyextra->tmp_attrval));
- CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
- yyextra->tmp_attrval));
- Py_CLEAR(yyextra->tmp_attrname);
- Py_CLEAR(yyextra->tmp_attrval);
- BEGIN(S_ATTR1);
-}
-
-[^\\']+ {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-
-.|\n {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_APOSSTRING);
-}
-
-\\?\" {
- UPDATE_COLUMN;
- PYSTRING_TMP_UNICODE(yyextra->tmp_attrval);
- RESIZE_BUF(yyextra->tmp_buf, 1);
- CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
- "O", yyextra->tmp_attrval));
- CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
- yyextra->tmp_attrval));
- Py_CLEAR(yyextra->tmp_attrname);
- Py_CLEAR(yyextra->tmp_attrval);
- BEGIN(S_ATTR1);
-}
-
-\\/\r?[^\n] {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_STRING_ESC);
-}
-
-\\\r?\n {
- UPDATE_LINE;
-}
-
-\\ {
- return T_WAIT;
-}
-
-[^\\"]+ {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
-}
-
-.|\n {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
- BEGIN(S_STRING);
-}
-
-
- /*********************** TAGEND ************************/
-<{RX_WHITE_SPACE}*\/{RX_WHITE_SPACE}*/[A-Za-z] {
- UPDATE_LINE;
- BEGIN(S_TAGEND);
-}
-
-[^<>\r\n \t\b\012]+ {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
-}
-
-{RX_WHITE_SPACE}*> {
- UPDATE_LINE;
- LOWER_TMP;
- SETLVAL_ASCII;
- BEGIN(INITIAL);
- RETURN(T_ELEMENT_END);
-}
-
-<{RX_WHITE_SPACE}*\/{RX_WHITE_SPACE}*/[A-Za-z] {
- UPDATE_LINE;
- LOWER_TMP;
- SETLVAL_ASCII;
- BEGIN(S_TAGEND);
- RETURN(T_ELEMENT_END);
-}
-
-<{RX_WHITE_SPACE}*/[A-Za-z] {
- UPDATE_LINE;
- LOWER_TMP;
- SETLVAL_ASCII;
- CHECK_NULL(yyextra->tmp_attrs = PyObject_CallObject(yyextra->list_dict, NULL));
- BEGIN(S_TAGSTART);
- RETURN(T_ELEMENT_END);
-}
-
-{RX_WHITE_SPACE}+ {
- UPDATE_LINE;
- /* ignore any trailing garbage of this end tag */
- BEGIN(S_TAGEND2);
-}
-
-.|\n {
- return T_WAIT;
-}
-
-> {
- UPDATE_COLUMN;
- LOWER_TMP;
- SETLVAL_ASCII;
- BEGIN(INITIAL);
- RETURN(T_ELEMENT_END);
-}
-
-[^<>]+ {
- UPDATE_LINE;
-}
-
-<{RX_WHITE_SPACE}*\/{RX_WHITE_SPACE}*/[A-Za-z] {
- UPDATE_LINE;
- LOWER_TMP;
- SETLVAL_ASCII;
- BEGIN(S_TAGEND);
- RETURN(T_ELEMENT_END);
-}
-
-<{RX_WHITE_SPACE}*/[A-Za-z] {
- UPDATE_LINE;
- LOWER_TMP;
- SETLVAL_ASCII;
- CHECK_NULL(yyextra->tmp_attrs = PyObject_CallObject(yyextra->list_dict, NULL));
- BEGIN(S_TAGSTART);
- RETURN(T_ELEMENT_END);
-}
-
-.|\n {
- return T_WAIT;
-}
- /*********************** TEXT ************************/
-[^<]+ {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
- SETLVAL_UNICODE;
- RETURN(T_TEXT);
-}
-
-<[^\012 \t\b\r\nA-Za-z!?/] {
- UPDATE_COLUMN;
- APPEND_TO_TMP(yyleng);
- SETLVAL_UNICODE;
- RETURN(T_TEXT);
-}
-
-<{RX_WHITE_SPACE}+[^A-Za-z/] {
- UPDATE_LINE;
- APPEND_TO_TMP(yyleng);
- SETLVAL_UNICODE;
- RETURN(T_TEXT);
-}
-.|\n {
- return T_WAIT;
-}
-
-%%
-
-/* initialize the scanner */
-int htmllexInit (void** scanner, UserData* data) {
- int res;
- res = yylex_init(scanner);
- if (res) {
- return res;
- }
- yyset_extra(data, *scanner);
- return 0;
-}
-
-/* set debug level; a level > 0 enables debugging */
-int htmllexDebug (void** scanner, int debug) {
- int old = yyget_debug(*scanner);
- yyset_debug(debug, *scanner);
- return old;
-}
-
-/* prepare scanner for calls to yylex() */
-int htmllexStart (void* scanner, UserData* data, const char* s, int slen) {
- /* append s to data buffer and scan those bytes.
- As Flex does not distinguish between NUL and EOF characters,
- replace NUL with ' '. */
- size_t len = strlen(data->buf);
- int i;
- RESIZE_BUF(data->buf, len + slen + 1);
- for (i=0; i < slen; i++) {
- data->buf[len+i] = (s[i]=='\0' ? ' ' : s[i]);
- }
- data->buf[len+slen] = '\0';
- if (yyget_debug(scanner)) {
- fprintf(stderr, "SCANBUF %d `%s'\n", data->bufpos, data->buf);
- }
- if (len > data->bufpos) {
- int rewind = len - data->bufpos;
- if (yyget_debug(scanner)) {
- fprintf(stderr, "REWIND %d\n", rewind);
- }
- slen += rewind;
- len -= rewind;
- }
- /* reset userdata */
- data->bufpos = len;
- data->exc_type = NULL;
- data->exc_val = NULL;
- data->exc_tb = NULL;
- if (yyget_debug(scanner)) {
- fprintf(stderr, "SCANNING `%s'\n", data->buf + len);
- }
- data->lexbuf = yy_scan_bytes(data->buf + len, slen, scanner);
- return 0;
-}
-
-/* delete scanned buffer data */
-int htmllexStop (void* scanner, UserData* data) {
- yy_delete_buffer(data->lexbuf, scanner);
- if (data->nextpos > 0) {
- size_t len = strlen(data->buf);
- int i, j;
- for (i=data->nextpos, j=0; ibuf[j] = data->buf[i];
- }
- data->buf[j] = '\0';
- /* Can return T_ERROR, which is guaranteed to be non-zero. */
- RESIZE_BUF(data->buf, len-data->nextpos + 1);
- data->bufpos -= data->nextpos;
- data->nextpos = 0;
- }
- return 0;
-}
-
-/* destroy scanner when not needed any more */
-int htmllexDestroy (void* scanner) {
- return yylex_destroy(scanner);
-}
diff --git a/linkcheck/HtmlParser/htmllib.py b/linkcheck/HtmlParser/htmllib.py
index 054357f2..75c6a4ec 100644
--- a/linkcheck/HtmlParser/htmllib.py
+++ b/linkcheck/HtmlParser/htmllib.py
@@ -87,7 +87,6 @@ class HtmlPrettyPrinter (object):
@type data: string
@return: None
"""
- data = data.encode(self.encoding, "ignore")
self.fd.write("" % data)
def start_element (self, tag, attrs):
@@ -102,7 +101,7 @@ class HtmlPrettyPrinter (object):
"""
self._start_element(tag, attrs, ">")
- def start_end_element (self, tag, attrs):
+ def start_end_element (self, tag, attrs, element_text=None):
"""
Print HTML start-end element.
@@ -126,14 +125,11 @@ class HtmlPrettyPrinter (object):
@type end: string
@return: None
"""
- tag = tag.encode(self.encoding, "ignore")
self.fd.write("<%s" % tag.replace("/", ""))
for key, val in attrs.items():
- key = key.encode(self.encoding, "ignore")
if val is None:
self.fd.write(" %s" % key)
else:
- val = val.encode(self.encoding, "ignore")
self.fd.write(' %s="%s"' % (key, quote_attrval(val)))
self.fd.write(end)
@@ -145,7 +141,6 @@ class HtmlPrettyPrinter (object):
@type tag: string
@return: None
"""
- tag = tag.encode(self.encoding, "ignore")
self.fd.write("%s>" % tag)
def doctype (self, data):
@@ -156,7 +151,6 @@ class HtmlPrettyPrinter (object):
@type data: string
@return: None
"""
- data = data.encode(self.encoding, "ignore")
self.fd.write("" % data)
def pi (self, data):
@@ -167,7 +161,6 @@ class HtmlPrettyPrinter (object):
@type data: string
@return: None
"""
- data = data.encode(self.encoding, "ignore")
self.fd.write("%s?>" % data)
def cdata (self, data):
@@ -178,7 +171,6 @@ class HtmlPrettyPrinter (object):
@type data: string
@return: None
"""
- data = data.encode(self.encoding, "ignore")
self.fd.write("" % data)
def characters (self, data):
@@ -189,7 +181,6 @@ class HtmlPrettyPrinter (object):
@type data: string
@return: None
"""
- data = data.encode(self.encoding, "ignore")
self.fd.write(data)
diff --git a/linkcheck/HtmlParser/htmlparse.c b/linkcheck/HtmlParser/htmlparse.c
deleted file mode 100644
index c9386988..00000000
--- a/linkcheck/HtmlParser/htmlparse.c
+++ /dev/null
@@ -1,2495 +0,0 @@
-/* A Bison parser, made by GNU Bison 3.0.4. */
-
-/* Bison implementation for Yacc-like parsers in C
-
- Copyright (C) 1984, 1989-1990, 2000-2015 Free Software Foundation, Inc.
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see . */
-
-/* As a special exception, you may create a larger work that contains
- part or all of the Bison parser skeleton and distribute that work
- under terms of your choice, so long as that work isn't itself a
- parser generator using the skeleton or a modified version thereof
- as a parser skeleton. Alternatively, if you modify or redistribute
- the parser skeleton itself, you may (at your option) remove this
- special exception, which will cause the skeleton and the resulting
- Bison output files to be licensed under the GNU General Public
- License without this special exception.
-
- This special exception was added by the Free Software Foundation in
- version 2.2 of Bison. */
-
-/* C LALR(1) parser skeleton written by Richard Stallman, by
- simplifying the original so-called "semantic" parser. */
-
-/* All symbols defined below should begin with yy or YY, to avoid
- infringing on user name space. This should be done even for local
- variables, as they might otherwise be expanded by user macros.
- There are some unavoidable exceptions within include files to
- define necessary library symbols; they are noted "INFRINGES ON
- USER NAME SPACE" below. */
-
-/* Identify Bison output. */
-#define YYBISON 1
-
-/* Bison version. */
-#define YYBISON_VERSION "3.0.4"
-
-/* Skeleton name. */
-#define YYSKELETON_NAME "yacc.c"
-
-/* Pure parsers. */
-#define YYPURE 1
-
-/* Push parsers. */
-#define YYPUSH 0
-
-/* Pull parsers. */
-#define YYPULL 1
-
-
-
-
-/* Copy the first part of user declarations. */
-#line 1 "htmlparse.y" /* yacc.c:339 */
-
-/* Copyright (C) 2000-2014 Bastian Kleineidam
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-*/
-/* A SAX HTML parser. Includes Python module definition to make it
- usable for Python programs.
-*/
-#include "htmlsax.h" /* SAX interface (includes Python.h) */
-#include "structmember.h" /* Python include for object definition */
-#include
-#include
-
-/* bison type definitions */
-#define YYSTYPE PyObject*
-/* extern functions found in htmllex.l */
-extern int yylex(YYSTYPE* yylvalp, void* scanner);
-extern int htmllexInit (void** scanner, UserData* data);
-extern int htmllexDebug (void** scanner, int debug);
-extern int htmllexStart (void* scanner, UserData* data, const char* s, int slen);
-extern int htmllexStop (void* scanner, UserData* data);
-extern int htmllexDestroy (void* scanner);
-extern UserData* yyget_extra(void* scanner);
-extern int yyget_lineno(void*);
-#define YYERROR_VERBOSE 1
-
-/* standard error reporting, indicating an internal error */
-static void yyerror (void *locp, char const *msg) {
- fprintf(stderr, "htmlsax: internal parse error: %s\n", msg);
-}
-
-/* Python 2/3 compatibility */
-#if PY_MAJOR_VERSION >= 3
- #define MOD_ERROR_VAL NULL
- #define MOD_SUCCESS_VAL(val) val
- #define MOD_INIT(name) PyMODINIT_FUNC PyInit_##name(void)
- #define MOD_DEF(ob, name, doc, methods) \
- static struct PyModuleDef moduledef = { \
- PyModuleDef_HEAD_INIT, name, doc, -1, methods, }; \
- ob = PyModule_Create(&moduledef)
- #define PyInt_FromLong PyLong_FromLong
-#else
- #define MOD_ERROR_VAL
- #define MOD_SUCCESS_VAL(val)
- #define MOD_INIT(name) void init##name(void)
- #define MOD_DEF(ob, name, doc, methods) \
- ob = Py_InitModule3(name, methods, doc)
-#endif
-
-
-/* existing Python methods */
-
-/* parser.resolve_entities */
-static PyObject* resolve_entities;
-/* ListDict class, sorted dictionary */
-static PyObject* list_dict;
-/* set_encoding helper function */
-static PyObject* set_encoding;
-/* set_doctype helper function */
-static PyObject* set_doctype;
-/* the unicode string u'meta' */
-static PyObject* u_meta;
-
-/* macros for easier scanner state manipulation */
-
-/* clear buffer b, returning NULL on error */
-#define CLEAR_BUF(b) \
- PyMem_Resize(b, char, 1); \
- if (b == NULL) return NULL; \
- (b)[0] = '\0'
-
-/* clear buffer b, returning NULL and decref self on error */
-#define CLEAR_BUF_DECREF(self, b) \
- PyMem_Resize(b, char, 1); \
- if (b == NULL) { Py_DECREF(self); return NULL; } \
- (b)[0] = '\0'
-
-/* check an error condition and if true set error flag and goto given label */
-#define CHECK_ERROR(cond, label) \
- if (cond) { \
- error = 1; \
- goto label; \
- }
-
-/* generic Python callback macro */
-#define CALLBACK(ud, attr, format, arg, label) \
- if (PyObject_HasAttrString(ud->handler, attr) == 1) { \
- callback = PyObject_GetAttrString(ud->handler, attr); \
- CHECK_ERROR((callback == NULL), label); \
- result = PyObject_CallFunction(callback, format, arg); \
- CHECK_ERROR((result == NULL), label); \
- Py_CLEAR(callback); \
- Py_CLEAR(result); \
- }
-
-/* set old line and column */
-#define SET_OLD_LINECOL \
- ud->last_lineno = ud->lineno; \
- ud->last_column = ud->column
-
-/* parser type definition */
-typedef struct {
- PyObject_HEAD
- /* the handler object */
- PyObject* handler;
- /* the charset encoding (PyBytesObject) */
- PyObject* encoding;
- /* the document type (PyBytesObject) */
- PyObject* doctype;
- UserData* userData;
- void* scanner;
-} parser_object;
-
-/* use Pythons memory management */
-#define YYMALLOC PyMem_Malloc
-#define YYFREE PyMem_Free
-
-/* Test whether tag does not need an HTML end tag.
- @ptag: ASCII encoded Python string in lowercase (!)
- @parser: SAX parser object
- @return: < 0 on error, > 0 if HTML end tag is needed, else 0
-*/
-static int html_end_tag (PyObject* ptag, PyObject* parser) {
- PyObject* pdoctype = NULL;
- char* doctype;
- int error = 0;
- int ret = 1;
- pdoctype = PyObject_GetAttrString(parser, "doctype");
- CHECK_ERROR((pdoctype == NULL), finish_html_end_tag);
- doctype = PyBytes_AsString(pdoctype);
- CHECK_ERROR((doctype == NULL), finish_html_end_tag);
- /* check for HTML (else it's presumably XHTML) */
- if (strcmp(doctype, "HTML") == 0) {
- char* tag = PyBytes_AsString(ptag);
- CHECK_ERROR((tag == NULL), finish_html_end_tag);
- ret = strcmp(tag, "area")!=0 &&
- strcmp(tag, "base")!=0 &&
- strcmp(tag, "basefont")!=0 &&
- strcmp(tag, "br")!=0 &&
- strcmp(tag, "col")!=0 &&
- strcmp(tag, "frame")!=0 &&
- strcmp(tag, "hr")!=0 &&
- strcmp(tag, "img")!=0 &&
- strcmp(tag, "input")!=0 &&
- strcmp(tag, "isindex")!=0 &&
- strcmp(tag, "link")!=0 &&
- strcmp(tag, "meta")!=0 &&
- strcmp(tag, "param")!=0;
- }
-finish_html_end_tag:
- Py_XDECREF(pdoctype);
- if (error) {
- return -1;
- }
- return ret;
-}
-
-
-#line 237 "htmlparse.c" /* yacc.c:339 */
-
-# ifndef YY_NULLPTR
-# if defined __cplusplus && 201103L <= __cplusplus
-# define YY_NULLPTR nullptr
-# else
-# define YY_NULLPTR 0
-# endif
-# endif
-
-/* Enabling verbose error messages. */
-#ifdef YYERROR_VERBOSE
-# undef YYERROR_VERBOSE
-# define YYERROR_VERBOSE 1
-#else
-# define YYERROR_VERBOSE 0
-#endif
-
-/* In a future release of Bison, this section will be replaced
- by #include "htmlparse.h". */
-#ifndef YY_YY_HTMLPARSE_H_INCLUDED
-# define YY_YY_HTMLPARSE_H_INCLUDED
-/* Debug traces. */
-#ifndef YYDEBUG
-# define YYDEBUG 1
-#endif
-#if YYDEBUG
-extern int yydebug;
-#endif
-
-/* Token type. */
-#ifndef YYTOKENTYPE
-# define YYTOKENTYPE
- enum yytokentype
- {
- T_WAIT = 258,
- T_ERROR = 259,
- T_TEXT = 260,
- T_ELEMENT_START = 261,
- T_ELEMENT_START_END = 262,
- T_ELEMENT_END = 263,
- T_SCRIPT = 264,
- T_STYLE = 265,
- T_PI = 266,
- T_COMMENT = 267,
- T_CDATA = 268,
- T_DOCTYPE = 269
- };
-#endif
-
-/* Value type. */
-#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
-typedef int YYSTYPE;
-# define YYSTYPE_IS_TRIVIAL 1
-# define YYSTYPE_IS_DECLARED 1
-#endif
-
-
-
-int yyparse (PyObject* scanner);
-
-#endif /* !YY_YY_HTMLPARSE_H_INCLUDED */
-
-/* Copy the second part of user declarations. */
-
-#line 302 "htmlparse.c" /* yacc.c:358 */
-
-#ifdef short
-# undef short
-#endif
-
-#ifdef YYTYPE_UINT8
-typedef YYTYPE_UINT8 yytype_uint8;
-#else
-typedef unsigned char yytype_uint8;
-#endif
-
-#ifdef YYTYPE_INT8
-typedef YYTYPE_INT8 yytype_int8;
-#else
-typedef signed char yytype_int8;
-#endif
-
-#ifdef YYTYPE_UINT16
-typedef YYTYPE_UINT16 yytype_uint16;
-#else
-typedef unsigned short int yytype_uint16;
-#endif
-
-#ifdef YYTYPE_INT16
-typedef YYTYPE_INT16 yytype_int16;
-#else
-typedef short int yytype_int16;
-#endif
-
-#ifndef YYSIZE_T
-# ifdef __SIZE_TYPE__
-# define YYSIZE_T __SIZE_TYPE__
-# elif defined size_t
-# define YYSIZE_T size_t
-# elif ! defined YYSIZE_T
-# include /* INFRINGES ON USER NAME SPACE */
-# define YYSIZE_T size_t
-# else
-# define YYSIZE_T unsigned int
-# endif
-#endif
-
-#define YYSIZE_MAXIMUM ((YYSIZE_T) -1)
-
-#ifndef YY_
-# if defined YYENABLE_NLS && YYENABLE_NLS
-# if ENABLE_NLS
-# include /* INFRINGES ON USER NAME SPACE */
-# define YY_(Msgid) dgettext ("bison-runtime", Msgid)
-# endif
-# endif
-# ifndef YY_
-# define YY_(Msgid) Msgid
-# endif
-#endif
-
-#ifndef YY_ATTRIBUTE
-# if (defined __GNUC__ \
- && (2 < __GNUC__ || (__GNUC__ == 2 && 96 <= __GNUC_MINOR__))) \
- || defined __SUNPRO_C && 0x5110 <= __SUNPRO_C
-# define YY_ATTRIBUTE(Spec) __attribute__(Spec)
-# else
-# define YY_ATTRIBUTE(Spec) /* empty */
-# endif
-#endif
-
-#ifndef YY_ATTRIBUTE_PURE
-# define YY_ATTRIBUTE_PURE YY_ATTRIBUTE ((__pure__))
-#endif
-
-#ifndef YY_ATTRIBUTE_UNUSED
-# define YY_ATTRIBUTE_UNUSED YY_ATTRIBUTE ((__unused__))
-#endif
-
-#if !defined _Noreturn \
- && (!defined __STDC_VERSION__ || __STDC_VERSION__ < 201112)
-# if defined _MSC_VER && 1200 <= _MSC_VER
-# define _Noreturn __declspec (noreturn)
-# else
-# define _Noreturn YY_ATTRIBUTE ((__noreturn__))
-# endif
-#endif
-
-/* Suppress unused-variable warnings by "using" E. */
-#if ! defined lint || defined __GNUC__
-# define YYUSE(E) ((void) (E))
-#else
-# define YYUSE(E) /* empty */
-#endif
-
-#if defined __GNUC__ && 407 <= __GNUC__ * 100 + __GNUC_MINOR__
-/* Suppress an incorrect diagnostic about yylval being uninitialized. */
-# define YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN \
- _Pragma ("GCC diagnostic push") \
- _Pragma ("GCC diagnostic ignored \"-Wuninitialized\"")\
- _Pragma ("GCC diagnostic ignored \"-Wmaybe-uninitialized\"")
-# define YY_IGNORE_MAYBE_UNINITIALIZED_END \
- _Pragma ("GCC diagnostic pop")
-#else
-# define YY_INITIAL_VALUE(Value) Value
-#endif
-#ifndef YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
-# define YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
-# define YY_IGNORE_MAYBE_UNINITIALIZED_END
-#endif
-#ifndef YY_INITIAL_VALUE
-# define YY_INITIAL_VALUE(Value) /* Nothing. */
-#endif
-
-
-#if ! defined yyoverflow || YYERROR_VERBOSE
-
-/* The parser invokes alloca or malloc; define the necessary symbols. */
-
-# ifdef YYSTACK_USE_ALLOCA
-# if YYSTACK_USE_ALLOCA
-# ifdef __GNUC__
-# define YYSTACK_ALLOC __builtin_alloca
-# elif defined __BUILTIN_VA_ARG_INCR
-# include /* INFRINGES ON USER NAME SPACE */
-# elif defined _AIX
-# define YYSTACK_ALLOC __alloca
-# elif defined _MSC_VER
-# include /* INFRINGES ON USER NAME SPACE */
-# define alloca _alloca
-# else
-# define YYSTACK_ALLOC alloca
-# if ! defined _ALLOCA_H && ! defined EXIT_SUCCESS
-# include /* INFRINGES ON USER NAME SPACE */
- /* Use EXIT_SUCCESS as a witness for stdlib.h. */
-# ifndef EXIT_SUCCESS
-# define EXIT_SUCCESS 0
-# endif
-# endif
-# endif
-# endif
-# endif
-
-# ifdef YYSTACK_ALLOC
- /* Pacify GCC's 'empty if-body' warning. */
-# define YYSTACK_FREE(Ptr) do { /* empty */; } while (0)
-# ifndef YYSTACK_ALLOC_MAXIMUM
- /* The OS might guarantee only one guard page at the bottom of the stack,
- and a page size can be as small as 4096 bytes. So we cannot safely
- invoke alloca (N) if N exceeds 4096. Use a slightly smaller number
- to allow for a few compiler-allocated temporary stack slots. */
-# define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */
-# endif
-# else
-# define YYSTACK_ALLOC YYMALLOC
-# define YYSTACK_FREE YYFREE
-# ifndef YYSTACK_ALLOC_MAXIMUM
-# define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM
-# endif
-# if (defined __cplusplus && ! defined EXIT_SUCCESS \
- && ! ((defined YYMALLOC || defined malloc) \
- && (defined YYFREE || defined free)))
-# include /* INFRINGES ON USER NAME SPACE */
-# ifndef EXIT_SUCCESS
-# define EXIT_SUCCESS 0
-# endif
-# endif
-# ifndef YYMALLOC
-# define YYMALLOC malloc
-# if ! defined malloc && ! defined EXIT_SUCCESS
-void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */
-# endif
-# endif
-# ifndef YYFREE
-# define YYFREE free
-# if ! defined free && ! defined EXIT_SUCCESS
-void free (void *); /* INFRINGES ON USER NAME SPACE */
-# endif
-# endif
-# endif
-#endif /* ! defined yyoverflow || YYERROR_VERBOSE */
-
-
-#if (! defined yyoverflow \
- && (! defined __cplusplus \
- || (defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL)))
-
-/* A type that is properly aligned for any stack member. */
-union yyalloc
-{
- yytype_int16 yyss_alloc;
- YYSTYPE yyvs_alloc;
-};
-
-/* The size of the maximum gap between one aligned stack and the next. */
-# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1)
-
-/* The size of an array large to enough to hold all stacks, each with
- N elements. */
-# define YYSTACK_BYTES(N) \
- ((N) * (sizeof (yytype_int16) + sizeof (YYSTYPE)) \
- + YYSTACK_GAP_MAXIMUM)
-
-# define YYCOPY_NEEDED 1
-
-/* Relocate STACK from its old location to the new one. The
- local variables YYSIZE and YYSTACKSIZE give the old and new number of
- elements in the stack, and YYPTR gives the new location of the
- stack. Advance YYPTR to a properly aligned location for the next
- stack. */
-# define YYSTACK_RELOCATE(Stack_alloc, Stack) \
- do \
- { \
- YYSIZE_T yynewbytes; \
- YYCOPY (&yyptr->Stack_alloc, Stack, yysize); \
- Stack = &yyptr->Stack_alloc; \
- yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \
- yyptr += yynewbytes / sizeof (*yyptr); \
- } \
- while (0)
-
-#endif
-
-#if defined YYCOPY_NEEDED && YYCOPY_NEEDED
-/* Copy COUNT objects from SRC to DST. The source and destination do
- not overlap. */
-# ifndef YYCOPY
-# if defined __GNUC__ && 1 < __GNUC__
-# define YYCOPY(Dst, Src, Count) \
- __builtin_memcpy (Dst, Src, (Count) * sizeof (*(Src)))
-# else
-# define YYCOPY(Dst, Src, Count) \
- do \
- { \
- YYSIZE_T yyi; \
- for (yyi = 0; yyi < (Count); yyi++) \
- (Dst)[yyi] = (Src)[yyi]; \
- } \
- while (0)
-# endif
-# endif
-#endif /* !YYCOPY_NEEDED */
-
-/* YYFINAL -- State number of the termination state. */
-#define YYFINAL 15
-/* YYLAST -- Last index in YYTABLE. */
-#define YYLAST 26
-
-/* YYNTOKENS -- Number of terminals. */
-#define YYNTOKENS 15
-/* YYNNTS -- Number of nonterminals. */
-#define YYNNTS 3
-/* YYNRULES -- Number of rules. */
-#define YYNRULES 15
-/* YYNSTATES -- Number of states. */
-#define YYNSTATES 17
-
-/* YYTRANSLATE[YYX] -- Symbol number corresponding to YYX as returned
- by yylex, with out-of-bounds checking. */
-#define YYUNDEFTOK 2
-#define YYMAXUTOK 269
-
-#define YYTRANSLATE(YYX) \
- ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK)
-
-/* YYTRANSLATE[TOKEN-NUM] -- Symbol number corresponding to TOKEN-NUM
- as returned by yylex, without out-of-bounds checking. */
-static const yytype_uint8 yytranslate[] =
-{
- 0, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 1, 2, 3, 4,
- 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
-};
-
-#if YYDEBUG
- /* YYRLINE[YYN] -- Source line where rule number YYN was defined. */
-static const yytype_uint16 yyrline[] =
-{
- 0, 196, 196, 199, 204, 208, 215, 256, 304, 340,
- 359, 377, 396, 419, 443, 467
-};
-#endif
-
-#if YYDEBUG || YYERROR_VERBOSE || 0
-/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM.
- First, the terminals, then, starting at YYNTOKENS, nonterminals. */
-static const char *const yytname[] =
-{
- "$end", "error", "$undefined", "T_WAIT", "T_ERROR", "T_TEXT",
- "T_ELEMENT_START", "T_ELEMENT_START_END", "T_ELEMENT_END", "T_SCRIPT",
- "T_STYLE", "T_PI", "T_COMMENT", "T_CDATA", "T_DOCTYPE", "$accept",
- "elements", "element", YY_NULLPTR
-};
-#endif
-
-# ifdef YYPRINT
-/* YYTOKNUM[NUM] -- (External) token number corresponding to the
- (internal) symbol number NUM (which must be that of a token). */
-static const yytype_uint16 yytoknum[] =
-{
- 0, 256, 257, 258, 259, 260, 261, 262, 263, 264,
- 265, 266, 267, 268, 269
-};
-# endif
-
-#define YYPACT_NINF -13
-
-#define yypact_value_is_default(Yystate) \
- (!!((Yystate) == (-13)))
-
-#define YYTABLE_NINF -1
-
-#define yytable_value_is_error(Yytable_value) \
- 0
-
- /* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing
- STATE-NUM. */
-static const yytype_int8 yypact[] =
-{
- 12, -13, -13, -13, -13, -13, -13, -13, -13, -13,
- -13, -13, -13, 0, -13, -13, -13
-};
-
- /* YYDEFACT[STATE-NUM] -- Default reduction number in state STATE-NUM.
- Performed when YYTABLE does not specify something else to do. Zero
- means the default is an error. */
-static const yytype_uint8 yydefact[] =
-{
- 0, 4, 5, 15, 6, 7, 8, 13, 14, 10,
- 9, 11, 12, 0, 2, 1, 3
-};
-
- /* YYPGOTO[NTERM-NUM]. */
-static const yytype_int8 yypgoto[] =
-{
- -13, -13, -12
-};
-
- /* YYDEFGOTO[NTERM-NUM]. */
-static const yytype_int8 yydefgoto[] =
-{
- -1, 13, 14
-};
-
- /* YYTABLE[YYPACT[STATE-NUM]] -- What to do in state STATE-NUM. If
- positive, shift that token. If negative, reduce the rule whose
- number is the opposite. If YYTABLE_NINF, syntax error. */
-static const yytype_uint8 yytable[] =
-{
- 15, 16, 0, 1, 2, 3, 4, 5, 6, 7,
- 8, 9, 10, 11, 12, 1, 2, 3, 4, 5,
- 6, 7, 8, 9, 10, 11, 12
-};
-
-static const yytype_int8 yycheck[] =
-{
- 0, 13, -1, 3, 4, 5, 6, 7, 8, 9,
- 10, 11, 12, 13, 14, 3, 4, 5, 6, 7,
- 8, 9, 10, 11, 12, 13, 14
-};
-
- /* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
- symbol of state STATE-NUM. */
-static const yytype_uint8 yystos[] =
-{
- 0, 3, 4, 5, 6, 7, 8, 9, 10, 11,
- 12, 13, 14, 16, 17, 0, 17
-};
-
- /* YYR1[YYN] -- Symbol number of symbol that rule YYN derives. */
-static const yytype_uint8 yyr1[] =
-{
- 0, 15, 16, 16, 17, 17, 17, 17, 17, 17,
- 17, 17, 17, 17, 17, 17
-};
-
- /* YYR2[YYN] -- Number of symbols on the right hand side of rule YYN. */
-static const yytype_uint8 yyr2[] =
-{
- 0, 2, 1, 2, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1
-};
-
-
-#define yyerrok (yyerrstatus = 0)
-#define yyclearin (yychar = YYEMPTY)
-#define YYEMPTY (-2)
-#define YYEOF 0
-
-#define YYACCEPT goto yyacceptlab
-#define YYABORT goto yyabortlab
-#define YYERROR goto yyerrorlab
-
-
-#define YYRECOVERING() (!!yyerrstatus)
-
-#define YYBACKUP(Token, Value) \
-do \
- if (yychar == YYEMPTY) \
- { \
- yychar = (Token); \
- yylval = (Value); \
- YYPOPSTACK (yylen); \
- yystate = *yyssp; \
- goto yybackup; \
- } \
- else \
- { \
- yyerror (scanner, YY_("syntax error: cannot back up")); \
- YYERROR; \
- } \
-while (0)
-
-/* Error token number */
-#define YYTERROR 1
-#define YYERRCODE 256
-
-
-
-/* Enable debugging if requested. */
-#if YYDEBUG
-
-# ifndef YYFPRINTF
-# include /* INFRINGES ON USER NAME SPACE */
-# define YYFPRINTF fprintf
-# endif
-
-# define YYDPRINTF(Args) \
-do { \
- if (yydebug) \
- YYFPRINTF Args; \
-} while (0)
-
-/* This macro is provided for backward compatibility. */
-#ifndef YY_LOCATION_PRINT
-# define YY_LOCATION_PRINT(File, Loc) ((void) 0)
-#endif
-
-
-# define YY_SYMBOL_PRINT(Title, Type, Value, Location) \
-do { \
- if (yydebug) \
- { \
- YYFPRINTF (stderr, "%s ", Title); \
- yy_symbol_print (stderr, \
- Type, Value, scanner); \
- YYFPRINTF (stderr, "\n"); \
- } \
-} while (0)
-
-
-/*----------------------------------------.
-| Print this symbol's value on YYOUTPUT. |
-`----------------------------------------*/
-
-static void
-yy_symbol_value_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep, PyObject* scanner)
-{
- FILE *yyo = yyoutput;
- YYUSE (yyo);
- YYUSE (scanner);
- if (!yyvaluep)
- return;
-# ifdef YYPRINT
- if (yytype < YYNTOKENS)
- YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep);
-# endif
- YYUSE (yytype);
-}
-
-
-/*--------------------------------.
-| Print this symbol on YYOUTPUT. |
-`--------------------------------*/
-
-static void
-yy_symbol_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep, PyObject* scanner)
-{
- YYFPRINTF (yyoutput, "%s %s (",
- yytype < YYNTOKENS ? "token" : "nterm", yytname[yytype]);
-
- yy_symbol_value_print (yyoutput, yytype, yyvaluep, scanner);
- YYFPRINTF (yyoutput, ")");
-}
-
-/*------------------------------------------------------------------.
-| yy_stack_print -- Print the state stack from its BOTTOM up to its |
-| TOP (included). |
-`------------------------------------------------------------------*/
-
-static void
-yy_stack_print (yytype_int16 *yybottom, yytype_int16 *yytop)
-{
- YYFPRINTF (stderr, "Stack now");
- for (; yybottom <= yytop; yybottom++)
- {
- int yybot = *yybottom;
- YYFPRINTF (stderr, " %d", yybot);
- }
- YYFPRINTF (stderr, "\n");
-}
-
-# define YY_STACK_PRINT(Bottom, Top) \
-do { \
- if (yydebug) \
- yy_stack_print ((Bottom), (Top)); \
-} while (0)
-
-
-/*------------------------------------------------.
-| Report that the YYRULE is going to be reduced. |
-`------------------------------------------------*/
-
-static void
-yy_reduce_print (yytype_int16 *yyssp, YYSTYPE *yyvsp, int yyrule, PyObject* scanner)
-{
- unsigned long int yylno = yyrline[yyrule];
- int yynrhs = yyr2[yyrule];
- int yyi;
- YYFPRINTF (stderr, "Reducing stack by rule %d (line %lu):\n",
- yyrule - 1, yylno);
- /* The symbols being reduced. */
- for (yyi = 0; yyi < yynrhs; yyi++)
- {
- YYFPRINTF (stderr, " $%d = ", yyi + 1);
- yy_symbol_print (stderr,
- yystos[yyssp[yyi + 1 - yynrhs]],
- &(yyvsp[(yyi + 1) - (yynrhs)])
- , scanner);
- YYFPRINTF (stderr, "\n");
- }
-}
-
-# define YY_REDUCE_PRINT(Rule) \
-do { \
- if (yydebug) \
- yy_reduce_print (yyssp, yyvsp, Rule, scanner); \
-} while (0)
-
-/* Nonzero means print parse trace. It is left uninitialized so that
- multiple parsers can coexist. */
-int yydebug;
-#else /* !YYDEBUG */
-# define YYDPRINTF(Args)
-# define YY_SYMBOL_PRINT(Title, Type, Value, Location)
-# define YY_STACK_PRINT(Bottom, Top)
-# define YY_REDUCE_PRINT(Rule)
-#endif /* !YYDEBUG */
-
-
-/* YYINITDEPTH -- initial size of the parser's stacks. */
-#ifndef YYINITDEPTH
-# define YYINITDEPTH 200
-#endif
-
-/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only
- if the built-in stack extension method is used).
-
- Do not make this value too large; the results are undefined if
- YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH)
- evaluated with infinite-precision integer arithmetic. */
-
-#ifndef YYMAXDEPTH
-# define YYMAXDEPTH 10000
-#endif
-
-
-#if YYERROR_VERBOSE
-
-# ifndef yystrlen
-# if defined __GLIBC__ && defined _STRING_H
-# define yystrlen strlen
-# else
-/* Return the length of YYSTR. */
-static YYSIZE_T
-yystrlen (const char *yystr)
-{
- YYSIZE_T yylen;
- for (yylen = 0; yystr[yylen]; yylen++)
- continue;
- return yylen;
-}
-# endif
-# endif
-
-# ifndef yystpcpy
-# if defined __GLIBC__ && defined _STRING_H && defined _GNU_SOURCE
-# define yystpcpy stpcpy
-# else
-/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in
- YYDEST. */
-static char *
-yystpcpy (char *yydest, const char *yysrc)
-{
- char *yyd = yydest;
- const char *yys = yysrc;
-
- while ((*yyd++ = *yys++) != '\0')
- continue;
-
- return yyd - 1;
-}
-# endif
-# endif
-
-# ifndef yytnamerr
-/* Copy to YYRES the contents of YYSTR after stripping away unnecessary
- quotes and backslashes, so that it's suitable for yyerror. The
- heuristic is that double-quoting is unnecessary unless the string
- contains an apostrophe, a comma, or backslash (other than
- backslash-backslash). YYSTR is taken from yytname. If YYRES is
- null, do not copy; instead, return the length of what the result
- would have been. */
-static YYSIZE_T
-yytnamerr (char *yyres, const char *yystr)
-{
- if (*yystr == '"')
- {
- YYSIZE_T yyn = 0;
- char const *yyp = yystr;
-
- for (;;)
- switch (*++yyp)
- {
- case '\'':
- case ',':
- goto do_not_strip_quotes;
-
- case '\\':
- if (*++yyp != '\\')
- goto do_not_strip_quotes;
- /* Fall through. */
- default:
- if (yyres)
- yyres[yyn] = *yyp;
- yyn++;
- break;
-
- case '"':
- if (yyres)
- yyres[yyn] = '\0';
- return yyn;
- }
- do_not_strip_quotes: ;
- }
-
- if (! yyres)
- return yystrlen (yystr);
-
- return yystpcpy (yyres, yystr) - yyres;
-}
-# endif
-
-/* Copy into *YYMSG, which is of size *YYMSG_ALLOC, an error message
- about the unexpected token YYTOKEN for the state stack whose top is
- YYSSP.
-
- Return 0 if *YYMSG was successfully written. Return 1 if *YYMSG is
- not large enough to hold the message. In that case, also set
- *YYMSG_ALLOC to the required number of bytes. Return 2 if the
- required number of bytes is too large to store. */
-static int
-yysyntax_error (YYSIZE_T *yymsg_alloc, char **yymsg,
- yytype_int16 *yyssp, int yytoken)
-{
- YYSIZE_T yysize0 = yytnamerr (YY_NULLPTR, yytname[yytoken]);
- YYSIZE_T yysize = yysize0;
- enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 };
- /* Internationalized format string. */
- const char *yyformat = YY_NULLPTR;
- /* Arguments of yyformat. */
- char const *yyarg[YYERROR_VERBOSE_ARGS_MAXIMUM];
- /* Number of reported tokens (one for the "unexpected", one per
- "expected"). */
- int yycount = 0;
-
- /* There are many possibilities here to consider:
- - If this state is a consistent state with a default action, then
- the only way this function was invoked is if the default action
- is an error action. In that case, don't check for expected
- tokens because there are none.
- - The only way there can be no lookahead present (in yychar) is if
- this state is a consistent state with a default action. Thus,
- detecting the absence of a lookahead is sufficient to determine
- that there is no unexpected or expected token to report. In that
- case, just report a simple "syntax error".
- - Don't assume there isn't a lookahead just because this state is a
- consistent state with a default action. There might have been a
- previous inconsistent state, consistent state with a non-default
- action, or user semantic action that manipulated yychar.
- - Of course, the expected token list depends on states to have
- correct lookahead information, and it depends on the parser not
- to perform extra reductions after fetching a lookahead from the
- scanner and before detecting a syntax error. Thus, state merging
- (from LALR or IELR) and default reductions corrupt the expected
- token list. However, the list is correct for canonical LR with
- one exception: it will still contain any token that will not be
- accepted due to an error action in a later state.
- */
- if (yytoken != YYEMPTY)
- {
- int yyn = yypact[*yyssp];
- yyarg[yycount++] = yytname[yytoken];
- if (!yypact_value_is_default (yyn))
- {
- /* Start YYX at -YYN if negative to avoid negative indexes in
- YYCHECK. In other words, skip the first -YYN actions for
- this state because they are default actions. */
- int yyxbegin = yyn < 0 ? -yyn : 0;
- /* Stay within bounds of both yycheck and yytname. */
- int yychecklim = YYLAST - yyn + 1;
- int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS;
- int yyx;
-
- for (yyx = yyxbegin; yyx < yyxend; ++yyx)
- if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR
- && !yytable_value_is_error (yytable[yyx + yyn]))
- {
- if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM)
- {
- yycount = 1;
- yysize = yysize0;
- break;
- }
- yyarg[yycount++] = yytname[yyx];
- {
- YYSIZE_T yysize1 = yysize + yytnamerr (YY_NULLPTR, yytname[yyx]);
- if (! (yysize <= yysize1
- && yysize1 <= YYSTACK_ALLOC_MAXIMUM))
- return 2;
- yysize = yysize1;
- }
- }
- }
- }
-
- switch (yycount)
- {
-# define YYCASE_(N, S) \
- case N: \
- yyformat = S; \
- break
- YYCASE_(0, YY_("syntax error"));
- YYCASE_(1, YY_("syntax error, unexpected %s"));
- YYCASE_(2, YY_("syntax error, unexpected %s, expecting %s"));
- YYCASE_(3, YY_("syntax error, unexpected %s, expecting %s or %s"));
- YYCASE_(4, YY_("syntax error, unexpected %s, expecting %s or %s or %s"));
- YYCASE_(5, YY_("syntax error, unexpected %s, expecting %s or %s or %s or %s"));
-# undef YYCASE_
- }
-
- {
- YYSIZE_T yysize1 = yysize + yystrlen (yyformat);
- if (! (yysize <= yysize1 && yysize1 <= YYSTACK_ALLOC_MAXIMUM))
- return 2;
- yysize = yysize1;
- }
-
- if (*yymsg_alloc < yysize)
- {
- *yymsg_alloc = 2 * yysize;
- if (! (yysize <= *yymsg_alloc
- && *yymsg_alloc <= YYSTACK_ALLOC_MAXIMUM))
- *yymsg_alloc = YYSTACK_ALLOC_MAXIMUM;
- return 1;
- }
-
- /* Avoid sprintf, as that infringes on the user's name space.
- Don't have undefined behavior even if the translation
- produced a string with the wrong number of "%s"s. */
- {
- char *yyp = *yymsg;
- int yyi = 0;
- while ((*yyp = *yyformat) != '\0')
- if (*yyp == '%' && yyformat[1] == 's' && yyi < yycount)
- {
- yyp += yytnamerr (yyp, yyarg[yyi++]);
- yyformat += 2;
- }
- else
- {
- yyp++;
- yyformat++;
- }
- }
- return 0;
-}
-#endif /* YYERROR_VERBOSE */
-
-/*-----------------------------------------------.
-| Release the memory associated to this symbol. |
-`-----------------------------------------------*/
-
-static void
-yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep, PyObject* scanner)
-{
- YYUSE (yyvaluep);
- YYUSE (scanner);
- if (!yymsg)
- yymsg = "Deleting";
- YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp);
-
- YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
- YYUSE (yytype);
- YY_IGNORE_MAYBE_UNINITIALIZED_END
-}
-
-
-
-
-/*----------.
-| yyparse. |
-`----------*/
-
-int
-yyparse (PyObject* scanner)
-{
-/* The lookahead symbol. */
-int yychar;
-
-
-/* The semantic value of the lookahead symbol. */
-/* Default value used for initialization, for pacifying older GCCs
- or non-GCC compilers. */
-YY_INITIAL_VALUE (static YYSTYPE yyval_default;)
-YYSTYPE yylval YY_INITIAL_VALUE (= yyval_default);
-
- /* Number of syntax errors so far. */
- int yynerrs;
-
- int yystate;
- /* Number of tokens to shift before error messages enabled. */
- int yyerrstatus;
-
- /* The stacks and their tools:
- 'yyss': related to states.
- 'yyvs': related to semantic values.
-
- Refer to the stacks through separate pointers, to allow yyoverflow
- to reallocate them elsewhere. */
-
- /* The state stack. */
- yytype_int16 yyssa[YYINITDEPTH];
- yytype_int16 *yyss;
- yytype_int16 *yyssp;
-
- /* The semantic value stack. */
- YYSTYPE yyvsa[YYINITDEPTH];
- YYSTYPE *yyvs;
- YYSTYPE *yyvsp;
-
- YYSIZE_T yystacksize;
-
- int yyn;
- int yyresult;
- /* Lookahead token as an internal (translated) token number. */
- int yytoken = 0;
- /* The variables used to return semantic value and location from the
- action routines. */
- YYSTYPE yyval;
-
-#if YYERROR_VERBOSE
- /* Buffer for error messages, and its allocated size. */
- char yymsgbuf[128];
- char *yymsg = yymsgbuf;
- YYSIZE_T yymsg_alloc = sizeof yymsgbuf;
-#endif
-
-#define YYPOPSTACK(N) (yyvsp -= (N), yyssp -= (N))
-
- /* The number of symbols on the RHS of the reduced rule.
- Keep to zero when no symbol should be popped. */
- int yylen = 0;
-
- yyssp = yyss = yyssa;
- yyvsp = yyvs = yyvsa;
- yystacksize = YYINITDEPTH;
-
- YYDPRINTF ((stderr, "Starting parse\n"));
-
- yystate = 0;
- yyerrstatus = 0;
- yynerrs = 0;
- yychar = YYEMPTY; /* Cause a token to be read. */
- goto yysetstate;
-
-/*------------------------------------------------------------.
-| yynewstate -- Push a new state, which is found in yystate. |
-`------------------------------------------------------------*/
- yynewstate:
- /* In all cases, when you get here, the value and location stacks
- have just been pushed. So pushing a state here evens the stacks. */
- yyssp++;
-
- yysetstate:
- *yyssp = yystate;
-
- if (yyss + yystacksize - 1 <= yyssp)
- {
- /* Get the current used size of the three stacks, in elements. */
- YYSIZE_T yysize = yyssp - yyss + 1;
-
-#ifdef yyoverflow
- {
- /* Give user a chance to reallocate the stack. Use copies of
- these so that the &'s don't force the real ones into
- memory. */
- YYSTYPE *yyvs1 = yyvs;
- yytype_int16 *yyss1 = yyss;
-
- /* Each stack pointer address is followed by the size of the
- data in use in that stack, in bytes. This used to be a
- conditional around just the two extra args, but that might
- be undefined if yyoverflow is a macro. */
- yyoverflow (YY_("memory exhausted"),
- &yyss1, yysize * sizeof (*yyssp),
- &yyvs1, yysize * sizeof (*yyvsp),
- &yystacksize);
-
- yyss = yyss1;
- yyvs = yyvs1;
- }
-#else /* no yyoverflow */
-# ifndef YYSTACK_RELOCATE
- goto yyexhaustedlab;
-# else
- /* Extend the stack our own way. */
- if (YYMAXDEPTH <= yystacksize)
- goto yyexhaustedlab;
- yystacksize *= 2;
- if (YYMAXDEPTH < yystacksize)
- yystacksize = YYMAXDEPTH;
-
- {
- yytype_int16 *yyss1 = yyss;
- union yyalloc *yyptr =
- (union yyalloc *) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize));
- if (! yyptr)
- goto yyexhaustedlab;
- YYSTACK_RELOCATE (yyss_alloc, yyss);
- YYSTACK_RELOCATE (yyvs_alloc, yyvs);
-# undef YYSTACK_RELOCATE
- if (yyss1 != yyssa)
- YYSTACK_FREE (yyss1);
- }
-# endif
-#endif /* no yyoverflow */
-
- yyssp = yyss + yysize - 1;
- yyvsp = yyvs + yysize - 1;
-
- YYDPRINTF ((stderr, "Stack size increased to %lu\n",
- (unsigned long int) yystacksize));
-
- if (yyss + yystacksize - 1 <= yyssp)
- YYABORT;
- }
-
- YYDPRINTF ((stderr, "Entering state %d\n", yystate));
-
- if (yystate == YYFINAL)
- YYACCEPT;
-
- goto yybackup;
-
-/*-----------.
-| yybackup. |
-`-----------*/
-yybackup:
-
- /* Do appropriate processing given the current state. Read a
- lookahead token if we need one and don't already have one. */
-
- /* First try to decide what to do without reference to lookahead token. */
- yyn = yypact[yystate];
- if (yypact_value_is_default (yyn))
- goto yydefault;
-
- /* Not known => get a lookahead token if don't already have one. */
-
- /* YYCHAR is either YYEMPTY or YYEOF or a valid lookahead symbol. */
- if (yychar == YYEMPTY)
- {
- YYDPRINTF ((stderr, "Reading a token: "));
- yychar = yylex (&yylval, scanner);
- }
-
- if (yychar <= YYEOF)
- {
- yychar = yytoken = YYEOF;
- YYDPRINTF ((stderr, "Now at end of input.\n"));
- }
- else
- {
- yytoken = YYTRANSLATE (yychar);
- YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc);
- }
-
- /* If the proper action on seeing token YYTOKEN is to reduce or to
- detect an error, take that action. */
- yyn += yytoken;
- if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken)
- goto yydefault;
- yyn = yytable[yyn];
- if (yyn <= 0)
- {
- if (yytable_value_is_error (yyn))
- goto yyerrlab;
- yyn = -yyn;
- goto yyreduce;
- }
-
- /* Count tokens shifted since error; after three, turn off error
- status. */
- if (yyerrstatus)
- yyerrstatus--;
-
- /* Shift the lookahead token. */
- YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc);
-
- /* Discard the shifted token. */
- yychar = YYEMPTY;
-
- yystate = yyn;
- YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
- *++yyvsp = yylval;
- YY_IGNORE_MAYBE_UNINITIALIZED_END
-
- goto yynewstate;
-
-
-/*-----------------------------------------------------------.
-| yydefault -- do the default action for the current state. |
-`-----------------------------------------------------------*/
-yydefault:
- yyn = yydefact[yystate];
- if (yyn == 0)
- goto yyerrlab;
- goto yyreduce;
-
-
-/*-----------------------------.
-| yyreduce -- Do a reduction. |
-`-----------------------------*/
-yyreduce:
- /* yyn is the number of a rule to reduce with. */
- yylen = yyr2[yyn];
-
- /* If YYLEN is nonzero, implement the default value of the action:
- '$$ = $1'.
-
- Otherwise, the following line sets YYVAL to garbage.
- This behavior is undocumented and Bison
- users should not rely upon it. Assigning to YYVAL
- unconditionally makes the parser a bit smaller, and it avoids a
- GCC warning that YYVAL may be used uninitialized. */
- yyval = yyvsp[1-yylen];
-
-
- YY_REDUCE_PRINT (yyn);
- switch (yyn)
- {
- case 2:
-#line 196 "htmlparse.y" /* yacc.c:1646 */
- {
- /* parse a single element */
-}
-#line 1389 "htmlparse.c" /* yacc.c:1646 */
- break;
-
- case 3:
-#line 199 "htmlparse.y" /* yacc.c:1646 */
- {
- /* parse a list of elements */
-}
-#line 1397 "htmlparse.c" /* yacc.c:1646 */
- break;
-
- case 4:
-#line 204 "htmlparse.y" /* yacc.c:1646 */
- {
- /* wait for more lexer input */
- YYACCEPT;
-}
-#line 1406 "htmlparse.c" /* yacc.c:1646 */
- break;
-
- case 5:
-#line 209 "htmlparse.y" /* yacc.c:1646 */
- {
- /* an error occured in the scanner, the python exception must be set */
- UserData* ud = yyget_extra(scanner);
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
-}
-#line 1417 "htmlparse.c" /* yacc.c:1646 */
- break;
-
- case 6:
-#line 216 "htmlparse.y" /* yacc.c:1646 */
- {
- /* parsed HTML start tag (eg. )
- $1 is a PyTuple (, )
- is a PyObject, is a ListDict */
- UserData* ud = yyget_extra(scanner);
- PyObject* callback = NULL;
- PyObject* result = NULL;
- PyObject* tag = PyTuple_GET_ITEM((yyvsp[0]), 0);
- PyObject* attrs = PyTuple_GET_ITEM((yyvsp[0]), 1);
- int error = 0;
- int cmp;
- CHECK_ERROR((tag == NULL || attrs == NULL), finish_start);
- cmp = PyObject_RichCompareBool(tag, u_meta, Py_EQ);
- CHECK_ERROR((cmp == -1), finish_start);
- if (cmp == 1) {
- /* set encoding */
- result = PyObject_CallFunction(set_encoding, "OO", ud->parser, attrs);
- CHECK_ERROR((result == NULL), finish_start);
- Py_CLEAR(result);
- }
- if (PyObject_HasAttrString(ud->handler, "start_element") == 1) {
- callback = PyObject_GetAttrString(ud->handler, "start_element");
- CHECK_ERROR((!callback), finish_start);
- result = PyObject_CallFunction(callback, "OO", tag, attrs);
- CHECK_ERROR((!result), finish_start);
- Py_CLEAR(callback);
- Py_CLEAR(result);
- }
-finish_start:
- Py_XDECREF(callback);
- Py_XDECREF(result);
- Py_XDECREF(tag);
- Py_XDECREF(attrs);
- Py_DECREF((yyvsp[0]));
- if (error) {
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
- }
- SET_OLD_LINECOL;
-}
-#line 1462 "htmlparse.c" /* yacc.c:1646 */
- break;
-
- case 7:
-#line 257 "htmlparse.y" /* yacc.c:1646 */
- {
- /* parsed HTML start-end tag (eg.
)
- $1 is a PyTuple (, )
- is a PyObject, is a ListDict */
- UserData* ud = yyget_extra(scanner);
- PyObject* callback = NULL;
- PyObject* result = NULL;
- PyObject* tag = PyTuple_GET_ITEM((yyvsp[0]), 0);
- PyObject* attrs = PyTuple_GET_ITEM((yyvsp[0]), 1);
- int error = 0;
- int cmp;
- char* fname;
- PyObject* tagname;
- CHECK_ERROR((tag == NULL || attrs == NULL), finish_start_end);
- tagname = PyUnicode_AsEncodedString(tag, "ascii", "ignore");
- CHECK_ERROR((tagname == NULL), finish_start_end);
- cmp = PyObject_RichCompareBool(tag, u_meta, Py_EQ);
- CHECK_ERROR((cmp == -1), finish_start_end);
- if (cmp == 1) {
- /* set encoding */
- result = PyObject_CallFunction(set_encoding, "OO", ud->parser, attrs);
- CHECK_ERROR((result == NULL), finish_start_end);
- Py_CLEAR(result);
- }
- cmp = html_end_tag(tagname, ud->parser);
- CHECK_ERROR((cmp < 0), finish_start_end);
- fname = (cmp == 0 ? "start_element" : "start_end_element");
- if (PyObject_HasAttrString(ud->handler, fname) == 1) {
- callback = PyObject_GetAttrString(ud->handler, fname);
- CHECK_ERROR((!callback), finish_start_end);
- result = PyObject_CallFunction(callback, "OO", tag, attrs);
- CHECK_ERROR((!result), finish_start_end);
- Py_CLEAR(callback);
- Py_CLEAR(result);
- }
-finish_start_end:
- Py_XDECREF(callback);
- Py_XDECREF(result);
- Py_XDECREF(tag);
- Py_XDECREF(attrs);
- Py_DECREF((yyvsp[0]));
- if (error) {
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
- }
- SET_OLD_LINECOL;
-}
-#line 1514 "htmlparse.c" /* yacc.c:1646 */
- break;
-
- case 8:
-#line 305 "htmlparse.y" /* yacc.c:1646 */
- {
- /* parsed HTML end tag (eg. )
- $1 is a PyUnicode with the tag name */
- UserData* ud = yyget_extra(scanner);
- PyObject* callback = NULL;
- PyObject* result = NULL;
- int error = 0;
- int cmp;
- /* encode tagname in ASCII, ignoring any unknown chars */
- PyObject* tagname = PyUnicode_AsEncodedString((yyvsp[0]), "ascii", "ignore");
- if (tagname == NULL) {
- error = 1;
- goto finish_end;
- }
- cmp = html_end_tag(tagname, ud->parser);
- CHECK_ERROR((cmp < 0), finish_end);
- if (PyObject_HasAttrString(ud->handler, "end_element") == 1 && cmp > 0) {
- callback = PyObject_GetAttrString(ud->handler, "end_element");
- CHECK_ERROR((callback == NULL), finish_end);
- result = PyObject_CallFunction(callback, "O", (yyvsp[0]));
- CHECK_ERROR((result == NULL), finish_end);
- Py_CLEAR(callback);
- Py_CLEAR(result);
- }
-finish_end:
- Py_XDECREF(tagname);
- Py_XDECREF(callback);
- Py_XDECREF(result);
- Py_DECREF((yyvsp[0]));
- if (error) {
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
- }
- SET_OLD_LINECOL;
-}
-#line 1554 "htmlparse.c" /* yacc.c:1646 */
- break;
-
- case 9:
-#line 341 "htmlparse.y" /* yacc.c:1646 */
- {
- /* parsed HTML comment (eg. )
- $1 is a PyUnicode with the comment content */
- UserData* ud = yyget_extra(scanner);
- PyObject* callback = NULL;
- PyObject* result = NULL;
- int error = 0;
- CALLBACK(ud, "comment", "O", (yyvsp[0]), finish_comment);
-finish_comment:
- Py_XDECREF(callback);
- Py_XDECREF(result);
- Py_DECREF((yyvsp[0]));
- if (error) {
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
- }
- SET_OLD_LINECOL;
-}
-#line 1577 "htmlparse.c" /* yacc.c:1646 */
- break;
-
- case 10:
-#line 360 "htmlparse.y" /* yacc.c:1646 */
- {
- /* $1 is a PyUnicode */
- UserData* ud = yyget_extra(scanner);
- PyObject* callback = NULL;
- PyObject* result = NULL;
- int error = 0;
- CALLBACK(ud, "pi", "O", (yyvsp[0]), finish_pi);
-finish_pi:
- Py_XDECREF(callback);
- Py_XDECREF(result);
- Py_DECREF((yyvsp[0]));
- if (error) {
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
- }
- SET_OLD_LINECOL;
-}
-#line 1599 "htmlparse.c" /* yacc.c:1646 */
- break;
-
- case 11:
-#line 378 "htmlparse.y" /* yacc.c:1646 */
- {
- /* parsed HTML CDATA (eg. )
- $1 is a PyUnicode with the CDATA content */
- UserData* ud = yyget_extra(scanner);
- PyObject* callback = NULL;
- PyObject* result = NULL;
- int error = 0;
- CALLBACK(ud, "cdata", "O", (yyvsp[0]), finish_cdata);
-finish_cdata:
- Py_XDECREF(callback);
- Py_XDECREF(result);
- Py_DECREF((yyvsp[0]));
- if (error) {
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
- }
- SET_OLD_LINECOL;
-}
-#line 1622 "htmlparse.c" /* yacc.c:1646 */
- break;
-
- case 12:
-#line 397 "htmlparse.y" /* yacc.c:1646 */
- {
- /* parsed HTML doctype (eg. )
- $1 is a PyUnicode with the doctype content */
- UserData* ud = yyget_extra(scanner);
- PyObject* callback = NULL;
- PyObject* result = NULL;
- int error = 0;
- /* set encoding */
- result = PyObject_CallFunction(set_doctype, "OO", ud->parser, (yyvsp[0]));
- CHECK_ERROR((result == NULL), finish_doctype);
- Py_CLEAR(result);
- CALLBACK(ud, "doctype", "O", (yyvsp[0]), finish_doctype);
-finish_doctype:
- Py_XDECREF(callback);
- Py_XDECREF(result);
- Py_DECREF((yyvsp[0]));
- if (error) {
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
- }
- SET_OLD_LINECOL;
-}
-#line 1649 "htmlparse.c" /* yacc.c:1646 */
- break;
-
- case 13:
-#line 420 "htmlparse.y" /* yacc.c:1646 */
- {
- /* parsed HTML script content (plus end tag which is omitted)
- $1 is a PyUnicode with the script content */
- UserData* ud = yyget_extra(scanner);
- PyObject* callback = NULL;
- PyObject* result = NULL;
- int error = 0;
- PyObject* script = PyUnicode_DecodeASCII("script", 6, "ignore");
- CHECK_ERROR((script == NULL), finish_script);
- CALLBACK(ud, "characters", "O", (yyvsp[0]), finish_script);
- /* emit the omitted end tag */
- CALLBACK(ud, "end_element", "O", script, finish_script);
-finish_script:
- Py_XDECREF(callback);
- Py_XDECREF(script);
- Py_XDECREF(result);
- Py_DECREF((yyvsp[0]));
- if (error) {
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
- }
- SET_OLD_LINECOL;
-}
-#line 1677 "htmlparse.c" /* yacc.c:1646 */
- break;
-
- case 14:
-#line 444 "htmlparse.y" /* yacc.c:1646 */
- {
- /* parsed HTML style content (plus end tag which is omitted)
- $1 is a PyUnicode with the style content */
- UserData* ud = yyget_extra(scanner);
- PyObject* callback = NULL;
- PyObject* result = NULL;
- int error = 0;
- PyObject* style = PyUnicode_DecodeASCII("style", 5, "ignore");
- CHECK_ERROR((style == NULL), finish_style);
- CALLBACK(ud, "characters", "O", (yyvsp[0]), finish_style);
- /* emit the omitted end tag */
- CALLBACK(ud, "end_element", "O", style, finish_style);
-finish_style:
- Py_XDECREF(callback);
- Py_XDECREF(style);
- Py_XDECREF(result);
- Py_DECREF((yyvsp[0]));
- if (error) {
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
- }
- SET_OLD_LINECOL;
-}
-#line 1705 "htmlparse.c" /* yacc.c:1646 */
- break;
-
- case 15:
-#line 468 "htmlparse.y" /* yacc.c:1646 */
- {
- /* parsed HTML text data
- $1 is a PyUnicode with the text */
- /* Remember this is also called as a lexer fallback when no
- HTML structure element could be recognized. */
- UserData* ud = yyget_extra(scanner);
- PyObject* callback = NULL;
- PyObject* result = NULL;
- int error = 0;
- CALLBACK(ud, "characters", "O", (yyvsp[0]), finish_characters);
-finish_characters:
- Py_XDECREF(callback);
- Py_XDECREF(result);
- Py_DECREF((yyvsp[0]));
- if (error) {
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
- }
- SET_OLD_LINECOL;
-}
-#line 1730 "htmlparse.c" /* yacc.c:1646 */
- break;
-
-
-#line 1734 "htmlparse.c" /* yacc.c:1646 */
- default: break;
- }
- /* User semantic actions sometimes alter yychar, and that requires
- that yytoken be updated with the new translation. We take the
- approach of translating immediately before every use of yytoken.
- One alternative is translating here after every semantic action,
- but that translation would be missed if the semantic action invokes
- YYABORT, YYACCEPT, or YYERROR immediately after altering yychar or
- if it invokes YYBACKUP. In the case of YYABORT or YYACCEPT, an
- incorrect destructor might then be invoked immediately. In the
- case of YYERROR or YYBACKUP, subsequent parser actions might lead
- to an incorrect destructor call or verbose syntax error message
- before the lookahead is translated. */
- YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc);
-
- YYPOPSTACK (yylen);
- yylen = 0;
- YY_STACK_PRINT (yyss, yyssp);
-
- *++yyvsp = yyval;
-
- /* Now 'shift' the result of the reduction. Determine what state
- that goes to, based on the state we popped back to and the rule
- number reduced by. */
-
- yyn = yyr1[yyn];
-
- yystate = yypgoto[yyn - YYNTOKENS] + *yyssp;
- if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp)
- yystate = yytable[yystate];
- else
- yystate = yydefgoto[yyn - YYNTOKENS];
-
- goto yynewstate;
-
-
-/*--------------------------------------.
-| yyerrlab -- here on detecting error. |
-`--------------------------------------*/
-yyerrlab:
- /* Make sure we have latest lookahead translation. See comments at
- user semantic actions for why this is necessary. */
- yytoken = yychar == YYEMPTY ? YYEMPTY : YYTRANSLATE (yychar);
-
- /* If not already recovering from an error, report this error. */
- if (!yyerrstatus)
- {
- ++yynerrs;
-#if ! YYERROR_VERBOSE
- yyerror (scanner, YY_("syntax error"));
-#else
-# define YYSYNTAX_ERROR yysyntax_error (&yymsg_alloc, &yymsg, \
- yyssp, yytoken)
- {
- char const *yymsgp = YY_("syntax error");
- int yysyntax_error_status;
- yysyntax_error_status = YYSYNTAX_ERROR;
- if (yysyntax_error_status == 0)
- yymsgp = yymsg;
- else if (yysyntax_error_status == 1)
- {
- if (yymsg != yymsgbuf)
- YYSTACK_FREE (yymsg);
- yymsg = (char *) YYSTACK_ALLOC (yymsg_alloc);
- if (!yymsg)
- {
- yymsg = yymsgbuf;
- yymsg_alloc = sizeof yymsgbuf;
- yysyntax_error_status = 2;
- }
- else
- {
- yysyntax_error_status = YYSYNTAX_ERROR;
- yymsgp = yymsg;
- }
- }
- yyerror (scanner, yymsgp);
- if (yysyntax_error_status == 2)
- goto yyexhaustedlab;
- }
-# undef YYSYNTAX_ERROR
-#endif
- }
-
-
-
- if (yyerrstatus == 3)
- {
- /* If just tried and failed to reuse lookahead token after an
- error, discard it. */
-
- if (yychar <= YYEOF)
- {
- /* Return failure if at end of input. */
- if (yychar == YYEOF)
- YYABORT;
- }
- else
- {
- yydestruct ("Error: discarding",
- yytoken, &yylval, scanner);
- yychar = YYEMPTY;
- }
- }
-
- /* Else will try to reuse lookahead token after shifting the error
- token. */
- goto yyerrlab1;
-
-
-/*---------------------------------------------------.
-| yyerrorlab -- error raised explicitly by YYERROR. |
-`---------------------------------------------------*/
-yyerrorlab:
-
- /* Pacify compilers like GCC when the user code never invokes
- YYERROR and the label yyerrorlab therefore never appears in user
- code. */
- if (/*CONSTCOND*/ 0)
- goto yyerrorlab;
-
- /* Do not reclaim the symbols of the rule whose action triggered
- this YYERROR. */
- YYPOPSTACK (yylen);
- yylen = 0;
- YY_STACK_PRINT (yyss, yyssp);
- yystate = *yyssp;
- goto yyerrlab1;
-
-
-/*-------------------------------------------------------------.
-| yyerrlab1 -- common code for both syntax error and YYERROR. |
-`-------------------------------------------------------------*/
-yyerrlab1:
- yyerrstatus = 3; /* Each real token shifted decrements this. */
-
- for (;;)
- {
- yyn = yypact[yystate];
- if (!yypact_value_is_default (yyn))
- {
- yyn += YYTERROR;
- if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR)
- {
- yyn = yytable[yyn];
- if (0 < yyn)
- break;
- }
- }
-
- /* Pop the current state because it cannot handle the error token. */
- if (yyssp == yyss)
- YYABORT;
-
-
- yydestruct ("Error: popping",
- yystos[yystate], yyvsp, scanner);
- YYPOPSTACK (1);
- yystate = *yyssp;
- YY_STACK_PRINT (yyss, yyssp);
- }
-
- YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
- *++yyvsp = yylval;
- YY_IGNORE_MAYBE_UNINITIALIZED_END
-
-
- /* Shift the error token. */
- YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp);
-
- yystate = yyn;
- goto yynewstate;
-
-
-/*-------------------------------------.
-| yyacceptlab -- YYACCEPT comes here. |
-`-------------------------------------*/
-yyacceptlab:
- yyresult = 0;
- goto yyreturn;
-
-/*-----------------------------------.
-| yyabortlab -- YYABORT comes here. |
-`-----------------------------------*/
-yyabortlab:
- yyresult = 1;
- goto yyreturn;
-
-#if !defined yyoverflow || YYERROR_VERBOSE
-/*-------------------------------------------------.
-| yyexhaustedlab -- memory exhaustion comes here. |
-`-------------------------------------------------*/
-yyexhaustedlab:
- yyerror (scanner, YY_("memory exhausted"));
- yyresult = 2;
- /* Fall through. */
-#endif
-
-yyreturn:
- if (yychar != YYEMPTY)
- {
- /* Make sure we have latest lookahead translation. See comments at
- user semantic actions for why this is necessary. */
- yytoken = YYTRANSLATE (yychar);
- yydestruct ("Cleanup: discarding lookahead",
- yytoken, &yylval, scanner);
- }
- /* Do not reclaim the symbols of the rule whose action triggered
- this YYABORT or YYACCEPT. */
- YYPOPSTACK (yylen);
- YY_STACK_PRINT (yyss, yyssp);
- while (yyssp != yyss)
- {
- yydestruct ("Cleanup: popping",
- yystos[*yyssp], yyvsp, scanner);
- YYPOPSTACK (1);
- }
-#ifndef yyoverflow
- if (yyss != yyssa)
- YYSTACK_FREE (yyss);
-#endif
-#if YYERROR_VERBOSE
- if (yymsg != yymsgbuf)
- YYSTACK_FREE (yymsg);
-#endif
- return yyresult;
-}
-#line 490 "htmlparse.y" /* yacc.c:1906 */
-
-
-/* create parser object */
-static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds) {
- parser_object* self;
- if ((self = (parser_object*) type->tp_alloc(type, 0)) == NULL) {
- return NULL;
- }
- Py_INCREF(Py_None);
- self->handler = Py_None;
- /* reset userData */
- self->userData = PyMem_New(UserData, sizeof(UserData));
- if (self->userData == NULL) {
- Py_DECREF(self->handler);
- Py_DECREF(self);
- return NULL;
- }
- self->userData->handler = self->handler;
- self->userData->buf = NULL;
- CLEAR_BUF_DECREF(self, self->userData->buf);
- self->userData->nextpos = 0;
- self->userData->bufpos = 0;
- self->userData->pos = 0;
- self->userData->column = 1;
- self->userData->last_column = 1;
- self->userData->lineno = 1;
- self->userData->last_lineno = 1;
- self->userData->tmp_buf = NULL;
- CLEAR_BUF_DECREF(self, self->userData->tmp_buf);
- self->userData->tmp_tag = self->userData->tmp_attrname =
- self->userData->tmp_attrval = self->userData->tmp_attrs =
- self->userData->lexbuf = NULL;
- self->userData->resolve_entities = resolve_entities;
- self->userData->list_dict = list_dict;
- self->userData->exc_type = NULL;
- self->userData->exc_val = NULL;
- self->userData->exc_tb = NULL;
- self->scanner = NULL;
- if (htmllexInit(&(self->scanner), self->userData)!=0) {
- Py_DECREF(self->handler);
- Py_DECREF(self);
- return NULL;
- }
- self->encoding = PyBytes_FromString("iso8859-1");
- if (self->encoding == NULL) {
- Py_DECREF(self->handler);
- Py_DECREF(self);
- return NULL;
- }
- self->doctype = PyBytes_FromString("HTML");
- if (self->doctype == NULL) {
- Py_DECREF(self->encoding);
- Py_DECREF(self->handler);
- Py_DECREF(self);
- return NULL;
- }
- self->userData->parser = (PyObject*)self;
- return (PyObject*) self;
-}
-
-
-/* initialize parser object */
-static int parser_init (parser_object* self, PyObject* args, PyObject* kwds) {
- PyObject* handler = NULL;
- static char *kwlist[] = {"handler", NULL};
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwlist, &handler)) {
- return -1;
- }
- if (handler == NULL) {
- return 0;
- }
- Py_DECREF(self->handler);
- Py_INCREF(handler);
- self->handler = handler;
- self->userData->handler = self->handler;
- return 0;
-}
-
-
-/* traverse all used subobjects participating in reference cycles */
-static int parser_traverse (parser_object* self, visitproc visit, void* arg) {
- Py_VISIT(self->handler);
- return 0;
-}
-
-
-/* clear all used subobjects participating in reference cycles */
-static int parser_clear (parser_object* self) {
- self->userData->handler = NULL;
- Py_CLEAR(self->handler);
- return 0;
-}
-
-
-/* free all allocated resources of parser object */
-static void parser_dealloc (parser_object* self) {
- htmllexDestroy(self->scanner);
- parser_clear(self);
- self->userData->parser = NULL;
- Py_CLEAR(self->encoding);
- Py_CLEAR(self->doctype);
- PyMem_Del(self->userData->buf);
- PyMem_Del(self->userData->tmp_buf);
- PyMem_Del(self->userData);
- Py_TYPE(self)->tp_free((PyObject*)self);
-}
-
-
-/* feed a chunk of data to the parser */
-static PyObject* parser_feed (parser_object* self, PyObject* args) {
- /* set up the parse string */
- int slen = 0;
- char* s = NULL;
- if (!PyArg_ParseTuple(args, "t#", &s, &slen)) {
- PyErr_SetString(PyExc_TypeError, "string arg required");
- return NULL;
- }
- /* parse */
- if (htmllexStart(self->scanner, self->userData, s, slen)!=0) {
- PyErr_SetString(PyExc_MemoryError, "could not start scanner");
- return NULL;
- }
- if (yyparse(self->scanner)!=0) {
- if (self->userData->exc_type!=NULL) {
- /* note: we give away these objects, so don't decref */
- PyErr_Restore(self->userData->exc_type,
- self->userData->exc_val,
- self->userData->exc_tb);
- }
- htmllexStop(self->scanner, self->userData);
- return NULL;
- }
- if (htmllexStop(self->scanner, self->userData)!=0) {
- PyErr_SetString(PyExc_MemoryError, "could not stop scanner");
- return NULL;
- }
- Py_RETURN_NONE;
-}
-
-
-/* flush all parser buffers */
-static PyObject* parser_flush (parser_object* self, PyObject* args) {
- int res = 0;
- if (!PyArg_ParseTuple(args, "")) {
- PyErr_SetString(PyExc_TypeError, "no args required");
- return NULL;
- }
- /* reset parser variables */
- CLEAR_BUF(self->userData->tmp_buf);
- Py_CLEAR(self->userData->tmp_tag);
- Py_CLEAR(self->userData->tmp_attrs);
- Py_CLEAR(self->userData->tmp_attrval);
- Py_CLEAR(self->userData->tmp_attrname);
- self->userData->bufpos = 0;
- if (strlen(self->userData->buf)) {
- int error = 0;
- int i;
- PyObject* callback = NULL;
- PyObject* result = NULL;
- const char* enc;
- PyObject* s;
- /* set line, col */
- for (i=0; iuserData->buf); ++i) {
- if (self->userData->buf[i] == '\n') {
- ++(self->userData->lineno);
- self->userData->column = 1;
- }
- else ++(self->userData->column);
- }
- enc = PyBytes_AsString(self->encoding);
- s = PyUnicode_Decode(self->userData->buf,
- (Py_ssize_t)strlen(self->userData->buf), enc, "ignore");
- /* reset buffer */
- CLEAR_BUF(self->userData->buf);
- if (s == NULL) { error = 1; goto finish_flush; }
- if (PyObject_HasAttrString(self->handler, "characters") == 1) {
- callback = PyObject_GetAttrString(self->handler, "characters");
- if (callback == NULL) { error = 1; goto finish_flush; }
- result = PyObject_CallFunction(callback, "O", s);
- if (result == NULL) { error = 1; goto finish_flush; }
- }
- finish_flush:
- Py_XDECREF(callback);
- Py_XDECREF(result);
- Py_XDECREF(s);
- if (error == 1) {
- return NULL;
- }
- }
- if (htmllexDestroy(self->scanner)!=0) {
- PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data");
- return NULL;
- }
- self->scanner = NULL;
- if (htmllexInit(&(self->scanner), self->userData)!=0) {
- PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data");
- return NULL;
- }
- return Py_BuildValue("i", res);
-}
-
-
-/* return the current parser line number */
-static PyObject* parser_lineno (parser_object* self, PyObject* args) {
- if (!PyArg_ParseTuple(args, "")) {
- PyErr_SetString(PyExc_TypeError, "no args required");
- return NULL;
- }
- return Py_BuildValue("i", self->userData->lineno);
-}
-
-
-/* return the last parser line number */
-static PyObject* parser_last_lineno (parser_object* self, PyObject* args) {
- if (!PyArg_ParseTuple(args, "")) {
- PyErr_SetString(PyExc_TypeError, "no args required");
- return NULL;
- }
- return Py_BuildValue("i", self->userData->last_lineno);
-}
-
-
-/* return the current parser column number */
-static PyObject* parser_column (parser_object* self, PyObject* args) {
- if (!PyArg_ParseTuple(args, "")) {
- PyErr_SetString(PyExc_TypeError, "no args required");
- return NULL;
- }
- return Py_BuildValue("i", self->userData->column);
-}
-
-
-/* return the last parser column number */
-static PyObject* parser_last_column (parser_object* self, PyObject* args) {
- if (!PyArg_ParseTuple(args, "")) {
- PyErr_SetString(PyExc_TypeError, "no args required");
- return NULL;
- }
- return Py_BuildValue("i", self->userData->last_column);
-}
-
-
-/* return the parser position in data stream */
-static PyObject* parser_pos (parser_object* self, PyObject* args) {
- if (!PyArg_ParseTuple(args, "")) {
- PyErr_SetString(PyExc_TypeError, "no args required");
- return NULL;
- }
- return Py_BuildValue("i", self->userData->pos);
-}
-
-
-/* return buffered parser data up to given length */
-static PyObject* parser_peek (parser_object* self, PyObject* args) {
- Py_ssize_t len, buflen;
- if (!PyArg_ParseTuple(args, "n", &len)) {
- return NULL;
- }
- if (len < 0) {
- PyErr_SetString(PyExc_TypeError, "peek length must not be negative");
- return NULL;
- }
- buflen = strlen(self->userData->buf);
- if (!buflen || self->userData->bufpos >= buflen) {
- return PyBytes_FromString("");
- }
- if (self->userData->bufpos + len >= buflen) {
- len = buflen - self->userData->bufpos - 1;
- }
- return PyBytes_FromStringAndSize(self->userData->buf + self->userData->bufpos, len);
-}
-
-
-/* reset the parser. This will erase all buffered data! */
-static PyObject* parser_reset (parser_object* self, PyObject* args) {
- if (!PyArg_ParseTuple(args, "")) {
- PyErr_SetString(PyExc_TypeError, "no args required");
- return NULL;
- }
- if (htmllexDestroy(self->scanner)!=0) {
- PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data");
- return NULL;
- }
- /* reset buffer */
- CLEAR_BUF(self->userData->buf);
- CLEAR_BUF(self->userData->tmp_buf);
- self->userData->bufpos =
- self->userData->pos =
- self->userData->nextpos = 0;
- self->userData->column =
- self->userData->last_column =
- self->userData->lineno =
- self->userData->last_lineno = 1;
- self->userData->tmp_tag = self->userData->tmp_attrs =
- self->userData->tmp_attrval = self->userData->tmp_attrname = NULL;
- self->scanner = NULL;
- if (htmllexInit(&(self->scanner), self->userData)!=0) {
- PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data");
- return NULL;
- }
- Py_RETURN_NONE;
-}
-
-
-/* set the debug level, if its >0, debugging is on, =0 means off */
-static PyObject* parser_debug (parser_object* self, PyObject* args) {
- int debug;
- if (!PyArg_ParseTuple(args, "i", &debug)) {
- return NULL;
- }
- yydebug = debug;
- debug = htmllexDebug(&(self->scanner), debug);
- return PyInt_FromLong((long)debug);
-}
-
-
-/* get SAX handler object */
-static PyObject* parser_gethandler (parser_object* self, void* closure) {
- Py_INCREF(self->handler);
- return self->handler;
-}
-
-
-/* set SAX handler object */
-static int parser_sethandler (parser_object* self, PyObject* value, void* closure) {
- if (value == NULL) {
- PyErr_SetString(PyExc_TypeError, "Cannot delete parser handler");
- return -1;
- }
- Py_DECREF(self->handler);
- Py_INCREF(value);
- self->handler = value;
- self->userData->handler = value;
- return 0;
-}
-
-
-/* get parser encoding */
-static PyObject* parser_getencoding (parser_object* self, void* closure) {
- Py_INCREF(self->encoding);
- return self->encoding;
-}
-
-
-/* set parser encoding */
-static int parser_setencoding (parser_object* self, PyObject* value, void* closure) {
- if (value == NULL) {
- PyErr_SetString(PyExc_TypeError, "Cannot delete encoding");
- return -1;
- }
- if (!PyBytes_CheckExact(value)) {
- PyErr_SetString(PyExc_TypeError, "encoding must be string");
- return -1;
- }
- Py_DECREF(self->encoding);
- Py_INCREF(value);
- self->encoding = value;
- if (yydebug > 0) {
- /* print debug message */
- PyObject* repr = PyObject_Repr(value);
- if (repr == NULL) {
- return -1;
- }
- fprintf(stderr, "htmlsax: set encoding to %s\n", PyBytes_AsString(repr));
- Py_DECREF(repr);
- }
- return 0;
-}
-
-
-/* get parser doctype */
-static PyObject* parser_getdoctype (parser_object* self, void* closure) {
- Py_INCREF(self->doctype);
- return self->doctype;
-}
-
-
-/* set parser doctype */
-static int parser_setdoctype (parser_object* self, PyObject* value, void* closure) {
- if (value == NULL) {
- PyErr_SetString(PyExc_TypeError, "Cannot delete doctype");
- return -1;
- }
- if (!PyBytes_CheckExact(value)) {
- PyObject* repr = PyObject_Repr(value);
- char* cp = PyBytes_AsString(repr);
- if (NULL == cp)
- return -1;
- PyErr_Format(PyExc_TypeError, "doctype %s must be string", cp);
- return -1;
- }
- Py_DECREF(self->doctype);
- Py_INCREF(value);
- self->doctype = value;
- return 0;
-}
-
-
-/* type interface */
-
-static PyMemberDef parser_members[] = {
- {NULL} /* Sentinel */
-};
-
-static PyGetSetDef parser_getset[] = {
- {"handler", (getter)parser_gethandler, (setter)parser_sethandler,
- "handler object", NULL},
- {"encoding", (getter)parser_getencoding, (setter)parser_setencoding,
- "encoding", NULL},
- {"doctype", (getter)parser_getdoctype, (setter)parser_setdoctype,
- "doctype", NULL},
- {NULL} /* Sentinel */
-};
-
-static PyMethodDef parser_methods[] = {
- {"feed", (PyCFunction)parser_feed, METH_VARARGS, "feed data to parse incremental"},
- {"reset", (PyCFunction)parser_reset, METH_VARARGS, "reset the parser (no flushing)"},
- {"flush", (PyCFunction)parser_flush, METH_VARARGS, "flush parser buffers"},
- {"debug", (PyCFunction)parser_debug, METH_VARARGS, "set debug level"},
- {"lineno", (PyCFunction)parser_lineno, METH_VARARGS, "get the current line number"},
- {"last_lineno", (PyCFunction)parser_last_lineno, METH_VARARGS, "get the last line number"},
- {"column", (PyCFunction)parser_column, METH_VARARGS, "get the current column"},
- {"last_column", (PyCFunction)parser_last_column, METH_VARARGS, "get the last column"},
- {"pos", (PyCFunction)parser_pos, METH_VARARGS, "get the current scanner position"},
- {"peek", (PyCFunction)parser_peek, METH_VARARGS, "get up to given length of buffered data from current parse position"},
- {NULL} /* Sentinel */
-};
-
-
-static PyTypeObject parser_type = {
- PyVarObject_HEAD_INIT(NULL, 0)
- "linkcheck.HtmlParser.htmlsax.parser", /* tp_name */
- sizeof(parser_object), /* tp_size */
- 0, /* tp_itemsize */
- /* methods */
- (destructor)parser_dealloc, /* tp_dealloc */
- 0, /* tp_print */
- 0, /* tp_getattr */
- 0, /* tp_setattr */
- 0, /* tp_compare */
- 0, /* tp_repr */
- 0, /* tp_as_number */
- 0, /* tp_as_sequence */
- 0, /* tp_as_mapping */
- 0, /* tp_hash */
- 0, /* tp_call */
- 0, /* tp_str */
- 0, /* tp_getattro */
- 0, /* tp_setattro */
- 0, /* tp_as_buffer */
- Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
- Py_TPFLAGS_HAVE_GC, /* tp_flags */
- "HTML parser object", /* tp_doc */
- (traverseproc)parser_traverse, /* tp_traverse */
- (inquiry)parser_clear, /* tp_clear */
- 0, /* tp_richcompare */
- 0, /* tp_weaklistoffset */
- 0, /* tp_iter */
- 0, /* tp_iternext */
- parser_methods, /* tp_methods */
- parser_members, /* tp_members */
- parser_getset, /* tp_getset */
- 0, /* tp_base */
- 0, /* tp_dict */
- 0, /* tp_descr_get */
- 0, /* tp_descr_set */
- 0, /* tp_dictoffset */
- (initproc)parser_init, /* tp_init */
- 0, /* tp_alloc */
- parser_new, /* tp_new */
- 0, /* tp_free */
- 0, /* tp_is_gc */
- 0, /* tp_bases */
- 0, /* tp_mro */
- 0, /* tp_cache */
- 0, /* tp_subclasses */
- 0, /* tp_weaklist */
- 0, /* tp_del */
-};
-
-
-static PyMethodDef htmlsax_methods[] = {
- {NULL} /* Sentinel */
-};
-
-
-/* initialization of the htmlsax module */
-MOD_INIT(htmlsax) {
- PyObject* m = NULL;
- MOD_DEF(m, "htmlsax", "SAX HTML parser routines", htmlsax_methods);
- if (m == NULL) {
- return MOD_ERROR_VAL;
- }
- if (PyType_Ready(&parser_type) < 0) {
- return MOD_ERROR_VAL;
- }
- Py_INCREF(&parser_type);
- if (PyModule_AddObject(m, "parser", (PyObject*)&parser_type) == -1) {
- /* init error */
- PyErr_Print();
- }
- PyObject* h = NULL;
- if ((h = PyImport_ImportModule("linkcheck.HtmlParser")) == NULL) {
- return MOD_ERROR_VAL;
- }
- if ((resolve_entities = PyObject_GetAttrString(h, "resolve_entities")) == NULL) {
- Py_DECREF(h);
- return MOD_ERROR_VAL;
- }
- if ((set_encoding = PyObject_GetAttrString(h, "set_encoding")) == NULL) {
- Py_DECREF(resolve_entities);
- Py_DECREF(h);
- return MOD_ERROR_VAL;
- }
- if ((set_doctype = PyObject_GetAttrString(h, "set_doctype")) == NULL) {
- Py_DECREF(resolve_entities);
- Py_DECREF(set_encoding);
- Py_DECREF(h);
- return MOD_ERROR_VAL;
- }
- Py_DECREF(h);
- if ((u_meta = PyUnicode_FromStringAndSize("meta", 4)) == NULL) {
- return MOD_ERROR_VAL;
- }
- if ((h = PyImport_ImportModule("linkcheck.containers")) == NULL) {
- return MOD_ERROR_VAL;
- }
- if ((list_dict = PyObject_GetAttrString(h, "ListDict")) == NULL) {
- Py_DECREF(h);
- return MOD_ERROR_VAL;
- }
- Py_DECREF(h);
- return MOD_SUCCESS_VAL(m);
-}
diff --git a/linkcheck/HtmlParser/htmlparse.h b/linkcheck/HtmlParser/htmlparse.h
deleted file mode 100644
index 36af12cc..00000000
--- a/linkcheck/HtmlParser/htmlparse.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* A Bison parser, made by GNU Bison 3.0.4. */
-
-/* Bison interface for Yacc-like parsers in C
-
- Copyright (C) 1984, 1989-1990, 2000-2015 Free Software Foundation, Inc.
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>. */
-
-/* As a special exception, you may create a larger work that contains
- part or all of the Bison parser skeleton and distribute that work
- under terms of your choice, so long as that work isn't itself a
- parser generator using the skeleton or a modified version thereof
- as a parser skeleton. Alternatively, if you modify or redistribute
- the parser skeleton itself, you may (at your option) remove this
- special exception, which will cause the skeleton and the resulting
- Bison output files to be licensed under the GNU General Public
- License without this special exception.
-
- This special exception was added by the Free Software Foundation in
- version 2.2 of Bison. */
-
-#ifndef YY_YY_HTMLPARSE_H_INCLUDED
-# define YY_YY_HTMLPARSE_H_INCLUDED
-/* Debug traces. */
-#ifndef YYDEBUG
-# define YYDEBUG 1
-#endif
-#if YYDEBUG
-extern int yydebug;
-#endif
-
-/* Token type. */
-#ifndef YYTOKENTYPE
-# define YYTOKENTYPE
- enum yytokentype
- {
- T_WAIT = 258,
- T_ERROR = 259,
- T_TEXT = 260,
- T_ELEMENT_START = 261,
- T_ELEMENT_START_END = 262,
- T_ELEMENT_END = 263,
- T_SCRIPT = 264,
- T_STYLE = 265,
- T_PI = 266,
- T_COMMENT = 267,
- T_CDATA = 268,
- T_DOCTYPE = 269
- };
-#endif
-
-/* Value type. */
-#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
-typedef int YYSTYPE;
-# define YYSTYPE_IS_TRIVIAL 1
-# define YYSTYPE_IS_DECLARED 1
-#endif
-
-
-
-int yyparse (PyObject* scanner);
-
-#endif /* !YY_YY_HTMLPARSE_H_INCLUDED */
diff --git a/linkcheck/HtmlParser/htmlparse.y b/linkcheck/HtmlParser/htmlparse.y
deleted file mode 100644
index 4fec7b16..00000000
--- a/linkcheck/HtmlParser/htmlparse.y
+++ /dev/null
@@ -1,1023 +0,0 @@
-%{
-/* Copyright (C) 2000-2014 Bastian Kleineidam
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-*/
-/* A SAX HTML parser. Includes Python module definition to make it
- usable for Python programs.
-*/
-#include "htmlsax.h" /* SAX interface (includes Python.h) */
-#include "structmember.h" /* Python include for object definition */
-#include <string.h>
-#include <stdio.h>
-
-/* bison type definitions */
-#define YYSTYPE PyObject*
-/* extern functions found in htmllex.l */
-extern int yylex(YYSTYPE* yylvalp, void* scanner);
-extern int htmllexInit (void** scanner, UserData* data);
-extern int htmllexDebug (void** scanner, int debug);
-extern int htmllexStart (void* scanner, UserData* data, const char* s, int slen);
-extern int htmllexStop (void* scanner, UserData* data);
-extern int htmllexDestroy (void* scanner);
-extern UserData* yyget_extra(void* scanner);
-extern int yyget_lineno(void*);
-#define YYERROR_VERBOSE 1
-
-/* standard error reporting, indicating an internal error */
-static void yyerror (void *locp, char const *msg) {
- fprintf(stderr, "htmlsax: internal parse error: %s\n", msg);
-}
-
-/* Python 2/3 compatibility */
-#if PY_MAJOR_VERSION >= 3
- #define MOD_ERROR_VAL NULL
- #define MOD_SUCCESS_VAL(val) val
- #define MOD_INIT(name) PyMODINIT_FUNC PyInit_##name(void)
- #define MOD_DEF(ob, name, doc, methods) \
- static struct PyModuleDef moduledef = { \
- PyModuleDef_HEAD_INIT, name, doc, -1, methods, }; \
- ob = PyModule_Create(&moduledef)
- #define PyInt_FromLong PyLong_FromLong
-#else
- #define MOD_ERROR_VAL
- #define MOD_SUCCESS_VAL(val)
- #define MOD_INIT(name) void init##name(void)
- #define MOD_DEF(ob, name, doc, methods) \
- ob = Py_InitModule3(name, methods, doc)
-#endif
-
-
-/* existing Python methods */
-
-/* parser.resolve_entities */
-static PyObject* resolve_entities;
-/* ListDict class, sorted dictionary */
-static PyObject* list_dict;
-/* set_encoding helper function */
-static PyObject* set_encoding;
-/* set_doctype helper function */
-static PyObject* set_doctype;
-/* the unicode string u'meta' */
-static PyObject* u_meta;
-
-/* macros for easier scanner state manipulation */
-
-/* clear buffer b, returning NULL on error */
-#define CLEAR_BUF(b) \
- PyMem_Resize(b, char, 1); \
- if (b == NULL) return NULL; \
- (b)[0] = '\0'
-
-/* clear buffer b, returning NULL and decref self on error */
-#define CLEAR_BUF_DECREF(self, b) \
- PyMem_Resize(b, char, 1); \
- if (b == NULL) { Py_DECREF(self); return NULL; } \
- (b)[0] = '\0'
-
-/* check an error condition and if true set error flag and goto given label */
-#define CHECK_ERROR(cond, label) \
- if (cond) { \
- error = 1; \
- goto label; \
- }
-
-/* generic Python callback macro */
-#define CALLBACK(ud, attr, format, arg, label) \
- if (PyObject_HasAttrString(ud->handler, attr) == 1) { \
- callback = PyObject_GetAttrString(ud->handler, attr); \
- CHECK_ERROR((callback == NULL), label); \
- result = PyObject_CallFunction(callback, format, arg); \
- CHECK_ERROR((result == NULL), label); \
- Py_CLEAR(callback); \
- Py_CLEAR(result); \
- }
-
-/* set old line and column */
-#define SET_OLD_LINECOL \
- ud->last_lineno = ud->lineno; \
- ud->last_column = ud->column
-
-/* parser type definition */
-typedef struct {
- PyObject_HEAD
- /* the handler object */
- PyObject* handler;
- /* the charset encoding (PyBytesObject) */
- PyObject* encoding;
- /* the document type (PyBytesObject) */
- PyObject* doctype;
- UserData* userData;
- void* scanner;
-} parser_object;
-
-/* use Pythons memory management */
-#define YYMALLOC PyMem_Malloc
-#define YYFREE PyMem_Free
-
-/* Test whether tag does not need an HTML end tag.
- @ptag: ASCII encoded Python string in lowercase (!)
- @parser: SAX parser object
- @return: < 0 on error, > 0 if HTML end tag is needed, else 0
-*/
-static int html_end_tag (PyObject* ptag, PyObject* parser) {
- PyObject* pdoctype = NULL;
- char* doctype;
- int error = 0;
- int ret = 1;
- pdoctype = PyObject_GetAttrString(parser, "doctype");
- CHECK_ERROR((pdoctype == NULL), finish_html_end_tag);
- doctype = PyBytes_AsString(pdoctype);
- CHECK_ERROR((doctype == NULL), finish_html_end_tag);
- /* check for HTML (else it's presumably XHTML) */
- if (strcmp(doctype, "HTML") == 0) {
- char* tag = PyBytes_AsString(ptag);
- CHECK_ERROR((tag == NULL), finish_html_end_tag);
- ret = strcmp(tag, "area")!=0 &&
- strcmp(tag, "base")!=0 &&
- strcmp(tag, "basefont")!=0 &&
- strcmp(tag, "br")!=0 &&
- strcmp(tag, "col")!=0 &&
- strcmp(tag, "frame")!=0 &&
- strcmp(tag, "hr")!=0 &&
- strcmp(tag, "img")!=0 &&
- strcmp(tag, "input")!=0 &&
- strcmp(tag, "isindex")!=0 &&
- strcmp(tag, "link")!=0 &&
- strcmp(tag, "meta")!=0 &&
- strcmp(tag, "param")!=0;
- }
-finish_html_end_tag:
- Py_XDECREF(pdoctype);
- if (error) {
- return -1;
- }
- return ret;
-}
-
-%}
-
-/* parser options */
-%verbose
-%debug
-%defines
-%pure-parser
-%param {PyObject* scanner}
-
-/* parser tokens, see below for what they mean */
-%token T_WAIT
-%token T_ERROR
-%token T_TEXT
-%token T_ELEMENT_START
-%token T_ELEMENT_START_END
-%token T_ELEMENT_END
-%token T_SCRIPT
-%token T_STYLE
-%token T_PI
-%token T_COMMENT
-%token T_CDATA
-%token T_DOCTYPE
-
-/* note: the finish_ labels are for error recovery */
-%%
-
-elements: element {
- /* parse a single element */
-}
-| elements element {
- /* parse a list of elements */
-}
-;
-
-element: T_WAIT {
- /* wait for more lexer input */
- YYACCEPT;
-}
-| T_ERROR
-{
- /* an error occured in the scanner, the python exception must be set */
- UserData* ud = yyget_extra(scanner);
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
-}
-| T_ELEMENT_START
-{
- /* parsed HTML start tag (eg. <a href="...">)
- $1 is a PyTuple (<tag>, <attrs>)
- <tag> is a PyObject, <attrs> is a ListDict */
- UserData* ud = yyget_extra(scanner);
- PyObject* callback = NULL;
- PyObject* result = NULL;
- PyObject* tag = PyTuple_GET_ITEM($1, 0);
- PyObject* attrs = PyTuple_GET_ITEM($1, 1);
- int error = 0;
- int cmp;
- CHECK_ERROR((tag == NULL || attrs == NULL), finish_start);
- cmp = PyObject_RichCompareBool(tag, u_meta, Py_EQ);
- CHECK_ERROR((cmp == -1), finish_start);
- if (cmp == 1) {
- /* set encoding */
- result = PyObject_CallFunction(set_encoding, "OO", ud->parser, attrs);
- CHECK_ERROR((result == NULL), finish_start);
- Py_CLEAR(result);
- }
- if (PyObject_HasAttrString(ud->handler, "start_element") == 1) {
- callback = PyObject_GetAttrString(ud->handler, "start_element");
- CHECK_ERROR((!callback), finish_start);
- result = PyObject_CallFunction(callback, "OO", tag, attrs);
- CHECK_ERROR((!result), finish_start);
- Py_CLEAR(callback);
- Py_CLEAR(result);
- }
-finish_start:
- Py_XDECREF(callback);
- Py_XDECREF(result);
- Py_XDECREF(tag);
- Py_XDECREF(attrs);
- Py_DECREF($1);
- if (error) {
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
- }
- SET_OLD_LINECOL;
-}
-| T_ELEMENT_START_END
-{
- /* parsed HTML start-end tag (eg. <br/>)
- $1 is a PyTuple (<tag>, <attrs>)
- <tag> is a PyObject, <attrs> is a ListDict */
- UserData* ud = yyget_extra(scanner);
- PyObject* callback = NULL;
- PyObject* result = NULL;
- PyObject* tag = PyTuple_GET_ITEM($1, 0);
- PyObject* attrs = PyTuple_GET_ITEM($1, 1);
- int error = 0;
- int cmp;
- char* fname;
- PyObject* tagname;
- CHECK_ERROR((tag == NULL || attrs == NULL), finish_start_end);
- tagname = PyUnicode_AsEncodedString(tag, "ascii", "ignore");
- CHECK_ERROR((tagname == NULL), finish_start_end);
- cmp = PyObject_RichCompareBool(tag, u_meta, Py_EQ);
- CHECK_ERROR((cmp == -1), finish_start_end);
- if (cmp == 1) {
- /* set encoding */
- result = PyObject_CallFunction(set_encoding, "OO", ud->parser, attrs);
- CHECK_ERROR((result == NULL), finish_start_end);
- Py_CLEAR(result);
- }
- cmp = html_end_tag(tagname, ud->parser);
- CHECK_ERROR((cmp < 0), finish_start_end);
- fname = (cmp == 0 ? "start_element" : "start_end_element");
- if (PyObject_HasAttrString(ud->handler, fname) == 1) {
- callback = PyObject_GetAttrString(ud->handler, fname);
- CHECK_ERROR((!callback), finish_start_end);
- result = PyObject_CallFunction(callback, "OO", tag, attrs);
- CHECK_ERROR((!result), finish_start_end);
- Py_CLEAR(callback);
- Py_CLEAR(result);
- }
-finish_start_end:
- Py_XDECREF(callback);
- Py_XDECREF(result);
- Py_XDECREF(tag);
- Py_XDECREF(attrs);
- Py_DECREF($1);
- if (error) {
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
- }
- SET_OLD_LINECOL;
-}
-| T_ELEMENT_END
-{
- /* parsed HTML end tag (eg. </a>)
- $1 is a PyUnicode with the tag name */
- UserData* ud = yyget_extra(scanner);
- PyObject* callback = NULL;
- PyObject* result = NULL;
- int error = 0;
- int cmp;
- /* encode tagname in ASCII, ignoring any unknown chars */
- PyObject* tagname = PyUnicode_AsEncodedString($1, "ascii", "ignore");
- if (tagname == NULL) {
- error = 1;
- goto finish_end;
- }
- cmp = html_end_tag(tagname, ud->parser);
- CHECK_ERROR((cmp < 0), finish_end);
- if (PyObject_HasAttrString(ud->handler, "end_element") == 1 && cmp > 0) {
- callback = PyObject_GetAttrString(ud->handler, "end_element");
- CHECK_ERROR((callback == NULL), finish_end);
- result = PyObject_CallFunction(callback, "O", $1);
- CHECK_ERROR((result == NULL), finish_end);
- Py_CLEAR(callback);
- Py_CLEAR(result);
- }
-finish_end:
- Py_XDECREF(tagname);
- Py_XDECREF(callback);
- Py_XDECREF(result);
- Py_DECREF($1);
- if (error) {
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
- }
- SET_OLD_LINECOL;
-}
-| T_COMMENT
-{
- /* parsed HTML comment (eg. <!-- comment -->)
- $1 is a PyUnicode with the comment content */
- UserData* ud = yyget_extra(scanner);
- PyObject* callback = NULL;
- PyObject* result = NULL;
- int error = 0;
- CALLBACK(ud, "comment", "O", $1, finish_comment);
-finish_comment:
- Py_XDECREF(callback);
- Py_XDECREF(result);
- Py_DECREF($1);
- if (error) {
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
- }
- SET_OLD_LINECOL;
-}
-| T_PI
-{
- /* $1 is a PyUnicode */
- UserData* ud = yyget_extra(scanner);
- PyObject* callback = NULL;
- PyObject* result = NULL;
- int error = 0;
- CALLBACK(ud, "pi", "O", $1, finish_pi);
-finish_pi:
- Py_XDECREF(callback);
- Py_XDECREF(result);
- Py_DECREF($1);
- if (error) {
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
- }
- SET_OLD_LINECOL;
-}
-| T_CDATA
-{
- /* parsed HTML CDATA (eg. <![CDATA[ ... ]]>)
- $1 is a PyUnicode with the CDATA content */
- UserData* ud = yyget_extra(scanner);
- PyObject* callback = NULL;
- PyObject* result = NULL;
- int error = 0;
- CALLBACK(ud, "cdata", "O", $1, finish_cdata);
-finish_cdata:
- Py_XDECREF(callback);
- Py_XDECREF(result);
- Py_DECREF($1);
- if (error) {
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
- }
- SET_OLD_LINECOL;
-}
-| T_DOCTYPE
-{
- /* parsed HTML doctype (eg. <!DOCTYPE html>)
- $1 is a PyUnicode with the doctype content */
- UserData* ud = yyget_extra(scanner);
- PyObject* callback = NULL;
- PyObject* result = NULL;
- int error = 0;
- /* set encoding */
- result = PyObject_CallFunction(set_doctype, "OO", ud->parser, $1);
- CHECK_ERROR((result == NULL), finish_doctype);
- Py_CLEAR(result);
- CALLBACK(ud, "doctype", "O", $1, finish_doctype);
-finish_doctype:
- Py_XDECREF(callback);
- Py_XDECREF(result);
- Py_DECREF($1);
- if (error) {
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
- }
- SET_OLD_LINECOL;
-}
-| T_SCRIPT
-{
- /* parsed HTML script content (plus end tag which is omitted)
- $1 is a PyUnicode with the script content */
- UserData* ud = yyget_extra(scanner);
- PyObject* callback = NULL;
- PyObject* result = NULL;
- int error = 0;
- PyObject* script = PyUnicode_DecodeASCII("script", 6, "ignore");
- CHECK_ERROR((script == NULL), finish_script);
- CALLBACK(ud, "characters", "O", $1, finish_script);
- /* emit the omitted end tag */
- CALLBACK(ud, "end_element", "O", script, finish_script);
-finish_script:
- Py_XDECREF(callback);
- Py_XDECREF(script);
- Py_XDECREF(result);
- Py_DECREF($1);
- if (error) {
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
- }
- SET_OLD_LINECOL;
-}
-| T_STYLE
-{
- /* parsed HTML style content (plus end tag which is omitted)
- $1 is a PyUnicode with the style content */
- UserData* ud = yyget_extra(scanner);
- PyObject* callback = NULL;
- PyObject* result = NULL;
- int error = 0;
- PyObject* style = PyUnicode_DecodeASCII("style", 5, "ignore");
- CHECK_ERROR((style == NULL), finish_style);
- CALLBACK(ud, "characters", "O", $1, finish_style);
- /* emit the omitted end tag */
- CALLBACK(ud, "end_element", "O", style, finish_style);
-finish_style:
- Py_XDECREF(callback);
- Py_XDECREF(style);
- Py_XDECREF(result);
- Py_DECREF($1);
- if (error) {
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
- }
- SET_OLD_LINECOL;
-}
-| T_TEXT
-{
- /* parsed HTML text data
- $1 is a PyUnicode with the text */
- /* Remember this is also called as a lexer fallback when no
- HTML structure element could be recognized. */
- UserData* ud = yyget_extra(scanner);
- PyObject* callback = NULL;
- PyObject* result = NULL;
- int error = 0;
- CALLBACK(ud, "characters", "O", $1, finish_characters);
-finish_characters:
- Py_XDECREF(callback);
- Py_XDECREF(result);
- Py_DECREF($1);
- if (error) {
- PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
- YYABORT;
- }
- SET_OLD_LINECOL;
-}
-;
-
-%%
-
-/* create parser object */
-static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds) {
- parser_object* self;
- if ((self = (parser_object*) type->tp_alloc(type, 0)) == NULL) {
- return NULL;
- }
- Py_INCREF(Py_None);
- self->handler = Py_None;
- /* reset userData */
- self->userData = PyMem_New(UserData, sizeof(UserData));
- if (self->userData == NULL) {
- Py_DECREF(self->handler);
- Py_DECREF(self);
- return NULL;
- }
- self->userData->handler = self->handler;
- self->userData->buf = NULL;
- CLEAR_BUF_DECREF(self, self->userData->buf);
- self->userData->nextpos = 0;
- self->userData->bufpos = 0;
- self->userData->pos = 0;
- self->userData->column = 1;
- self->userData->last_column = 1;
- self->userData->lineno = 1;
- self->userData->last_lineno = 1;
- self->userData->tmp_buf = NULL;
- CLEAR_BUF_DECREF(self, self->userData->tmp_buf);
- self->userData->tmp_tag = self->userData->tmp_attrname =
- self->userData->tmp_attrval = self->userData->tmp_attrs =
- self->userData->lexbuf = NULL;
- self->userData->resolve_entities = resolve_entities;
- self->userData->list_dict = list_dict;
- self->userData->exc_type = NULL;
- self->userData->exc_val = NULL;
- self->userData->exc_tb = NULL;
- self->scanner = NULL;
- if (htmllexInit(&(self->scanner), self->userData)!=0) {
- Py_DECREF(self->handler);
- Py_DECREF(self);
- return NULL;
- }
- self->encoding = PyBytes_FromString("iso8859-1");
- if (self->encoding == NULL) {
- Py_DECREF(self->handler);
- Py_DECREF(self);
- return NULL;
- }
- self->doctype = PyBytes_FromString("HTML");
- if (self->doctype == NULL) {
- Py_DECREF(self->encoding);
- Py_DECREF(self->handler);
- Py_DECREF(self);
- return NULL;
- }
- self->userData->parser = (PyObject*)self;
- return (PyObject*) self;
-}
-
-
-/* initialize parser object */
-static int parser_init (parser_object* self, PyObject* args, PyObject* kwds) {
- PyObject* handler = NULL;
- static char *kwlist[] = {"handler", NULL};
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwlist, &handler)) {
- return -1;
- }
- if (handler == NULL) {
- return 0;
- }
- Py_DECREF(self->handler);
- Py_INCREF(handler);
- self->handler = handler;
- self->userData->handler = self->handler;
- return 0;
-}
-
-
-/* traverse all used subobjects participating in reference cycles */
-static int parser_traverse (parser_object* self, visitproc visit, void* arg) {
- Py_VISIT(self->handler);
- return 0;
-}
-
-
-/* clear all used subobjects participating in reference cycles */
-static int parser_clear (parser_object* self) {
- self->userData->handler = NULL;
- Py_CLEAR(self->handler);
- return 0;
-}
-
-
-/* free all allocated resources of parser object */
-static void parser_dealloc (parser_object* self) {
- htmllexDestroy(self->scanner);
- parser_clear(self);
- self->userData->parser = NULL;
- Py_CLEAR(self->encoding);
- Py_CLEAR(self->doctype);
- PyMem_Del(self->userData->buf);
- PyMem_Del(self->userData->tmp_buf);
- PyMem_Del(self->userData);
- Py_TYPE(self)->tp_free((PyObject*)self);
-}
-
-
-/* feed a chunk of data to the parser */
-static PyObject* parser_feed (parser_object* self, PyObject* args) {
- /* set up the parse string */
- int slen = 0;
- char* s = NULL;
- if (!PyArg_ParseTuple(args, "t#", &s, &slen)) {
- PyErr_SetString(PyExc_TypeError, "string arg required");
- return NULL;
- }
- /* parse */
- if (htmllexStart(self->scanner, self->userData, s, slen)!=0) {
- PyErr_SetString(PyExc_MemoryError, "could not start scanner");
- return NULL;
- }
- if (yyparse(self->scanner)!=0) {
- if (self->userData->exc_type!=NULL) {
- /* note: we give away these objects, so don't decref */
- PyErr_Restore(self->userData->exc_type,
- self->userData->exc_val,
- self->userData->exc_tb);
- }
- htmllexStop(self->scanner, self->userData);
- return NULL;
- }
- if (htmllexStop(self->scanner, self->userData)!=0) {
- PyErr_SetString(PyExc_MemoryError, "could not stop scanner");
- return NULL;
- }
- Py_RETURN_NONE;
-}
-
-
-/* flush all parser buffers */
-static PyObject* parser_flush (parser_object* self, PyObject* args) {
- int res = 0;
- if (!PyArg_ParseTuple(args, "")) {
- PyErr_SetString(PyExc_TypeError, "no args required");
- return NULL;
- }
- /* reset parser variables */
- CLEAR_BUF(self->userData->tmp_buf);
- Py_CLEAR(self->userData->tmp_tag);
- Py_CLEAR(self->userData->tmp_attrs);
- Py_CLEAR(self->userData->tmp_attrval);
- Py_CLEAR(self->userData->tmp_attrname);
- self->userData->bufpos = 0;
- if (strlen(self->userData->buf)) {
- int error = 0;
- int i;
- PyObject* callback = NULL;
- PyObject* result = NULL;
- const char* enc;
- PyObject* s;
- /* set line, col */
- for (i=0; i<strlen(self->userData->buf); ++i) {
- if (self->userData->buf[i] == '\n') {
- ++(self->userData->lineno);
- self->userData->column = 1;
- }
- else ++(self->userData->column);
- }
- enc = PyBytes_AsString(self->encoding);
- s = PyUnicode_Decode(self->userData->buf,
- (Py_ssize_t)strlen(self->userData->buf), enc, "ignore");
- /* reset buffer */
- CLEAR_BUF(self->userData->buf);
- if (s == NULL) { error = 1; goto finish_flush; }
- if (PyObject_HasAttrString(self->handler, "characters") == 1) {
- callback = PyObject_GetAttrString(self->handler, "characters");
- if (callback == NULL) { error = 1; goto finish_flush; }
- result = PyObject_CallFunction(callback, "O", s);
- if (result == NULL) { error = 1; goto finish_flush; }
- }
- finish_flush:
- Py_XDECREF(callback);
- Py_XDECREF(result);
- Py_XDECREF(s);
- if (error == 1) {
- return NULL;
- }
- }
- if (htmllexDestroy(self->scanner)!=0) {
- PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data");
- return NULL;
- }
- self->scanner = NULL;
- if (htmllexInit(&(self->scanner), self->userData)!=0) {
- PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data");
- return NULL;
- }
- return Py_BuildValue("i", res);
-}
-
-
-/* return the current parser line number */
-static PyObject* parser_lineno (parser_object* self, PyObject* args) {
- if (!PyArg_ParseTuple(args, "")) {
- PyErr_SetString(PyExc_TypeError, "no args required");
- return NULL;
- }
- return Py_BuildValue("i", self->userData->lineno);
-}
-
-
-/* return the last parser line number */
-static PyObject* parser_last_lineno (parser_object* self, PyObject* args) {
- if (!PyArg_ParseTuple(args, "")) {
- PyErr_SetString(PyExc_TypeError, "no args required");
- return NULL;
- }
- return Py_BuildValue("i", self->userData->last_lineno);
-}
-
-
-/* return the current parser column number */
-static PyObject* parser_column (parser_object* self, PyObject* args) {
- if (!PyArg_ParseTuple(args, "")) {
- PyErr_SetString(PyExc_TypeError, "no args required");
- return NULL;
- }
- return Py_BuildValue("i", self->userData->column);
-}
-
-
-/* return the last parser column number */
-static PyObject* parser_last_column (parser_object* self, PyObject* args) {
- if (!PyArg_ParseTuple(args, "")) {
- PyErr_SetString(PyExc_TypeError, "no args required");
- return NULL;
- }
- return Py_BuildValue("i", self->userData->last_column);
-}
-
-
-/* return the parser position in data stream */
-static PyObject* parser_pos (parser_object* self, PyObject* args) {
- if (!PyArg_ParseTuple(args, "")) {
- PyErr_SetString(PyExc_TypeError, "no args required");
- return NULL;
- }
- return Py_BuildValue("i", self->userData->pos);
-}
-
-
-/* return buffered parser data up to given length */
-static PyObject* parser_peek (parser_object* self, PyObject* args) {
- Py_ssize_t len, buflen;
- if (!PyArg_ParseTuple(args, "n", &len)) {
- return NULL;
- }
- if (len < 0) {
- PyErr_SetString(PyExc_TypeError, "peek length must not be negative");
- return NULL;
- }
- buflen = strlen(self->userData->buf);
- if (!buflen || self->userData->bufpos >= buflen) {
- return PyBytes_FromString("");
- }
- if (self->userData->bufpos + len >= buflen) {
- len = buflen - self->userData->bufpos - 1;
- }
- return PyBytes_FromStringAndSize(self->userData->buf + self->userData->bufpos, len);
-}
-
-
-/* reset the parser. This will erase all buffered data! */
-static PyObject* parser_reset (parser_object* self, PyObject* args) {
- if (!PyArg_ParseTuple(args, "")) {
- PyErr_SetString(PyExc_TypeError, "no args required");
- return NULL;
- }
- if (htmllexDestroy(self->scanner)!=0) {
- PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data");
- return NULL;
- }
- /* reset buffer */
- CLEAR_BUF(self->userData->buf);
- CLEAR_BUF(self->userData->tmp_buf);
- self->userData->bufpos =
- self->userData->pos =
- self->userData->nextpos = 0;
- self->userData->column =
- self->userData->last_column =
- self->userData->lineno =
- self->userData->last_lineno = 1;
- self->userData->tmp_tag = self->userData->tmp_attrs =
- self->userData->tmp_attrval = self->userData->tmp_attrname = NULL;
- self->scanner = NULL;
- if (htmllexInit(&(self->scanner), self->userData)!=0) {
- PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data");
- return NULL;
- }
- Py_RETURN_NONE;
-}
-
-
-/* set the debug level, if its >0, debugging is on, =0 means off */
-static PyObject* parser_debug (parser_object* self, PyObject* args) {
- int debug;
- if (!PyArg_ParseTuple(args, "i", &debug)) {
- return NULL;
- }
- yydebug = debug;
- debug = htmllexDebug(&(self->scanner), debug);
- return PyInt_FromLong((long)debug);
-}
-
-
-/* get SAX handler object */
-static PyObject* parser_gethandler (parser_object* self, void* closure) {
- Py_INCREF(self->handler);
- return self->handler;
-}
-
-
-/* set SAX handler object */
-static int parser_sethandler (parser_object* self, PyObject* value, void* closure) {
- if (value == NULL) {
- PyErr_SetString(PyExc_TypeError, "Cannot delete parser handler");
- return -1;
- }
- Py_DECREF(self->handler);
- Py_INCREF(value);
- self->handler = value;
- self->userData->handler = value;
- return 0;
-}
-
-
-/* get parser encoding */
-static PyObject* parser_getencoding (parser_object* self, void* closure) {
- Py_INCREF(self->encoding);
- return self->encoding;
-}
-
-
-/* set parser encoding */
-static int parser_setencoding (parser_object* self, PyObject* value, void* closure) {
- if (value == NULL) {
- PyErr_SetString(PyExc_TypeError, "Cannot delete encoding");
- return -1;
- }
- if (!PyBytes_CheckExact(value)) {
- PyErr_SetString(PyExc_TypeError, "encoding must be string");
- return -1;
- }
- Py_DECREF(self->encoding);
- Py_INCREF(value);
- self->encoding = value;
- if (yydebug > 0) {
- /* print debug message */
- PyObject* repr = PyObject_Repr(value);
- if (repr == NULL) {
- return -1;
- }
- fprintf(stderr, "htmlsax: set encoding to %s\n", PyBytes_AsString(repr));
- Py_DECREF(repr);
- }
- return 0;
-}
-
-
-/* get parser doctype */
-static PyObject* parser_getdoctype (parser_object* self, void* closure) {
- Py_INCREF(self->doctype);
- return self->doctype;
-}
-
-
-/* set parser doctype */
-static int parser_setdoctype (parser_object* self, PyObject* value, void* closure) {
- if (value == NULL) {
- PyErr_SetString(PyExc_TypeError, "Cannot delete doctype");
- return -1;
- }
- if (!PyBytes_CheckExact(value)) {
- PyObject* repr = PyObject_Repr(value);
- char* cp = PyBytes_AsString(repr);
- if (NULL == cp)
- return -1;
- PyErr_Format(PyExc_TypeError, "doctype %s must be string", cp);
- return -1;
- }
- Py_DECREF(self->doctype);
- Py_INCREF(value);
- self->doctype = value;
- return 0;
-}
-
-
-/* type interface */
-
-static PyMemberDef parser_members[] = {
- {NULL} /* Sentinel */
-};
-
-static PyGetSetDef parser_getset[] = {
- {"handler", (getter)parser_gethandler, (setter)parser_sethandler,
- "handler object", NULL},
- {"encoding", (getter)parser_getencoding, (setter)parser_setencoding,
- "encoding", NULL},
- {"doctype", (getter)parser_getdoctype, (setter)parser_setdoctype,
- "doctype", NULL},
- {NULL} /* Sentinel */
-};
-
-static PyMethodDef parser_methods[] = {
- {"feed", (PyCFunction)parser_feed, METH_VARARGS, "feed data to parse incremental"},
- {"reset", (PyCFunction)parser_reset, METH_VARARGS, "reset the parser (no flushing)"},
- {"flush", (PyCFunction)parser_flush, METH_VARARGS, "flush parser buffers"},
- {"debug", (PyCFunction)parser_debug, METH_VARARGS, "set debug level"},
- {"lineno", (PyCFunction)parser_lineno, METH_VARARGS, "get the current line number"},
- {"last_lineno", (PyCFunction)parser_last_lineno, METH_VARARGS, "get the last line number"},
- {"column", (PyCFunction)parser_column, METH_VARARGS, "get the current column"},
- {"last_column", (PyCFunction)parser_last_column, METH_VARARGS, "get the last column"},
- {"pos", (PyCFunction)parser_pos, METH_VARARGS, "get the current scanner position"},
- {"peek", (PyCFunction)parser_peek, METH_VARARGS, "get up to given length of buffered data from current parse position"},
- {NULL} /* Sentinel */
-};
-
-
-static PyTypeObject parser_type = {
- PyVarObject_HEAD_INIT(NULL, 0)
- "linkcheck.HtmlParser.htmlsax.parser", /* tp_name */
- sizeof(parser_object), /* tp_size */
- 0, /* tp_itemsize */
- /* methods */
- (destructor)parser_dealloc, /* tp_dealloc */
- 0, /* tp_print */
- 0, /* tp_getattr */
- 0, /* tp_setattr */
- 0, /* tp_compare */
- 0, /* tp_repr */
- 0, /* tp_as_number */
- 0, /* tp_as_sequence */
- 0, /* tp_as_mapping */
- 0, /* tp_hash */
- 0, /* tp_call */
- 0, /* tp_str */
- 0, /* tp_getattro */
- 0, /* tp_setattro */
- 0, /* tp_as_buffer */
- Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
- Py_TPFLAGS_HAVE_GC, /* tp_flags */
- "HTML parser object", /* tp_doc */
- (traverseproc)parser_traverse, /* tp_traverse */
- (inquiry)parser_clear, /* tp_clear */
- 0, /* tp_richcompare */
- 0, /* tp_weaklistoffset */
- 0, /* tp_iter */
- 0, /* tp_iternext */
- parser_methods, /* tp_methods */
- parser_members, /* tp_members */
- parser_getset, /* tp_getset */
- 0, /* tp_base */
- 0, /* tp_dict */
- 0, /* tp_descr_get */
- 0, /* tp_descr_set */
- 0, /* tp_dictoffset */
- (initproc)parser_init, /* tp_init */
- 0, /* tp_alloc */
- parser_new, /* tp_new */
- 0, /* tp_free */
- 0, /* tp_is_gc */
- 0, /* tp_bases */
- 0, /* tp_mro */
- 0, /* tp_cache */
- 0, /* tp_subclasses */
- 0, /* tp_weaklist */
- 0, /* tp_del */
-};
-
-
-static PyMethodDef htmlsax_methods[] = {
- {NULL} /* Sentinel */
-};
-
-
-/* initialization of the htmlsax module */
-MOD_INIT(htmlsax) {
- PyObject* m = NULL;
- MOD_DEF(m, "htmlsax", "SAX HTML parser routines", htmlsax_methods);
- if (m == NULL) {
- return MOD_ERROR_VAL;
- }
- if (PyType_Ready(&parser_type) < 0) {
- return MOD_ERROR_VAL;
- }
- Py_INCREF(&parser_type);
- if (PyModule_AddObject(m, "parser", (PyObject*)&parser_type) == -1) {
- /* init error */
- PyErr_Print();
- }
- PyObject* h = NULL;
- if ((h = PyImport_ImportModule("linkcheck.HtmlParser")) == NULL) {
- return MOD_ERROR_VAL;
- }
- if ((resolve_entities = PyObject_GetAttrString(h, "resolve_entities")) == NULL) {
- Py_DECREF(h);
- return MOD_ERROR_VAL;
- }
- if ((set_encoding = PyObject_GetAttrString(h, "set_encoding")) == NULL) {
- Py_DECREF(resolve_entities);
- Py_DECREF(h);
- return MOD_ERROR_VAL;
- }
- if ((set_doctype = PyObject_GetAttrString(h, "set_doctype")) == NULL) {
- Py_DECREF(resolve_entities);
- Py_DECREF(set_encoding);
- Py_DECREF(h);
- return MOD_ERROR_VAL;
- }
- Py_DECREF(h);
- if ((u_meta = PyUnicode_FromStringAndSize("meta", 4)) == NULL) {
- return MOD_ERROR_VAL;
- }
- if ((h = PyImport_ImportModule("linkcheck.containers")) == NULL) {
- return MOD_ERROR_VAL;
- }
- if ((list_dict = PyObject_GetAttrString(h, "ListDict")) == NULL) {
- Py_DECREF(h);
- return MOD_ERROR_VAL;
- }
- Py_DECREF(h);
- return MOD_SUCCESS_VAL(m);
-}
diff --git a/linkcheck/HtmlParser/htmlsax.h b/linkcheck/HtmlParser/htmlsax.h
deleted file mode 100644
index c5812c5f..00000000
--- a/linkcheck/HtmlParser/htmlsax.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (C) 2000-2014 Bastian Kleineidam
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- */
-/*
- Includes header definitions for the HTML Sax parser Python module.
- */
-#ifndef HTMLSAX_H
-#define HTMLSAX_H
-
-#include "Python.h"
-
-/* require Python >= 2.6 */
-#ifndef PY_VERSION_HEX
-#error please install Python >= 2.6
-#endif
-
-#if PY_VERSION_HEX < 0x02060000
-#error please install Python >= 2.6
-#endif
-
-/* user_data type for SAX calls */
-typedef struct {
- /* the Python SAX object to issue callbacks */
- PyObject* handler;
- /* Buffer to store still-to-be-scanned characters. After recognizing
- * a complete syntax element, all data up to bufpos will be removed.
- * Before scanning you should append new data to this buffer.
- */
- char* buf;
- /* current position in the buffer counting from zero */
- unsigned int bufpos;
- /* current position of next syntax element */
- unsigned int nextpos;
- /* position in the stream of data already seen, counting from zero */
- unsigned int pos;
- /* line counter, counting from one */
- unsigned int lineno;
- /* column counter, counting from zero */
- unsigned int column;
- /* value of line counter before the current token */
- unsigned int last_lineno;
- /* value of column counter before the current token */
- unsigned int last_column;
- /* input buffer of lexer, must be deleted when the parsing stops */
- void* lexbuf;
- /* temporary character buffer */
- char* tmp_buf;
- /* temporary HTML start or end tag name */
- PyObject* tmp_tag;
- /* temporary HTML start tag attribute name */
- PyObject* tmp_attrname;
- /* temporary HTML start tag attribute value */
- PyObject* tmp_attrval;
- /* temporary HTML start tag attribute list (a SortedDict) */
- PyObject* tmp_attrs;
- /* HtmlParser.resolve_entities */
- PyObject* resolve_entities;
- /* HtmlParser.SortedDict */
- PyObject* list_dict;
- /* stored Python exception (if error occurred in scanner) */
- PyObject* exc_type;
- PyObject* exc_val;
- PyObject* exc_tb;
- /* the parser object itself */
- PyObject* parser;
-} UserData;
-
-#endif
diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py
new file mode 100644
index 00000000..a7ad30b5
--- /dev/null
+++ b/linkcheck/HtmlParser/htmlsax.py
@@ -0,0 +1,158 @@
+# Copyright (C) 2000-2018 Petr Dlouhy
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+"""
+HTML parser implemented using Beautiful Soup and html.parser.
+"""
+
+from io import BytesIO, StringIO
+
+from bs4 import (BeautifulSoup, CData, Comment, Doctype, ProcessingInstruction,
+                 Tag)
+
+from ..containers import ListDict
+
+
+class Parser(object):
+    """Buffering HTML parser that replays a Beautiful Soup tree as SAX-style
+    callbacks on a handler object.
+
+    Data passed to feed() is only accumulated; parsing and all handler
+    callbacks happen in flush().
+    """
+
+    handler = None
+    encoding = None
+
+    def __init__(self, handler):
+        """Store the callback handler and start with an empty buffer."""
+        self.handler = handler
+        self.reset()
+
+    def feed(self, feed_text):
+        """Append a chunk of markup to the internal buffer.
+
+        The buffer type (bytes or text) is chosen from the first chunk fed
+        after a reset; nothing is parsed until flush() is called.
+        """
+        if self.html_doc is None:
+            if isinstance(feed_text, bytes):
+                self.html_doc = BytesIO()
+            else:
+                self.html_doc = StringIO()
+        self.html_doc.write(feed_text)
+
+    def reset(self):
+        """Discard any buffered data."""
+        self.html_doc = None
+
+    def _build_attrs(self, tag):
+        """Return the attributes of a Beautiful Soup tag as a ListDict,
+        inserted in sorted key order."""
+        attrs = ListDict()
+        for key, value in sorted(tag.attrs.items()):
+            if isinstance(value, list):
+                # Beautiful Soup splits multi-valued attributes (class,
+                # rel, ...) into a list. Report them as one
+                # space-separated string rather than keeping only the
+                # last item or dropping the attribute for an empty list.
+                value = u' '.join(value)
+            elif value == b'':
+                # empty parameters returned by BS4
+                # are sometimes in bytes:
+                value = u''
+            attrs[key] = value
+        return attrs
+
+    def parse_contents(self, contents):
+        """Walk Beautiful Soup nodes and emit the matching handler callbacks.
+
+        Tags are reported through start_element/end_element (or
+        start_end_element for empty-element tags) and their children are
+        processed recursively. Doctype, comment, CDATA and processing
+        instruction nodes go to the optional doctype/comment/cdata/pi
+        handler methods; every other node is passed to characters().
+        """
+        for content in contents:
+            if isinstance(content, Tag):
+                attrs = self._build_attrs(content)
+                if content.is_empty_element:
+                    self.handler.start_end_element(
+                        content.name, attrs, content.text.strip(),
+                    )
+                else:
+                    self.handler.start_element(
+                        content.name, attrs, content.text.strip(),
+                    )
+                    # Nested comments are reported by this recursion when
+                    # the Comment nodes themselves are visited.
+                    self.parse_contents(content.contents)
+                    if hasattr(self.handler, 'end_element'):
+                        self.handler.end_element(content.name)
+            elif isinstance(content, Doctype):
+                if hasattr(self.handler, 'doctype'):
+                    self.handler.doctype(content[7:])
+            elif isinstance(content, Comment):
+                if hasattr(self.handler, 'comment'):
+                    self.handler.comment(content.strip())
+            elif isinstance(content, CData):
+                if hasattr(self.handler, 'cdata'):
+                    self.handler.cdata(content)
+            elif isinstance(content, ProcessingInstruction):
+                if hasattr(self.handler, 'pi'):
+                    self.handler.pi(content.strip("? "))
+            else:
+                if hasattr(self.handler, 'characters'):
+                    self.handler.characters(content)
+
+    def flush(self):
+        """Parse everything buffered so far and replay it on the handler.
+
+        Does nothing if no data has been fed since the last reset.
+        Afterwards self.encoding is set to soup.original_encoding.
+        """
+        if self.html_doc is None:
+            return
+        soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser')
+        self.parse_contents(soup.contents)
+        self.encoding = soup.original_encoding
+
+    def debug(self, text):
+        """Debug output is not supported by this implementation."""
+        raise NotImplementedError("debug is not implemented")
+
+    def lineno(self):
+        # It seems, that getting line number of element is not
+        # implemented in BeautifulSoup, so this is faked
+        return 0
+
+    def last_lineno(self):
+        return 0
+
+    def column(self):
+        return 0
+
+    def last_column(self):
+        return 0
+
+    def pos(self, text=None):
+        # The argument is unused; it is optional so that pos() can be
+        # called like the other position getters.
+        return 0
+
+
+def parser(handler=None):
+    """Create a Parser that reports to the given handler."""
+    return Parser(handler)
diff --git a/linkcheck/HtmlParser/s_util.c b/linkcheck/HtmlParser/s_util.c
deleted file mode 100644
index 7611d9a7..00000000
--- a/linkcheck/HtmlParser/s_util.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * linux/lib/string.c
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- */
-#include "Python.h"
-
-#if !defined(HAVE_STRLCPY)
-/**
- * strlcpy - Copy a %NUL terminated string into a sized buffer
- * @dst: Where to copy the string to
- * @src: Where to copy the string from
- * @size: size of destination buffer
- *
- * Compatible with *BSD: the result is always a valid
- * NUL-terminated string that fits in the buffer (unless,
- * of course, the buffer size is zero). It does not pad
- * out the result like strncpy() does.
- */
-size_t strlcpy (char *dst, const char *src, size_t size)
-{
- size_t ret = strlen(src);
- if (size > 0) {
- size_t len = (ret >= size) ? size-1 : ret;
- Py_MEMCPY(dst, src, len);
- dst[len] = '\0';
- }
- return ret;
-}
-#endif /* !HAVE_STRLCPY */
-
-#if !defined(HAVE_STRLCAT)
-/**
- * strlcat - Append a length-limited, %NUL-terminated string to another
- * @dst: The string to be appended to
- * @src: The string to append to it
- * @size: The size of the destination buffer.
- */
-size_t strlcat (char *dst, const char *src, size_t size)
-{
- size_t dsize = strlen(dst);
- size_t len = strlen(src);
- size_t res = dsize + len;
- dst += dsize;
- size -= dsize;
- if (len >= size)
- len = size-1;
- Py_MEMCPY(dst, src, len);
- dst[len] = '\0';
- return res;
-}
-#endif /* !HAVE_STRLCAT */
diff --git a/linkcheck/HtmlParser/s_util.h b/linkcheck/HtmlParser/s_util.h
deleted file mode 100644
index a0102806..00000000
--- a/linkcheck/HtmlParser/s_util.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * linux/lib/string.c
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- */
-#if !defined(HAVE_STRLCPY)
-size_t strlcpy(char *dst, const char *src, size_t size);
-#endif /* !HAVE_STRLCPY */
-
-#if !defined(HAVE_STRLCAT)
-size_t strlcat(char *dst, const char *src, size_t size);
-#endif /* !HAVE_STRLCAT */
diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py
index 5662777c..f2c2909d 100644
--- a/linkcheck/htmlutil/linkparse.py
+++ b/linkcheck/htmlutil/linkparse.py
@@ -115,10 +115,10 @@ class TagFinder (object):
"""Does nothing, override in a subclass."""
pass
- def start_end_element (self, tag, attrs):
+ def start_end_element (self, tag, attrs, element_text=None):
"""Delegate a combined start/end element (eg. <br/>) to
the start_element method. Ignore the end element part."""
- self.start_element(tag, attrs)
+ self.start_element(tag, attrs, element_text)
class MetaRobotsFinder (TagFinder):
diff --git a/setup.py b/setup.py
index 97005df2..3afca9a7 100755
--- a/setup.py
+++ b/setup.py
@@ -466,20 +466,6 @@ args = dict(
'linkcheck.parser',
'linkcheck.plugins',
],
- ext_modules = [
- Extension('linkcheck.HtmlParser.htmlsax',
- sources = [
- 'linkcheck/HtmlParser/htmllex.c',
- 'linkcheck/HtmlParser/htmlparse.c',
- 'linkcheck/HtmlParser/s_util.c',
- ],
- extra_compile_args = extra_compile_args,
- library_dirs = library_dirs,
- libraries = libraries,
- define_macros = define_macros + [('YY_NO_INPUT', None)],
- include_dirs = include_dirs + [normpath("linkcheck/HtmlParser")],
- ),
- ],
scripts = scripts,
data_files = data_files,
classifiers = [
diff --git a/windows/build.bat b/windows/build.bat
index 3e9f5ce1..b7ea3f03 100644
--- a/windows/build.bat
+++ b/windows/build.bat
@@ -38,7 +38,5 @@ if defined MSSdk (
%PYDIR%\python.exe setup.py sdist --manifest-only
%PYDIR%\python.exe setup.py build %COMPILER%
-:: copy .pyd files to start linkchecker in local directory
-copy build\lib.%PLATFORM%-%PYVER%\linkcheck\HtmlParser\htmlsax.pyd linkcheck\HtmlParser
:finish
diff --git a/windows/clean.bat b/windows/clean.bat
index 532b3acf..ed991771 100644
--- a/windows/clean.bat
+++ b/windows/clean.bat
@@ -16,6 +16,5 @@
@echo off
set PYDIR=C:\Python27
%PYDIR%\python.exe setup.py clean --all
-del linkcheck\HtmlParser\htmlsax.pyd
del doc\html\lccollection.qhc
del doc\html\lcdoc.qch