mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-28 01:54:42 +00:00
remove parser errors/warnings, and better error detection
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2926 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
dffe4f906e
commit
de2e66e713
6 changed files with 5244 additions and 4999 deletions
File diff suppressed because it is too large
Load diff
|
|
@ -14,10 +14,15 @@
|
|||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*/
|
||||
/* HTML lexical analyzer. Finds recognizable tokens in (probably
|
||||
bad formatted) HTML streams.
|
||||
Unrecognizable character data is passed on as a TEXT token.
|
||||
*/
|
||||
|
||||
/* Lexical analyzer for finding recognizable tokens in (probably
|
||||
* badly formatted) HTML streams.
|
||||
* Unrecognizable character data is passed on as a TEXT token.
|
||||
*
|
||||
* Note that you cannot rely on the "longest match" preference of
|
||||
* flex here since input data might be truncated at any given position.
|
||||
* This explains some of the more complicated lookahead rules below.
|
||||
*/
|
||||
|
||||
%{
|
||||
#include "htmlsax.h"
|
||||
|
|
@ -31,12 +36,15 @@
|
|||
/* type of user-specified data */
|
||||
#define YY_EXTRA_TYPE UserData*
|
||||
|
||||
/* Return T_ERROR if argument is NULL. Returning T_ERROR is the standard
|
||||
* error-out reaction for this lexer.
|
||||
*/
|
||||
/* Returning T_ERROR is the standard error-out reaction for this lexer. */
|
||||
/* Return T_ERROR if argument is NULL. */
|
||||
#define CHECK_NULL(a) \
|
||||
if ((a) == NULL) return T_ERROR
|
||||
|
||||
/* Return T_ERROR if argument is -1 (minus one). */
|
||||
#define CHECK_MINUSONE(a) \
|
||||
if ((a) == -1) return T_ERROR
|
||||
|
||||
/* resize buffer b, returning T_ERROR on error */
|
||||
#define RESIZE_BUF(b, n) \
|
||||
CHECK_NULL((b) = PyMem_Resize((b), char, (n))); \
|
||||
|
|
@ -117,29 +125,22 @@
|
|||
if (strlen(yyextra->tmp_buf) > 0) { \
|
||||
PYSTRING_TMP_UNICODE(yyextra->tmp_attrname); \
|
||||
RESIZE_BUF(yyextra->tmp_buf, 1); \
|
||||
if (PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname, Py_None)==-1) return T_ERROR; \
|
||||
CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname, Py_None)); \
|
||||
Py_CLEAR(yyextra->tmp_attrname); \
|
||||
}
|
||||
|
||||
/* add error message to error list */
|
||||
#define ADD_ERROR(msg) \
|
||||
if (PyList_Append(yyextra->errors, msg) != 0) { \
|
||||
Py_DECREF(msg); \
|
||||
return T_ERROR; \
|
||||
}
|
||||
|
||||
/* update the buffer and scanner positions */
|
||||
#define _UPDATE_BUFPOS yyextra->bufpos += yyleng; yyextra->pos += yyleng
|
||||
#define UPDATE_BUFPOS yyextra->bufpos += yyleng; yyextra->pos += yyleng
|
||||
|
||||
/* update the column position; use this *only* in rules that cannot match
|
||||
the newline char '\n'!
|
||||
*/
|
||||
#define UPDATE_COLUMN _UPDATE_BUFPOS; yyextra->column += yyleng
|
||||
#define UPDATE_COLUMN UPDATE_BUFPOS; yyextra->column += yyleng
|
||||
|
||||
/* update the line and column position; use this in rules that can match the
|
||||
newline char '\n'.
|
||||
*/
|
||||
#define UPDATE_LINE _UPDATE_BUFPOS; { \
|
||||
#define UPDATE_LINE UPDATE_BUFPOS; { \
|
||||
int i; \
|
||||
for (i=0; i<yyleng; ++i) { \
|
||||
if (yytext[i] == '\n') { \
|
||||
|
|
@ -234,17 +235,11 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
}
|
||||
|
||||
<INITIAL><![ ]+-- {
|
||||
PyObject* msg;
|
||||
CHECK_NULL(msg = PyString_FromString("malformed comment begin `<! --'"));
|
||||
ADD_ERROR(msg);
|
||||
UPDATE_COLUMN;
|
||||
BEGIN(S_COMMENT);
|
||||
}
|
||||
|
||||
<INITIAL><!-/[ ] {
|
||||
PyObject* msg;
|
||||
CHECK_NULL(msg = PyString_FromString("malformed comment begin `<!- '"));
|
||||
ADD_ERROR(msg);
|
||||
UPDATE_COLUMN;
|
||||
BEGIN(S_COMMENT);
|
||||
}
|
||||
|
|
@ -257,9 +252,6 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
}
|
||||
|
||||
<S_COMMENT>--[ ]+> {
|
||||
PyObject* msg;
|
||||
CHECK_NULL(msg = PyString_FromString("malformed comment end `-- >'"));
|
||||
ADD_ERROR(msg);
|
||||
UPDATE_COLUMN;
|
||||
SETLVAL_UNICODE;
|
||||
BEGIN(INITIAL);
|
||||
|
|
@ -267,9 +259,6 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
}
|
||||
|
||||
<S_COMMENT>-> {
|
||||
PyObject* msg;
|
||||
CHECK_NULL(msg = PyString_FromString("malformed comment end `->'"));
|
||||
ADD_ERROR(msg);
|
||||
UPDATE_COLUMN;
|
||||
SETLVAL_UNICODE;
|
||||
BEGIN(INITIAL);
|
||||
|
|
@ -306,7 +295,7 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
}
|
||||
|
||||
/* Note: www.nba.com had some <! Copyright !> comment */
|
||||
<INITIAL><![ ]+ {
|
||||
<INITIAL><![ ]+/[^-] {
|
||||
UPDATE_COLUMN;
|
||||
BEGIN(S_COMMENT2);
|
||||
}
|
||||
|
|
@ -738,14 +727,20 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_ATTR2>\\\n {
|
||||
<S_ATTR2>\\\r?\n {
|
||||
/* Line continuations */
|
||||
PyObject* msg;
|
||||
CHECK_NULL(msg = PyString_FromString("invalid line continuation `\\\\n'"));
|
||||
ADD_ERROR(msg);
|
||||
UPDATE_LINE;
|
||||
}
|
||||
|
||||
<S_ATTR2>\\\r?[^\n] {
|
||||
UPDATE_COLUMN;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_ATTR2>\\\r? {
|
||||
return T_WAIT;
|
||||
}
|
||||
|
||||
<S_ATTR2>{RX_WHITE_SPACE}+ {
|
||||
UPDATE_LINE;
|
||||
BEGIN(S_ATTR3);
|
||||
|
|
@ -765,8 +760,7 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
PYSTRING_TMP_UNICODE(yyextra->tmp_attrname);
|
||||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
if (yyextra->tmp_attrval != NULL) return T_ERROR;
|
||||
if (PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
Py_None) == -1) return T_ERROR;
|
||||
CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname, Py_None));
|
||||
Py_CLEAR(yyextra->tmp_attrname);
|
||||
APPEND_TO_TMP(yyleng);
|
||||
BEGIN(S_ATTR2);
|
||||
|
|
@ -805,8 +799,8 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
|
||||
"O", yyextra->tmp_attrval));
|
||||
if (PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval));
|
||||
Py_CLEAR(yyextra->tmp_attrname);
|
||||
Py_CLEAR(yyextra->tmp_attrval);
|
||||
SCRIPT_CHECK;
|
||||
|
|
@ -829,8 +823,8 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
|
||||
"O", yyextra->tmp_attrval));
|
||||
if (PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval));
|
||||
Py_CLEAR(yyextra->tmp_attrname);
|
||||
Py_CLEAR(yyextra->tmp_attrval);
|
||||
SCRIPT_CHECK;
|
||||
|
|
@ -844,8 +838,8 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
|
||||
"O", yyextra->tmp_attrval));
|
||||
if (PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval));
|
||||
Py_CLEAR(yyextra->tmp_attrname);
|
||||
Py_CLEAR(yyextra->tmp_attrval);
|
||||
BEGIN(INITIAL);
|
||||
|
|
@ -854,16 +848,13 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
}
|
||||
|
||||
<S_ATTR5>[\"] {
|
||||
PyObject* msg;
|
||||
CHECK_NULL(msg = PyString_FromString("missing beginning quote in attribute"));
|
||||
ADD_ERROR(msg);
|
||||
UPDATE_COLUMN;
|
||||
PYSTRING_TMP_UNICODE(yyextra->tmp_attrval);
|
||||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
|
||||
"O", yyextra->tmp_attrval));
|
||||
if (PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval));
|
||||
Py_CLEAR(yyextra->tmp_attrname);
|
||||
Py_CLEAR(yyextra->tmp_attrval);
|
||||
BEGIN(S_ATTR1);
|
||||
|
|
@ -875,33 +866,37 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
|
||||
"O", yyextra->tmp_attrval));
|
||||
if (PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval));
|
||||
Py_CLEAR(yyextra->tmp_attrname);
|
||||
Py_CLEAR(yyextra->tmp_attrval);
|
||||
BEGIN(S_ATTR1);
|
||||
}
|
||||
|
||||
<S_APOSSTRING>\\ {
|
||||
<S_APOSSTRING>\\/\r?[^\n] {
|
||||
UPDATE_COLUMN;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
BEGIN(S_APOSSTRING_ESC);
|
||||
}
|
||||
|
||||
<S_APOSSTRING>\\ {
|
||||
return T_WAIT;
|
||||
}
|
||||
|
||||
<S_APOSSTRING>\' {
|
||||
UPDATE_COLUMN;
|
||||
PYSTRING_TMP_UNICODE(yyextra->tmp_attrval);
|
||||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
|
||||
"O", yyextra->tmp_attrval));
|
||||
if (PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval));
|
||||
Py_CLEAR(yyextra->tmp_attrname);
|
||||
Py_CLEAR(yyextra->tmp_attrval);
|
||||
BEGIN(S_ATTR1);
|
||||
}
|
||||
|
||||
<S_APOSSTRING>[^']+ {
|
||||
<S_APOSSTRING>[^\\']+ {
|
||||
UPDATE_LINE;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
|
@ -913,26 +908,34 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
BEGIN(S_APOSSTRING);
|
||||
}
|
||||
|
||||
<S_STRING>\\ {
|
||||
UPDATE_COLUMN;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
BEGIN(S_STRING_ESC);
|
||||
}
|
||||
|
||||
<S_STRING>\" {
|
||||
UPDATE_COLUMN;
|
||||
PYSTRING_TMP_UNICODE(yyextra->tmp_attrval);
|
||||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
CHECK_NULL(yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
|
||||
"O", yyextra->tmp_attrval));
|
||||
if (PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
CHECK_MINUSONE(PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval));
|
||||
Py_CLEAR(yyextra->tmp_attrname);
|
||||
Py_CLEAR(yyextra->tmp_attrval);
|
||||
BEGIN(S_ATTR1);
|
||||
}
|
||||
|
||||
<S_STRING>[^"]+ {
|
||||
<S_STRING>\\/\r?[^\n] {
|
||||
UPDATE_COLUMN;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
BEGIN(S_STRING_ESC);
|
||||
}
|
||||
|
||||
<S_STRING,S_APOSSTRING>\\\r?\n {
|
||||
UPDATE_LINE;
|
||||
}
|
||||
|
||||
<S_STRING>\\ {
|
||||
return T_WAIT;
|
||||
}
|
||||
|
||||
<S_STRING>[^\\"]+ {
|
||||
UPDATE_LINE;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
|
@ -964,9 +967,6 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
}
|
||||
|
||||
<S_TAGEND><{RX_WHITE_SPACE}*\/{RX_WHITE_SPACE}*/[A-Za-z] {
|
||||
PyObject* msg;
|
||||
CHECK_NULL(msg = PyString_FromFormat("missing > in end tag `%s'", yyextra->tmp_buf));
|
||||
ADD_ERROR(msg);
|
||||
UPDATE_LINE;
|
||||
LOWER_TMP;
|
||||
SETLVAL_ASCII;
|
||||
|
|
@ -975,9 +975,6 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
}
|
||||
|
||||
<S_TAGEND><{RX_WHITE_SPACE}*/[A-Za-z] {
|
||||
PyObject* msg;
|
||||
CHECK_NULL(msg = PyString_FromFormat("missing > in end tag `%s'", yyextra->tmp_buf));
|
||||
ADD_ERROR(msg);
|
||||
UPDATE_LINE;
|
||||
LOWER_TMP;
|
||||
SETLVAL_ASCII;
|
||||
|
|
@ -992,6 +989,10 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
BEGIN(S_TAGEND2);
|
||||
}
|
||||
|
||||
<S_TAGEND>. {
|
||||
return T_WAIT;
|
||||
}
|
||||
|
||||
<S_TAGEND2>> {
|
||||
UPDATE_COLUMN;
|
||||
LOWER_TMP;
|
||||
|
|
@ -1005,9 +1006,6 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
}
|
||||
|
||||
<S_TAGEND2><{RX_WHITE_SPACE}*\/{RX_WHITE_SPACE}*/[A-Za-z] {
|
||||
PyObject* msg;
|
||||
CHECK_NULL(msg = PyString_FromFormat("missing > in end tag `%s'", yyextra->tmp_buf));
|
||||
ADD_ERROR(msg);
|
||||
UPDATE_LINE;
|
||||
LOWER_TMP;
|
||||
SETLVAL_ASCII;
|
||||
|
|
@ -1016,9 +1014,6 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
}
|
||||
|
||||
<S_TAGEND2><{RX_WHITE_SPACE}*/[A-Za-z] {
|
||||
PyObject* msg;
|
||||
CHECK_NULL(msg = PyString_FromFormat("missing > in end tag `%s'", yyextra->tmp_buf));
|
||||
ADD_ERROR(msg);
|
||||
UPDATE_LINE;
|
||||
LOWER_TMP;
|
||||
SETLVAL_ASCII;
|
||||
|
|
@ -1076,7 +1071,7 @@ int htmllexDebug (void** scanner, int debug) {
|
|||
int htmllexStart (void* scanner, UserData* data, const char* s, int slen) {
|
||||
/* append s to data buffer and scan those bytes.
|
||||
As Flex does not distinguish between NUL and EOF characters,
|
||||
we must replace NUL with ' '. */
|
||||
replace NUL with ' '. */
|
||||
int len = strlen(data->buf);
|
||||
int i;
|
||||
RESIZE_BUF(data->buf, len + slen + 1);
|
||||
|
|
@ -1117,6 +1112,7 @@ int htmllexStop (void* scanner, UserData* data) {
|
|||
data->buf[j] = data->buf[i];
|
||||
}
|
||||
data->buf[j] = '\0';
|
||||
/* Can return T_ERROR, which is guaranteed to be non-zero. */
|
||||
RESIZE_BUF(data->buf, len-data->nextpos + 1);
|
||||
data->bufpos -= data->nextpos;
|
||||
data->nextpos = 0;
|
||||
|
|
|
|||
|
|
@ -45,16 +45,6 @@ class HtmlPrinter (object):
|
|||
"""
|
||||
print >> self.fd, self.mem, attrs
|
||||
|
||||
def error (self, msg):
|
||||
"""
|
||||
Print filter/parser error.
|
||||
|
||||
@param msg: message to print
|
||||
@type msg: string
|
||||
@return: None
|
||||
"""
|
||||
print >> sys.stderr, "error", msg
|
||||
|
||||
def __getattr__ (self, name):
|
||||
"""
|
||||
Remember the called method name in self.mem.
|
||||
|
|
@ -85,7 +75,6 @@ class HtmlPrettyPrinter (object):
|
|||
"""
|
||||
self.fd = fd
|
||||
self.encoding = encoding
|
||||
self.errors = []
|
||||
|
||||
def comment (self, data):
|
||||
"""
|
||||
|
|
@ -200,16 +189,6 @@ class HtmlPrettyPrinter (object):
|
|||
data = data.encode(self.encoding, "ignore")
|
||||
self.fd.write(data)
|
||||
|
||||
def error (self, msg):
|
||||
"""
|
||||
Store error message.
|
||||
|
||||
@param msg: message to print
|
||||
@type msg: string
|
||||
@return: None
|
||||
"""
|
||||
self.errors.append(msg)
|
||||
|
||||
|
||||
def quote_attrval (s):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -165,22 +165,6 @@ static PyObject* u_meta;
|
|||
goto label; \
|
||||
}
|
||||
|
||||
/* check the parser error list and call error callback */
|
||||
#define CHECK_PARSER_ERROR(ud, label) \
|
||||
if (PyObject_HasAttrString(ud->handler, "error") == 1) { \
|
||||
callback = PyObject_GetAttrString(ud->handler, "error"); \
|
||||
CHECK_ERROR((!callback), label); \
|
||||
for (int i=0; i < PyList_Size(ud->errors); i++) { \
|
||||
PyObject* msg = PyList_GetItem(ud->errors, i); \
|
||||
CHECK_ERROR((!msg), label); \
|
||||
result = PyObject_CallFunction(callback, "O", msg); \
|
||||
CHECK_ERROR((!result), label); \
|
||||
} \
|
||||
} \
|
||||
Py_DECREF(ud->errors); \
|
||||
ud->errors = PyList_New(0); \
|
||||
CHECK_ERROR((!ud->errors), label)
|
||||
|
||||
/* generic Python callback macro */
|
||||
#define CALLBACK(ud, attr, format, arg, label) \
|
||||
if (PyObject_HasAttrString(ud->handler, attr) == 1) { \
|
||||
|
|
@ -284,7 +268,7 @@ typedef int YYSTYPE;
|
|||
|
||||
|
||||
/* Line 213 of yacc.c. */
|
||||
#line 288 "htmlparse.c"
|
||||
#line 272 "htmlparse.c"
|
||||
|
||||
#if ! defined (yyoverflow) || YYERROR_VERBOSE
|
||||
|
||||
|
|
@ -457,8 +441,8 @@ static const yysigned_char yyrhs[] =
|
|||
/* YYRLINE[YYN] -- source line where rule number YYN was defined. */
|
||||
static const unsigned short int yyrline[] =
|
||||
{
|
||||
0, 198, 198, 201, 206, 210, 217, 259, 308, 345,
|
||||
365, 384, 404, 428, 453, 478
|
||||
0, 182, 182, 185, 190, 194, 201, 242, 290, 326,
|
||||
345, 363, 382, 405, 429, 453
|
||||
};
|
||||
#endif
|
||||
|
||||
|
|
@ -1193,21 +1177,21 @@ yyreduce:
|
|||
switch (yyn)
|
||||
{
|
||||
case 2:
|
||||
#line 198 "htmlparse.y"
|
||||
#line 182 "htmlparse.y"
|
||||
{
|
||||
/* parse a single element */
|
||||
;}
|
||||
break;
|
||||
|
||||
case 3:
|
||||
#line 201 "htmlparse.y"
|
||||
#line 185 "htmlparse.y"
|
||||
{
|
||||
/* parse a list of elements */
|
||||
;}
|
||||
break;
|
||||
|
||||
case 4:
|
||||
#line 206 "htmlparse.y"
|
||||
#line 190 "htmlparse.y"
|
||||
{
|
||||
/* wait for more lexer input */
|
||||
YYACCEPT;
|
||||
|
|
@ -1215,7 +1199,7 @@ yyreduce:
|
|||
break;
|
||||
|
||||
case 5:
|
||||
#line 211 "htmlparse.y"
|
||||
#line 195 "htmlparse.y"
|
||||
{
|
||||
/* an error occurred in the scanner, the python exception must be set */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
|
|
@ -1225,7 +1209,7 @@ yyreduce:
|
|||
break;
|
||||
|
||||
case 6:
|
||||
#line 218 "htmlparse.y"
|
||||
#line 202 "htmlparse.y"
|
||||
{
|
||||
/* parsed HTML start tag (eg. <a href="blubb">)
|
||||
$1 is a PyTuple (<tag>, <attrs>)
|
||||
|
|
@ -1254,7 +1238,6 @@ yyreduce:
|
|||
Py_CLEAR(callback);
|
||||
Py_CLEAR(result);
|
||||
}
|
||||
CHECK_PARSER_ERROR(ud, finish_start);
|
||||
finish_start:
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(result);
|
||||
|
|
@ -1270,7 +1253,7 @@ finish_start:
|
|||
break;
|
||||
|
||||
case 7:
|
||||
#line 260 "htmlparse.y"
|
||||
#line 243 "htmlparse.y"
|
||||
{
|
||||
/* parsed HTML start-end tag (eg. <br/>)
|
||||
$1 is a PyTuple (<tag>, <attrs>)
|
||||
|
|
@ -1306,7 +1289,6 @@ finish_start:
|
|||
Py_CLEAR(callback);
|
||||
Py_CLEAR(result);
|
||||
}
|
||||
CHECK_PARSER_ERROR(ud, finish_start_end);
|
||||
finish_start_end:
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(result);
|
||||
|
|
@ -1322,7 +1304,7 @@ finish_start_end:
|
|||
break;
|
||||
|
||||
case 8:
|
||||
#line 309 "htmlparse.y"
|
||||
#line 291 "htmlparse.y"
|
||||
{
|
||||
/* parsed HTML end tag (eg. </b>)
|
||||
$1 is a PyUnicode with the tag name */
|
||||
|
|
@ -1347,7 +1329,6 @@ finish_start_end:
|
|||
Py_CLEAR(callback);
|
||||
Py_CLEAR(result);
|
||||
}
|
||||
CHECK_PARSER_ERROR(ud, finish_end);
|
||||
finish_end:
|
||||
Py_XDECREF(tagname);
|
||||
Py_XDECREF(callback);
|
||||
|
|
@ -1362,7 +1343,7 @@ finish_end:
|
|||
break;
|
||||
|
||||
case 9:
|
||||
#line 346 "htmlparse.y"
|
||||
#line 327 "htmlparse.y"
|
||||
{
|
||||
/* parsed HTML comment (eg. <!-- bla -->)
|
||||
$1 is a PyUnicode with the comment content */
|
||||
|
|
@ -1371,7 +1352,6 @@ finish_end:
|
|||
PyObject* result = NULL;
|
||||
int error = 0;
|
||||
CALLBACK(ud, "comment", "O", (yyvsp[0]), finish_comment);
|
||||
CHECK_PARSER_ERROR(ud, finish_comment);
|
||||
finish_comment:
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(result);
|
||||
|
|
@ -1385,7 +1365,7 @@ finish_comment:
|
|||
break;
|
||||
|
||||
case 10:
|
||||
#line 366 "htmlparse.y"
|
||||
#line 346 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyUnicode */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
|
|
@ -1393,7 +1373,6 @@ finish_comment:
|
|||
PyObject* result = NULL;
|
||||
int error = 0;
|
||||
CALLBACK(ud, "pi", "O", (yyvsp[0]), finish_pi);
|
||||
CHECK_PARSER_ERROR(ud, finish_pi);
|
||||
finish_pi:
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(result);
|
||||
|
|
@ -1407,7 +1386,7 @@ finish_pi:
|
|||
break;
|
||||
|
||||
case 11:
|
||||
#line 385 "htmlparse.y"
|
||||
#line 364 "htmlparse.y"
|
||||
{
|
||||
/* parsed HTML CDATA (eg. <![CDATA[spam and eggs ...]]>)
|
||||
$1 is a PyUnicode with the CDATA content */
|
||||
|
|
@ -1416,7 +1395,6 @@ finish_pi:
|
|||
PyObject* result = NULL;
|
||||
int error = 0;
|
||||
CALLBACK(ud, "cdata", "O", (yyvsp[0]), finish_cdata);
|
||||
CHECK_PARSER_ERROR(ud, finish_cdata);
|
||||
finish_cdata:
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(result);
|
||||
|
|
@ -1430,7 +1408,7 @@ finish_cdata:
|
|||
break;
|
||||
|
||||
case 12:
|
||||
#line 405 "htmlparse.y"
|
||||
#line 383 "htmlparse.y"
|
||||
{
|
||||
/* parsed HTML doctype (eg. <!DOCTYPE imadoofus system>)
|
||||
$1 is a PyUnicode with the doctype content */
|
||||
|
|
@ -1443,7 +1421,6 @@ finish_cdata:
|
|||
CHECK_ERROR((result == NULL), finish_doctype);
|
||||
Py_CLEAR(result);
|
||||
CALLBACK(ud, "doctype", "O", (yyvsp[0]), finish_doctype);
|
||||
CHECK_PARSER_ERROR(ud, finish_doctype);
|
||||
finish_doctype:
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(result);
|
||||
|
|
@ -1457,7 +1434,7 @@ finish_doctype:
|
|||
break;
|
||||
|
||||
case 13:
|
||||
#line 429 "htmlparse.y"
|
||||
#line 406 "htmlparse.y"
|
||||
{
|
||||
/* parsed HTML script content (plus end tag which is omitted)
|
||||
$1 is a PyUnicode with the script content */
|
||||
|
|
@ -1470,7 +1447,6 @@ finish_doctype:
|
|||
CALLBACK(ud, "characters", "O", (yyvsp[0]), finish_script);
|
||||
/* emit the omitted end tag */
|
||||
CALLBACK(ud, "end_element", "O", script, finish_script);
|
||||
CHECK_PARSER_ERROR(ud, finish_script);
|
||||
finish_script:
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(script);
|
||||
|
|
@ -1485,7 +1461,7 @@ finish_script:
|
|||
break;
|
||||
|
||||
case 14:
|
||||
#line 454 "htmlparse.y"
|
||||
#line 430 "htmlparse.y"
|
||||
{
|
||||
/* parsed HTML style content (plus end tag which is omitted)
|
||||
$1 is a PyUnicode with the style content */
|
||||
|
|
@ -1498,7 +1474,6 @@ finish_script:
|
|||
CALLBACK(ud, "characters", "O", (yyvsp[0]), finish_style);
|
||||
/* emit the omitted end tag */
|
||||
CALLBACK(ud, "end_element", "O", style, finish_style);
|
||||
CHECK_PARSER_ERROR(ud, finish_style);
|
||||
finish_style:
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(style);
|
||||
|
|
@ -1513,7 +1488,7 @@ finish_style:
|
|||
break;
|
||||
|
||||
case 15:
|
||||
#line 479 "htmlparse.y"
|
||||
#line 454 "htmlparse.y"
|
||||
{
|
||||
/* parsed HTML text data
|
||||
$1 is a PyUnicode with the text */
|
||||
|
|
@ -1524,7 +1499,6 @@ finish_style:
|
|||
PyObject* result = NULL;
|
||||
int error = 0;
|
||||
CALLBACK(ud, "characters", "O", (yyvsp[0]), finish_characters);
|
||||
CHECK_PARSER_ERROR(ud, finish_characters);
|
||||
finish_characters:
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(result);
|
||||
|
|
@ -1541,7 +1515,7 @@ finish_characters:
|
|||
}
|
||||
|
||||
/* Line 1037 of yacc.c. */
|
||||
#line 1545 "htmlparse.c"
|
||||
#line 1519 "htmlparse.c"
|
||||
|
||||
yyvsp -= yylen;
|
||||
yyssp -= yylen;
|
||||
|
|
@ -1769,7 +1743,7 @@ yyreturn:
|
|||
}
|
||||
|
||||
|
||||
#line 502 "htmlparse.y"
|
||||
#line 476 "htmlparse.y"
|
||||
|
||||
|
||||
/* create parser object */
|
||||
|
|
@ -1807,29 +1781,20 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
|
|||
self->userData->exc_type = NULL;
|
||||
self->userData->exc_val = NULL;
|
||||
self->userData->exc_tb = NULL;
|
||||
self->userData->errors = PyList_New(0);
|
||||
if (self->userData->errors == NULL) {
|
||||
Py_DECREF(self->handler);
|
||||
Py_DECREF(self);
|
||||
return NULL;
|
||||
}
|
||||
self->scanner = NULL;
|
||||
if (htmllexInit(&(self->scanner), self->userData)!=0) {
|
||||
Py_DECREF(self->userData->errors);
|
||||
Py_DECREF(self->handler);
|
||||
Py_DECREF(self);
|
||||
return NULL;
|
||||
}
|
||||
self->encoding = PyString_FromString("iso8859-1");
|
||||
if (self->encoding == NULL) {
|
||||
Py_DECREF(self->userData->errors);
|
||||
Py_DECREF(self->handler);
|
||||
Py_DECREF(self);
|
||||
return NULL;
|
||||
}
|
||||
self->doctype = PyString_FromString("HTML");
|
||||
if (self->doctype == NULL) {
|
||||
Py_DECREF(self->userData->errors);
|
||||
Py_DECREF(self->encoding);
|
||||
Py_DECREF(self->handler);
|
||||
Py_DECREF(self);
|
||||
|
|
|
|||
|
|
@ -78,22 +78,6 @@ static PyObject* u_meta;
|
|||
goto label; \
|
||||
}
|
||||
|
||||
/* check the parser error list and call error callback */
|
||||
#define CHECK_PARSER_ERROR(ud, label) \
|
||||
if (PyObject_HasAttrString(ud->handler, "error") == 1) { \
|
||||
callback = PyObject_GetAttrString(ud->handler, "error"); \
|
||||
CHECK_ERROR((!callback), label); \
|
||||
for (int i=0; i < PyList_Size(ud->errors); i++) { \
|
||||
PyObject* msg = PyList_GetItem(ud->errors, i); \
|
||||
CHECK_ERROR((!msg), label); \
|
||||
result = PyObject_CallFunction(callback, "O", msg); \
|
||||
CHECK_ERROR((!result), label); \
|
||||
} \
|
||||
} \
|
||||
Py_DECREF(ud->errors); \
|
||||
ud->errors = PyList_New(0); \
|
||||
CHECK_ERROR((!ud->errors), label)
|
||||
|
||||
/* generic Python callback macro */
|
||||
#define CALLBACK(ud, attr, format, arg, label) \
|
||||
if (PyObject_HasAttrString(ud->handler, attr) == 1) { \
|
||||
|
|
@ -243,7 +227,6 @@ element: T_WAIT {
|
|||
Py_CLEAR(callback);
|
||||
Py_CLEAR(result);
|
||||
}
|
||||
CHECK_PARSER_ERROR(ud, finish_start);
|
||||
finish_start:
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(result);
|
||||
|
|
@ -292,7 +275,6 @@ finish_start:
|
|||
Py_CLEAR(callback);
|
||||
Py_CLEAR(result);
|
||||
}
|
||||
CHECK_PARSER_ERROR(ud, finish_start_end);
|
||||
finish_start_end:
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(result);
|
||||
|
|
@ -330,7 +312,6 @@ finish_start_end:
|
|||
Py_CLEAR(callback);
|
||||
Py_CLEAR(result);
|
||||
}
|
||||
CHECK_PARSER_ERROR(ud, finish_end);
|
||||
finish_end:
|
||||
Py_XDECREF(tagname);
|
||||
Py_XDECREF(callback);
|
||||
|
|
@ -351,7 +332,6 @@ finish_end:
|
|||
PyObject* result = NULL;
|
||||
int error = 0;
|
||||
CALLBACK(ud, "comment", "O", $1, finish_comment);
|
||||
CHECK_PARSER_ERROR(ud, finish_comment);
|
||||
finish_comment:
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(result);
|
||||
|
|
@ -370,7 +350,6 @@ finish_comment:
|
|||
PyObject* result = NULL;
|
||||
int error = 0;
|
||||
CALLBACK(ud, "pi", "O", $1, finish_pi);
|
||||
CHECK_PARSER_ERROR(ud, finish_pi);
|
||||
finish_pi:
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(result);
|
||||
|
|
@ -390,7 +369,6 @@ finish_pi:
|
|||
PyObject* result = NULL;
|
||||
int error = 0;
|
||||
CALLBACK(ud, "cdata", "O", $1, finish_cdata);
|
||||
CHECK_PARSER_ERROR(ud, finish_cdata);
|
||||
finish_cdata:
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(result);
|
||||
|
|
@ -414,7 +392,6 @@ finish_cdata:
|
|||
CHECK_ERROR((result == NULL), finish_doctype);
|
||||
Py_CLEAR(result);
|
||||
CALLBACK(ud, "doctype", "O", $1, finish_doctype);
|
||||
CHECK_PARSER_ERROR(ud, finish_doctype);
|
||||
finish_doctype:
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(result);
|
||||
|
|
@ -438,7 +415,6 @@ finish_doctype:
|
|||
CALLBACK(ud, "characters", "O", $1, finish_script);
|
||||
/* emit the omitted end tag */
|
||||
CALLBACK(ud, "end_element", "O", script, finish_script);
|
||||
CHECK_PARSER_ERROR(ud, finish_script);
|
||||
finish_script:
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(script);
|
||||
|
|
@ -463,7 +439,6 @@ finish_script:
|
|||
CALLBACK(ud, "characters", "O", $1, finish_style);
|
||||
/* emit the omitted end tag */
|
||||
CALLBACK(ud, "end_element", "O", style, finish_style);
|
||||
CHECK_PARSER_ERROR(ud, finish_style);
|
||||
finish_style:
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(style);
|
||||
|
|
@ -486,7 +461,6 @@ finish_style:
|
|||
PyObject* result = NULL;
|
||||
int error = 0;
|
||||
CALLBACK(ud, "characters", "O", $1, finish_characters);
|
||||
CHECK_PARSER_ERROR(ud, finish_characters);
|
||||
finish_characters:
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(result);
|
||||
|
|
@ -536,29 +510,20 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
|
|||
self->userData->exc_type = NULL;
|
||||
self->userData->exc_val = NULL;
|
||||
self->userData->exc_tb = NULL;
|
||||
self->userData->errors = PyList_New(0);
|
||||
if (self->userData->errors == NULL) {
|
||||
Py_DECREF(self->handler);
|
||||
Py_DECREF(self);
|
||||
return NULL;
|
||||
}
|
||||
self->scanner = NULL;
|
||||
if (htmllexInit(&(self->scanner), self->userData)!=0) {
|
||||
Py_DECREF(self->userData->errors);
|
||||
Py_DECREF(self->handler);
|
||||
Py_DECREF(self);
|
||||
return NULL;
|
||||
}
|
||||
self->encoding = PyString_FromString("iso8859-1");
|
||||
if (self->encoding == NULL) {
|
||||
Py_DECREF(self->userData->errors);
|
||||
Py_DECREF(self->handler);
|
||||
Py_DECREF(self);
|
||||
return NULL;
|
||||
}
|
||||
self->doctype = PyString_FromString("HTML");
|
||||
if (self->doctype == NULL) {
|
||||
Py_DECREF(self->userData->errors);
|
||||
Py_DECREF(self->encoding);
|
||||
Py_DECREF(self->handler);
|
||||
Py_DECREF(self);
|
||||
|
|
|
|||
|
|
@ -74,8 +74,6 @@ typedef struct {
|
|||
PyObject* exc_type;
|
||||
PyObject* exc_val;
|
||||
PyObject* exc_tb;
|
||||
/* list of errors */
|
||||
PyObject* errors;
|
||||
/* the parser object itself */
|
||||
PyObject* parser;
|
||||
} UserData;
|
||||
|
|
|
|||
Loading…
Reference in a new issue