/*
 * Find recognizable tokens in (probably badly formatted) HTML streams.
 * Unrecognizable character data is passed on as a TEXT token.
 */

%{
#include "htmlsax.h"
/* NOTE(review): the two header names after htmlsax.h were lost from this
   file; string.h and stdlib.h are the presumed originals. ctype.h is listed
   explicitly because LOWER_TMP calls tolower(). */
#include <string.h>
#include <stdlib.h>
#include <ctype.h>

#define YYSTYPE PyObject*
#define YY_EXTRA_TYPE UserData*

/* resize buffer b to n bytes and NUL-terminate it; on allocation failure
   the enclosing function returns T_ERROR and b still points to the old,
   valid buffer (it is not overwritten with NULL, so it can be freed later) */
#define RESIZE_BUF(b, n) do { \
    char* resized_buf_ = (char*)PyMem_Realloc((b), (size_t)(n)); \
    if (resized_buf_==NULL) return T_ERROR; \
    (b) = resized_buf_; \
    (b)[(n)-1] = '\0'; \
} while (0)

/* make python string from tmp_buf and assign it to a;
   returns T_ERROR from the enclosing function on failure */
#define PYSTRING_TMP(a) \
    a = PyString_FromString(yyextra->tmp_buf); \
    if (a==NULL) return T_ERROR

/* set return value from tmp_buf and reset tmp_buf to the empty string */
#define SETLVAL {\
    PyObject* s; \
    PYSTRING_TMP(s); \
    RESIZE_BUF(yyextra->tmp_buf, 1); \
    *yylval = s; \
    }

/* append the first n characters of yytext to tmp_buf */
#define APPEND_TO_TMP(n) {\
    size_t len = strlen(yyextra->tmp_buf); \
    RESIZE_BUF(yyextra->tmp_buf, len+(n)+1); \
    strncat(yyextra->tmp_buf, yytext, (size_t)(n)); \
    }

/* lowercase the tmp_buf; the cast keeps 8-bit characters from being passed
   to tolower() as negative values, which is undefined */
#define LOWER_TMP {\
    char* p = yyextra->tmp_buf; \
    while (*p) { *p = (char)tolower((unsigned char)*p); p++; } \
    }

/* check for JavaScript or CSS tags; must be before SET_ATTR_LVAL
   (SET_ATTR_LVAL resets tmp_tag, which is read here) */
#define SCRIPT_CHECK \
    if (strcmp("script", PyString_AS_STRING(yyextra->tmp_tag))==0) \
        BEGIN(S_SCRIPT); \
    else if (strcmp("style", PyString_AS_STRING(yyextra->tmp_tag))==0) \
        BEGIN(S_STYLE); \
    else \
        BEGIN(INITIAL)

/* set return value from tag with attributes: *yylval becomes the tuple
   (tmp_tag, tmp_attrs) and both temporaries are reset to NULL */
#define SET_ATTR_LVAL \
    if (yyextra->tmp_tag==NULL || yyextra->tmp_attrs==NULL) { \
        PyErr_SetString(PyExc_TypeError, "tmp_tag or tmp_attrs is NULL"); \
        return T_ERROR; \
    } \
    *yylval = Py_BuildValue("(OO)", yyextra->tmp_tag, yyextra->tmp_attrs); \
    if ((*yylval)==NULL) return T_ERROR; \
    yyextra->tmp_tag = yyextra->tmp_attrs = NULL

/* store collected name as attribute (with value None) in dictionary
 * tmp_attrname and tmp_attrval must be NULL */
#define FLUSH_ATTRS \
    if (strlen(yyextra->tmp_buf) > 0) { \
        PYSTRING_TMP(yyextra->tmp_attrname); \
        RESIZE_BUF(yyextra->tmp_buf, 1); \
        if (PyDict_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname, Py_None)==-1) return T_ERROR; \
        Py_DECREF(yyextra->tmp_attrname); \
        yyextra->tmp_attrname = NULL; \
    }

/* update the buffer and scanner positions */
#define UPDATE_BUFPOS yyextra->bufpos += yyleng; yyextra->pos += yyleng

/* update the column position; use this *only* in rules that cannot match
   the newline char '\n'! */
#define UPDATE_COLUMN yyextra->column += yyleng

/* update the line and column position; use this in rules that can match
   the newline char '\n'.
   NOTE(review): the loop header and the newline test were lost from this
   file and have been reconstructed from the surviving statements (lineno
   increment, column reset to 1, column increment otherwise). */
#define UPDATE_LINE { \
    int i; \
    for (i=0; i<yyleng; ++i) { \
        if (yytext[i] == '\n') { \
            ++(yyextra->lineno); \
            yyextra->column = 1; \
        } \
        else ++(yyextra->column); \
    } \
}

/* return a token, setting the nextpos value back to the bufpos */
#define RETURN(tok) yyextra->nextpos = yyextra->bufpos; return tok

/* use Pythons memory management */
void* yyalloc (yy_size_t bytes, void* yyscanner) {
    return PyMem_Malloc((size_t)bytes);
}
void* yyrealloc (void* ptr, yy_size_t bytes, void* yyscanner) {
    return PyMem_Realloc(ptr, (size_t)bytes);
}
void yyfree (void* ptr, void* yyscanner) {
    PyMem_Free(ptr);
}

#include "htmlparse.h"

/* Find out if the given HTML attribute val must be quoted.
   The string will be surrounded by double quotes if it contains white space
   or <> or ends with /. All double quotes inside the string will be
   replaced with &quot;.
val must be a Python String object.
   The reference to val is consumed in every case: the return value is a
   reference owned by the caller (val itself when nothing had to change),
   or NULL with val already released when an error occurred. */
static PyObject* quote_string (PyObject* val) {
    int quote = 0;
    int replace = 0;
    int len = (int)PyString_GET_SIZE(val);
    char* internal = PyString_AS_STRING(val);
    int i;
    PyObject* prefix;
    /* NOTE(review): the loop header and the first part of this condition
       were lost from this file; they are reconstructed from the function
       comment (white space or <>) and the surviving apostrophe test. */
    for (i=0; i<len; i++) {
        if (isspace((unsigned char)internal[i]) || internal[i]=='<' ||
            internal[i]=='>' || internal[i]=='\'') {
            quote = 1;
        }
        else if (internal[i]=='"') {
            replace = 1;
        }
    }
    if (len==0) {
        /* its an empty string */
        quote = 1;
    }
    else if (internal[len-1]=='/') {
        quote = 1;
    }
    if (replace) {
        PyObject* nval = PyObject_CallMethod(val, "replace", "ss", "\"", "&quot;");
        Py_DECREF(val);
        if (nval==NULL) return NULL;
        val = nval;
    }
    if (quote==0) {
        return val;
    }
    /* quote suffix */
    if ((prefix = PyString_FromString("\""))==NULL) {
        /* val is owned by this function, do not leak it */
        Py_DECREF(val);
        return NULL;
    }
    /* on failure PyString_Concat releases the old val and sets it to NULL */
    PyString_Concat(&val, prefix);
    if (val==NULL) {
        Py_DECREF(prefix);
        return NULL;
    }
    /* quote prefix; PyString_ConcatAndDel always releases val, and sets
       prefix to NULL on failure, so there is nothing left to clean up */
    PyString_ConcatAndDel(&prefix, val);
    return prefix;
}
%}

%option noyyalloc noyyrealloc noyyfree
%option 8bit outfile="htmllex.c"
%option align full
/* uncomment the next line for debugging */
%option debug
%option nounput nomain noyywrap noyymore noreject
%option bison-bridge reentrant never-interactive
%option warn

%x S_PI
%x S_COMMENT
%x S_COMMENT2
%x S_DOCTYPE
%x S_CDATA
%x S_TAGSTART
%x S_TAGEND
%x S_TAGEND2
%x S_SCRIPT
%x S_SCRIPT_APOS
%x S_SCRIPT_APOS_ESC
%x S_SCRIPT_STRING
%x S_SCRIPT_STRING_ESC
%x S_SCRIPT_COMMENT
%x S_SCRIPT_MCOMMENT
%x S_STYLE
%x S_ATTR1
%x S_ATTR2
%x S_ATTR3
%x S_ATTR4
%x S_ATTR5
%x S_APOSSTRING
%x S_APOSSTRING_ESC
%x S_STRING
%x S_STRING_ESC

RX_WHITE_SPACE  [\n\r\ \t\b\012]
RX_EQUAL =
RX_NAME  [a-zA-Z]([-a-zA-Z0-9_])*
RX_DATA  [-a-zA-Z0-9_:]+

%%

  /*********************** EOF ************************/
<<EOF>> {
    /* wait for more data */
    return T_WAIT;
}

  /* NOTE(review): from here on the rules have lost their <START_CONDITION>
     prefixes, and the rule that enters S_COMMENT together with the end of
     the note below is missing, which leaves that comment unterminated.
     The remaining text is kept exactly as found; restore it from the
     upstream file before building. */
  /*********************** COMMENT ************************/
  /* Note: eonline had some "-*--> { UPDATE_BUFPOS; UPDATE_COLUMN; APPEND_TO_TMP(yyleng-3); SETLVAL; BEGIN(INITIAL); RETURN(T_COMMENT); }
[^-]+ { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); }
-+[^->]+ { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); }
. 
{ return T_WAIT; } /* Note: www.nba.com ad some comment */
/* NOTE(review): the rules on this line carry no <START_CONDITION> prefix, and no rule that enters S_COMMENT2, S_DOCTYPE or S_CDATA is visible although those states are declared with %x; this looks like extraction damage - confirm against the upstream file. The first rule below ends a comment: SETLVAL hands the collected tmp_buf to the parser, the scanner goes back to INITIAL and T_COMMENT is returned. The two rules after it only append to tmp_buf, and a lone . returns T_WAIT without advancing bufpos. */
!> { UPDATE_BUFPOS; SETLVAL; BEGIN(INITIAL); RETURN(T_COMMENT); } [^!]+ { UPDATE_BUFPOS; APPEND_TO_TMP(yyleng); } ![^>]+ { UPDATE_BUFPOS; APPEND_TO_TMP(yyleng); } . { return T_WAIT; } /*********************** DOCTYPE ************************/
/* A closing > returns the text accumulated in tmp_buf as T_DOCTYPE and switches back to INITIAL; any other run of characters is appended to tmp_buf. */
> { UPDATE_BUFPOS; UPDATE_COLUMN; SETLVAL; BEGIN(INITIAL); RETURN(T_DOCTYPE); } [^>]+ { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } /*********************** CDATA ************************/
/* The terminator pattern matches optional extra ] characters followed by ]]>; APPEND_TO_TMP(yyleng-3) keeps everything except the final three characters before T_CDATA is returned. */
\]*\]\]> { UPDATE_BUFPOS; UPDATE_COLUMN; APPEND_TO_TMP(yyleng-3); SETLVAL; BEGIN(INITIAL); RETURN(T_CDATA); } [^\]]+ { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } \]+[^>\]]+ { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } . { return T_WAIT; } /*********************** PI ************************/
/* The two-character opener <? enters S_PI; the terminator rule drops its trailing two characters (APPEND_TO_TMP(yyleng-2)) and returns the rest as T_PI. */
<\? { UPDATE_BUFPOS; UPDATE_COLUMN; BEGIN(S_PI); } \?*\?> { UPDATE_BUFPOS; UPDATE_COLUMN; APPEND_TO_TMP(yyleng-2); SETLVAL; BEGIN(INITIAL); RETURN(T_PI); } [^?]+ { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } \?+[^?>]+ { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } . 
{ return T_WAIT; } /*********************** TAGSTART ************************/
/* A < (optionally followed by white space) with a letter as trailing context allocates a fresh attribute dict in tmp_attrs and enters S_TAGSTART. Tag name characters are accumulated in tmp_buf; white space lowercases the name, stores it in tmp_tag and moves on to S_ATTR1. When the name is still empty at the closing characters, literal text is returned as T_TEXT instead of an element token. NOTE(review): on those T_TEXT paths tmp_attrs is set to NULL without a visible Py_DECREF of the dict created by the first rule - verify ownership. NOTE(review): in the first of those paths the buffer is resized to 4 bytes but the literal copied into it is empty; the literal was probably damaged in extraction. NOTE(review): start-condition prefixes are not visible on these rules - confirm against upstream. */
<{RX_WHITE_SPACE}*/[A-Za-z] { UPDATE_BUFPOS; UPDATE_LINE; yyextra->tmp_attrs = PyDict_New(); if (yyextra->tmp_attrs==NULL) return T_ERROR; BEGIN(S_TAGSTART); } [^ \t\r\n\b\012/<>]+ { /* actually accept a lot of tag chars, which may be illegal, but we dont care, its the browsers job */ UPDATE_BUFPOS; UPDATE_COLUMN; APPEND_TO_TMP(yyleng); } {RX_WHITE_SPACE}+ { UPDATE_BUFPOS; UPDATE_LINE; LOWER_TMP; PYSTRING_TMP(yyextra->tmp_tag); RESIZE_BUF(yyextra->tmp_buf, 1); BEGIN(S_ATTR1); } \/> { UPDATE_BUFPOS; UPDATE_COLUMN; BEGIN(INITIAL); if (!strlen(yyextra->tmp_buf)) { RESIZE_BUF(yyextra->tmp_buf, 4); strcpy(yyextra->tmp_buf, ""); yyextra->tmp_attrs = NULL; SETLVAL; RETURN(T_TEXT); } LOWER_TMP; PYSTRING_TMP(yyextra->tmp_tag); RESIZE_BUF(yyextra->tmp_buf, 1); SET_ATTR_LVAL; RETURN(T_ELEMENT_START_END); } > { UPDATE_BUFPOS; UPDATE_COLUMN; if (!strlen(yyextra->tmp_buf)) { RESIZE_BUF(yyextra->tmp_buf, 3); strcpy(yyextra->tmp_buf, "<>"); yyextra->tmp_attrs = NULL; SETLVAL; RETURN(T_TEXT); } LOWER_TMP; PYSTRING_TMP(yyextra->tmp_tag); RESIZE_BUF(yyextra->tmp_buf, 1); SCRIPT_CHECK; SET_ATTR_LVAL; RETURN(T_ELEMENT_START); } . 
{ return T_WAIT; } /*********************** SCRIPT ************************/
/* Script content is accumulated in tmp_buf until a closing script tag is matched (letters in either case, white space allowed after the slash and before the closing bracket); SETLVAL then hands the text to the parser as T_SCRIPT and the scanner returns to INITIAL. Quote characters and comment openers are appended as well and switch to the S_SCRIPT_APOS, S_SCRIPT_STRING, S_SCRIPT_COMMENT and S_SCRIPT_MCOMMENT states, presumably so that a closing tag inside them does not end the script. The long run of trailing-context rules appends a partial closing tag that is not followed by the next expected letter. NOTE(review): start-condition prefixes are missing from all rules on this line, so the state each rule belongs to has to be inferred from its BEGIN() target - confirm against upstream. */
<\/{RX_WHITE_SPACE}*[Ss][Cc][Rr][Ii][Pp][Tt]{RX_WHITE_SPACE}*> { UPDATE_BUFPOS; UPDATE_LINE; SETLVAL; BEGIN(INITIAL); RETURN(T_SCRIPT); } [^/'"<]+ { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } \' { UPDATE_BUFPOS; APPEND_TO_TMP(yyleng); BEGIN(S_SCRIPT_APOS); } \" { UPDATE_BUFPOS; APPEND_TO_TMP(yyleng); BEGIN(S_SCRIPT_STRING); } \/\/ { UPDATE_BUFPOS; APPEND_TO_TMP(yyleng); BEGIN(S_SCRIPT_COMMENT); } \/\* { UPDATE_BUFPOS; APPEND_TO_TMP(yyleng); BEGIN(S_SCRIPT_MCOMMENT); } \/[^/*] { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } /* this is so shitty */ <\/{RX_WHITE_SPACE}*/[^Ss] { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } <\/{RX_WHITE_SPACE}*[Ss]/[^Cc] { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } <\/{RX_WHITE_SPACE}*[Ss][Cc]/[^Rr] { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } <\/{RX_WHITE_SPACE}*[Ss][Cc][Rr]/[^Ii] { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } <\/{RX_WHITE_SPACE}*[Ss][Cc][Rr][Ii]/[^Pp] { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } <\/{RX_WHITE_SPACE}*[Ss][Cc][Rr][Ii][Pp]/[^Tt] { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } <\/{RX_WHITE_SPACE}*[Ss][Cc][Rr][Ii][Pp][Tt]{RX_WHITE_SPACE}*/[^>] { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } . { return T_WAIT; } \\ { UPDATE_BUFPOS; APPEND_TO_TMP(yyleng); BEGIN(S_SCRIPT_APOS_ESC); } [^\\']+ { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } \' { UPDATE_BUFPOS; APPEND_TO_TMP(yyleng); BEGIN(S_SCRIPT); } . { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); BEGIN(S_SCRIPT_APOS); } \\ { UPDATE_BUFPOS; APPEND_TO_TMP(yyleng); BEGIN(S_SCRIPT_STRING_ESC); } [^\\"]+ { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } \" { UPDATE_BUFPOS; APPEND_TO_TMP(yyleng); BEGIN(S_SCRIPT); } . 
{ UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); BEGIN(S_SCRIPT_STRING); }
/* NOTE(review): the next group looks like the single-line script comment state (a newline or the three-character sequence dash dash greater-than returns to S_SCRIPT), followed by the multi-line script comment state (a closing star-slash returns to S_SCRIPT). Every rule appends its match to tmp_buf; the start-condition prefixes are not visible, so confirm the grouping against upstream. */
[^\-\n]+ { UPDATE_BUFPOS; APPEND_TO_TMP(yyleng); } \n { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); BEGIN(S_SCRIPT); } -([^-\n]+|-[^>\n]+) { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } --> { UPDATE_BUFPOS; APPEND_TO_TMP(yyleng); BEGIN(S_SCRIPT); } . { return T_WAIT; } [^*]+|\*[^/]+ { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } \*\/ { UPDATE_BUFPOS; APPEND_TO_TMP(yyleng); BEGIN(S_SCRIPT); } . { return T_WAIT; } /*********************** STYLE ************************/
/* Style content is appended to tmp_buf until a closing style tag is matched (letters in either case); the collected text is then returned as T_STYLE and the scanner goes back to INITIAL. The trailing-context rules that follow append a partial closing tag that is not followed by the next expected letter. */
<\/{RX_WHITE_SPACE}*[Ss][Tt][Yy][Ll][Ee]{RX_WHITE_SPACE}*> { UPDATE_BUFPOS; UPDATE_LINE; SETLVAL; BEGIN(INITIAL); RETURN(T_STYLE); } [^<]+ { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } /* this is so shitty */ <\/{RX_WHITE_SPACE}*/[^Ss] { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } <\/{RX_WHITE_SPACE}*[Ss]/[^Tt] { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } <\/{RX_WHITE_SPACE}*[Ss][Tt]/[^Yy] { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } <\/{RX_WHITE_SPACE}*[Ss][Tt][Yy]/[^Ll] { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } <\/{RX_WHITE_SPACE}*[Ss][Tt][Yy][Ll]/[^Ee] { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } <\/{RX_WHITE_SPACE}*[Ss][Tt][Yy][Ll][Ee]{RX_WHITE_SPACE}*/[^>] { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } . 
{ return T_WAIT; } /*********************** ATTRS ************************/
/* Attribute parsing. An attribute name is collected in tmp_buf and the scanner moves to S_ATTR2. The closing rules run FLUSH_ATTRS, which stores a pending name without value in tmp_attrs, and then return the element token built by SET_ATTR_LVAL. An equal sign (optionally surrounded by white space) lowercases the name, stores it in tmp_attrname and enters S_ATTR4 to read the value. When another name follows instead of an equal sign, the previous name is stored in tmp_attrs with the value None and the new name starts a fresh tmp_buf. NOTE(review): start-condition prefixes (presumably S_ATTR1 to S_ATTR3) are not visible on these rules - confirm against upstream. */
{RX_NAME} { UPDATE_BUFPOS; UPDATE_COLUMN; APPEND_TO_TMP(yyleng); BEGIN(S_ATTR2); } \/> { UPDATE_BUFPOS; UPDATE_COLUMN; FLUSH_ATTRS; BEGIN(INITIAL); SET_ATTR_LVAL; RETURN(T_ELEMENT_START_END); } > { UPDATE_BUFPOS; UPDATE_COLUMN; FLUSH_ATTRS; SCRIPT_CHECK; SET_ATTR_LVAL; RETURN(T_ELEMENT_START); } {RX_DATA} { UPDATE_BUFPOS; UPDATE_COLUMN; APPEND_TO_TMP(yyleng); } {RX_WHITE_SPACE}+ { UPDATE_BUFPOS; UPDATE_LINE; BEGIN(S_ATTR3); } {RX_WHITE_SPACE}*{RX_EQUAL}{RX_WHITE_SPACE}* { UPDATE_BUFPOS; UPDATE_LINE; LOWER_TMP; PYSTRING_TMP(yyextra->tmp_attrname); RESIZE_BUF(yyextra->tmp_buf, 1); BEGIN(S_ATTR4); } {RX_NAME} { UPDATE_BUFPOS; UPDATE_COLUMN; LOWER_TMP; PYSTRING_TMP(yyextra->tmp_attrname); RESIZE_BUF(yyextra->tmp_buf, 1); if (yyextra->tmp_attrval!=NULL) return T_ERROR; Py_INCREF(Py_None); yyextra->tmp_attrval = Py_None; if (PyDict_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname, yyextra->tmp_attrval)==-1) return T_ERROR; Py_DECREF(yyextra->tmp_attrname); Py_DECREF(yyextra->tmp_attrval); yyextra->tmp_attrname = yyextra->tmp_attrval = NULL; APPEND_TO_TMP(yyleng); BEGIN(S_ATTR2); } .|\n { /* this also skips whitespace! 
*/ UPDATE_BUFPOS; UPDATE_LINE; }
/* Attribute value handling. A double quote (optionally preceded by a backslash) enters S_STRING and an apostrophe enters S_APOSSTRING; any other run without white space, quotes or a closing bracket starts an unquoted value and enters S_ATTR5. When the tag is closed, the collected value is turned into a Python string, passed through quote_string(), stored under tmp_attrname in tmp_attrs, and the element token is returned. NOTE(review): the closing rules on this line pass the quote_string() result to PyDict_SetItem without checking it for NULL, unlike the quote-terminated rules further down in the file - verify. NOTE(review): start-condition prefixes are not visible on these rules - confirm against upstream. */
\\\" { /* backslash escapes seen at freshmeat.net */ UPDATE_BUFPOS; UPDATE_COLUMN; BEGIN(S_STRING); } \" { UPDATE_BUFPOS; UPDATE_COLUMN; BEGIN(S_STRING); } \' { UPDATE_BUFPOS; UPDATE_COLUMN; BEGIN(S_APOSSTRING); } [^\012 \t\b\r\n>\'\"]+ { UPDATE_BUFPOS; UPDATE_COLUMN; APPEND_TO_TMP(yyleng); BEGIN(S_ATTR5); } > { UPDATE_BUFPOS; UPDATE_COLUMN; PYSTRING_TMP(yyextra->tmp_attrval); RESIZE_BUF(yyextra->tmp_buf, 1); yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval); if (PyDict_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname, yyextra->tmp_attrval)==-1) return T_ERROR; Py_DECREF(yyextra->tmp_attrname); Py_DECREF(yyextra->tmp_attrval); yyextra->tmp_attrname = yyextra->tmp_attrval = NULL; SCRIPT_CHECK; SET_ATTR_LVAL; RETURN(T_ELEMENT_START); } \/> { UPDATE_BUFPOS; UPDATE_COLUMN; PYSTRING_TMP(yyextra->tmp_attrval); RESIZE_BUF(yyextra->tmp_buf, 1); yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval); if (PyDict_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname, yyextra->tmp_attrval)==-1) return T_ERROR; Py_DECREF(yyextra->tmp_attrname); Py_DECREF(yyextra->tmp_attrval); yyextra->tmp_attrname = yyextra->tmp_attrval = NULL; BEGIN(INITIAL); SET_ATTR_LVAL; RETURN(T_ELEMENT_START_END); } {RX_WHITE_SPACE}+ { UPDATE_BUFPOS; UPDATE_LINE; } [^\012 \t\b\r\n>]+ { UPDATE_BUFPOS; UPDATE_COLUMN; APPEND_TO_TMP(yyleng); } > { UPDATE_BUFPOS; UPDATE_COLUMN; PYSTRING_TMP(yyextra->tmp_attrval); RESIZE_BUF(yyextra->tmp_buf, 1); yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval); if (PyDict_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname, yyextra->tmp_attrval)==-1) return T_ERROR; Py_DECREF(yyextra->tmp_attrname); Py_DECREF(yyextra->tmp_attrval); yyextra->tmp_attrname = yyextra->tmp_attrval = NULL; SCRIPT_CHECK; SET_ATTR_LVAL; RETURN(T_ELEMENT_START); } \/> { UPDATE_BUFPOS; UPDATE_COLUMN; PYSTRING_TMP(yyextra->tmp_attrval); RESIZE_BUF(yyextra->tmp_buf, 1); yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval); if 
(PyDict_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname, yyextra->tmp_attrval)==-1) return T_ERROR; Py_DECREF(yyextra->tmp_attrname); Py_DECREF(yyextra->tmp_attrval); yyextra->tmp_attrname = yyextra->tmp_attrval = NULL; BEGIN(INITIAL); SET_ATTR_LVAL; RETURN(T_ELEMENT_START_END); }
/* White space ends an unquoted value: the value is passed through quote_string(), stored in tmp_attrs under tmp_attrname, and parsing continues with the next attribute in S_ATTR1. The closing apostrophe and closing double-quote rules that follow do the same for quoted values and additionally return T_ERROR when quote_string() yields NULL; inside a quoted value everything else is appended to tmp_buf. NOTE(review): start-condition prefixes are not visible on these rules - confirm against upstream. */
{RX_WHITE_SPACE}+ { UPDATE_BUFPOS; UPDATE_LINE; PYSTRING_TMP(yyextra->tmp_attrval); RESIZE_BUF(yyextra->tmp_buf, 1); yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval); if (PyDict_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname, yyextra->tmp_attrval)==-1) return T_ERROR; Py_DECREF(yyextra->tmp_attrname); Py_DECREF(yyextra->tmp_attrval); yyextra->tmp_attrname = yyextra->tmp_attrval = NULL; BEGIN(S_ATTR1); } \' { UPDATE_BUFPOS; UPDATE_COLUMN; PYSTRING_TMP(yyextra->tmp_attrval); RESIZE_BUF(yyextra->tmp_buf, 1); yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval); if (!yyextra->tmp_attrval) return T_ERROR; if (PyDict_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname, yyextra->tmp_attrval)==-1) return T_ERROR; Py_DECREF(yyextra->tmp_attrname); Py_DECREF(yyextra->tmp_attrval); yyextra->tmp_attrname = yyextra->tmp_attrval = NULL; BEGIN(S_ATTR1); } [^']+ { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } \" { UPDATE_BUFPOS; UPDATE_COLUMN; PYSTRING_TMP(yyextra->tmp_attrval); RESIZE_BUF(yyextra->tmp_buf, 1); yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval); if (!yyextra->tmp_attrval) { return T_ERROR; } if (PyDict_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname, yyextra->tmp_attrval)==-1) return T_ERROR; Py_DECREF(yyextra->tmp_attrname); Py_DECREF(yyextra->tmp_attrval); yyextra->tmp_attrval = yyextra->tmp_attrname = NULL; BEGIN(S_ATTR1); } [^"]+ { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); } /*********************** TAGEND ************************/
/* End tags: a < and a slash (white space allowed around the slash) with a letter as trailing context enter S_TAGEND, where the tag name is collected in tmp_buf. */
<{RX_WHITE_SPACE}*\/{RX_WHITE_SPACE}*/[A-Za-z] { UPDATE_BUFPOS; UPDATE_LINE; BEGIN(S_TAGEND); } [^<>\r\n \t\b\012]+ { UPDATE_BUFPOS; UPDATE_COLUMN; APPEND_TO_TMP(yyleng); } > { UPDATE_BUFPOS; UPDATE_COLUMN; 
LOWER_TMP; SETLVAL; BEGIN(INITIAL); RETURN(T_ELEMENT_END); }
/* If a new tag opener shows up before the end tag was closed, an error string is stored in the error field of yyextra, the pending lowercased name is still returned as T_ELEMENT_END, and the scanner continues in S_TAGEND (for another end tag) or S_TAGSTART (for a start tag, after allocating a new tmp_attrs dict). White space after the name switches to S_TAGEND2, whose rules discard everything up to the closing bracket. NOTE(review): the discard rule further down uses UPDATE_COLUMN although its pattern can match a newline, which the UPDATE_COLUMN description elsewhere in this file says to avoid - verify. NOTE(review): start-condition prefixes are not visible on these rules - confirm against upstream. */
<{RX_WHITE_SPACE}*\/{RX_WHITE_SPACE}*/[A-Za-z] { UPDATE_BUFPOS; UPDATE_LINE; LOWER_TMP; yyextra->error = PyString_FromFormat("missing > in end tag `%s'", yyextra->tmp_buf); SETLVAL; BEGIN(S_TAGEND); RETURN(T_ELEMENT_END); } <{RX_WHITE_SPACE}*/[A-Za-z] { UPDATE_BUFPOS; UPDATE_LINE; LOWER_TMP; yyextra->error = PyString_FromFormat("missing > in end tag `%s'", yyextra->tmp_buf); SETLVAL; if (!(yyextra->tmp_attrs = PyDict_New())) return T_ERROR; BEGIN(S_TAGSTART); RETURN(T_ELEMENT_END); } {RX_WHITE_SPACE}+ { UPDATE_BUFPOS; UPDATE_LINE; /* ignore any trailing garbage of this end tag */ BEGIN(S_TAGEND2); } > { UPDATE_BUFPOS; UPDATE_COLUMN; LOWER_TMP; SETLVAL; BEGIN(INITIAL); RETURN(T_ELEMENT_END); } [^<>]+ { UPDATE_BUFPOS; UPDATE_COLUMN; } <{RX_WHITE_SPACE}*\/{RX_WHITE_SPACE}*/[A-Za-z] { UPDATE_BUFPOS; UPDATE_LINE; LOWER_TMP; yyextra->error = PyString_FromFormat("missing > in end tag `%s'", yyextra->tmp_buf); SETLVAL; BEGIN(S_TAGEND); RETURN(T_ELEMENT_END); } <{RX_WHITE_SPACE}*/[A-Za-z] { UPDATE_BUFPOS; UPDATE_LINE; LOWER_TMP; yyextra->error = PyString_FromFormat("missing > in end tag `%s'", yyextra->tmp_buf); SETLVAL; if (!(yyextra->tmp_attrs = PyDict_New())) return T_ERROR; BEGIN(S_TAGSTART); RETURN(T_ELEMENT_END); } . { return T_WAIT; } /*********************** TEXT ************************/
/* Everything up to the next tag opener is returned as T_TEXT, and so is an opener followed by a character that cannot start a tag, a declaration, a processing instruction or an end tag. */
[^<]+ { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); SETLVAL; RETURN(T_TEXT); } <[^\012 \t\b\r\nA-Za-z!?/] { UPDATE_BUFPOS; UPDATE_COLUMN; APPEND_TO_TMP(yyleng); SETLVAL; RETURN(T_TEXT); } <{RX_WHITE_SPACE}+[^A-Za-z/] { UPDATE_BUFPOS; UPDATE_LINE; APPEND_TO_TMP(yyleng); SETLVAL; RETURN(T_TEXT); } . 
{ return T_WAIT; }

%%

/* Create a reentrant scanner in *scanner and attach data as its extra
   (per-scanner) user data.
   Returns 0 on success, or the non-zero result of yylex_init() when the
   scanner could not be created (in which case *scanner must not be used). */
int htmllexInit (void** scanner, UserData* data) {
    int res = yylex_init(scanner);
    if (res != 0) {
        return res;
    }
    yyset_extra(data, *scanner);
    return 0;
}

/* Set the debug flag of *scanner to debug and return the previous value. */
int htmllexDebug (void** scanner, int debug) {
    int old = yyget_debug(*scanner);
    yyset_debug(debug, *scanner);
    return old;
}

/* prepare scanner for calls to yylex()
   Appends the slen bytes at s to data->buf and sets up a flex buffer that
   covers the not yet scanned part of data->buf.
   Returns 0 on success or T_ERROR if the buffer could not be resized. */
int htmllexStart (void* scanner, UserData* data, const char* s, int slen) {
    /* append s to data buffer and scan those bytes.
       As Flex does not distinguish between NUL and EOF characters,
       we must replace NUL with ' '. */
    int len = (int)strlen(data->buf);
    int i;
    RESIZE_BUF(data->buf, len+slen+1);
    /* NOTE(review): this loop header was lost from the file and has been
       reconstructed from the loop body and the terminator set below. */
    for (i=0; i<slen; i++) {
        data->buf[len+i] = (s[i]==0 ? ' ' : s[i]);
    }
    data->buf[len+slen] = '\0';
    if (len > data->bufpos) {
        /* bytes left unscanned by the previous run are scanned again */
        int rewind = len - data->bufpos;
        slen += rewind;
        len -= rewind;
    }
    /* reset userdata */
    data->bufpos = len;
    data->exc_type = NULL;
    data->exc_val = NULL;
    data->exc_tb = NULL;
    if (yyget_debug(scanner)) {
        fprintf(stderr, "SCANNING `%s'\n", data->buf+len);
    }
    data->lexbuf = yy_scan_bytes(data->buf+len, slen, scanner);
    return 0;
}

/* delete scanned buffer data
   Releases the flex buffer created by htmllexStart() and drops the first
   data->nextpos bytes of data->buf, shifting the rest to the front.
   Returns 0 on success or T_ERROR if the buffer could not be resized. */
int htmllexStop (void* scanner, UserData* data) {
    yy_delete_buffer(data->lexbuf, scanner);
    if (data->nextpos > 0) {
        int len = (int)strlen(data->buf);
        int i, j;
        /* NOTE(review): this loop header was lost from the file and has
           been reconstructed from the loop body and the terminator set
           right after the loop. */
        for (i=data->nextpos,j=0; i<len; i++,j++) {
            data->buf[j] = data->buf[i];
        }
        data->buf[j] = '\0';
        RESIZE_BUF(data->buf, len-data->nextpos+1);
        data->bufpos -= data->nextpos;
        data->nextpos = 0;
    }
    return 0;
}

/* Free all resources of the scanner. Always returns 0. */
int htmllexDestroy (void* scanner) {
    yylex_destroy(scanner);
    return 0;
}