merged from webcleaner

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1205 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2004-01-28 23:38:00 +00:00
parent f4dde29117
commit 4df200a2d2
5 changed files with 5290 additions and 5392 deletions

File diff suppressed because it is too large Load diff

View file

@ -1,7 +1,4 @@
/* Find recognizable tokens in (probably bad formatted) HTML streams.
Unrecognizable character data is passed on as a TEXT token.
Copyright (C) 2000-2004 Bastian Kleineidam
/* Copyright (C) 2000-2004 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -17,6 +14,9 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/* Find recognizable tokens in (probably badly formatted) HTML streams.
Unrecognizable character data is passed on as a TEXT token.
*/
%{
#include "htmlsax.h"
@ -92,15 +92,17 @@
}
/* update the buffer and scanner positions */
#define UPDATE_BUFPOS yyextra->bufpos += yyleng; yyextra->pos += yyleng
#define _UPDATE_BUFPOS yyextra->bufpos += yyleng; yyextra->pos += yyleng
/* update the buffer, scanner and column positions; use this *only* in rules
that cannot match the newline char '\n'!
*/
#define UPDATE_COLUMN yyextra->column += yyleng
#define UPDATE_COLUMN _UPDATE_BUFPOS; yyextra->column += yyleng
/* update the buffer and scanner positions as well as the line and column
position; use this in rules that can match the newline char '\n'.
*/
#define UPDATE_LINE { \
#define UPDATE_LINE _UPDATE_BUFPOS; { \
int i; \
for (i=0; i<yyleng; ++i) { \
if (yytext[i] == '\n') { \
@ -175,9 +177,9 @@ void yyfree (void* ptr, void* yyscanner) {
/* regular expression definitions used below */
RX_WHITE_SPACE [\n\r\ \t\b\012]
RX_EQUAL =
RX_NAME [a-zA-Z]([-a-zA-Z0-9_])*
RX_DATA [-a-zA-Z0-9_:]+
RX_EQUAL =
RX_NAME [a-zA-Z]([-a-zA-Z0-9_])*
RX_DATA [-a-zA-Z0-9_:]+
%%
@ -191,13 +193,11 @@ RX_DATA [-a-zA-Z0-9_:]+
/* Note: eonline had some "<! --" comments */
/* Note: a bug report about "<!- " comments was filed */
<INITIAL><![ ]*-[- ] {
UPDATE_BUFPOS;
UPDATE_COLUMN;
BEGIN(S_COMMENT);
}
<S_COMMENT>-*--> {
UPDATE_BUFPOS;
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng-3);
SETLVAL;
@ -206,13 +206,11 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_COMMENT>[^-]+ {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_COMMENT>-+[^->]+ {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
@ -223,24 +221,24 @@ RX_DATA [-a-zA-Z0-9_:]+
/* Note: www.nba.com had some <! Copyright !> comment */
<INITIAL><![ ]+ {
UPDATE_BUFPOS;
UPDATE_COLUMN;
BEGIN(S_COMMENT2);
}
<S_COMMENT2>!> {
UPDATE_BUFPOS;
UPDATE_COLUMN;
SETLVAL;
BEGIN(INITIAL);
RETURN(T_COMMENT);
}
<S_COMMENT2>[^!]+ {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_COMMENT2>![^>]+ {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
@ -251,13 +249,11 @@ RX_DATA [-a-zA-Z0-9_:]+
/*********************** DOCTYPE ************************/
<INITIAL><![Dd][Oo][Cc][Tt][Yy][Pp][Ee] {
UPDATE_BUFPOS;
UPDATE_COLUMN;
BEGIN(S_DOCTYPE);
}
<S_DOCTYPE>> {
UPDATE_BUFPOS;
UPDATE_COLUMN;
SETLVAL;
BEGIN(INITIAL);
@ -265,20 +261,17 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_DOCTYPE>[^>]+ {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
/*********************** CDATA ************************/
<INITIAL><!\[CDATA\[ {
UPDATE_BUFPOS;
UPDATE_COLUMN;
BEGIN(S_CDATA);
}
<S_CDATA>\]*\]\]> {
UPDATE_BUFPOS;
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng-3);
SETLVAL;
@ -287,13 +280,11 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_CDATA>[^\]]+ {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_CDATA>\]+[^>\]]+ {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
@ -304,13 +295,16 @@ RX_DATA [-a-zA-Z0-9_:]+
/*********************** PI ************************/
<INITIAL><\? {
UPDATE_BUFPOS;
UPDATE_COLUMN;
BEGIN(S_PI);
}
<S_PI>\?*\?> {
UPDATE_BUFPOS;
<S_PI>[^?>]+ {
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_PI>\?+> {
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng-2);
SETLVAL;
@ -318,16 +312,16 @@ RX_DATA [-a-zA-Z0-9_:]+
RETURN(T_PI);
}
<S_PI>[^?]+ {
UPDATE_BUFPOS;
<S_PI>\?+[^?>]+ {
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_PI>\?+[^?>]+ {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
<S_PI>> {
UPDATE_COLUMN;
SETLVAL;
BEGIN(INITIAL);
RETURN(T_PI);
}
<S_PI>. {
@ -337,23 +331,21 @@ RX_DATA [-a-zA-Z0-9_:]+
/*********************** TAGSTART ************************/
<INITIAL><{RX_WHITE_SPACE}*/[A-Za-z] {
UPDATE_BUFPOS;
UPDATE_LINE;
yyextra->tmp_attrs = PyDict_New();
if (yyextra->tmp_attrs==NULL) return T_ERROR;
if ((yyextra->tmp_attrs = PyDict_New())==NULL) {
return T_ERROR;
}
BEGIN(S_TAGSTART);
}
<S_TAGSTART>[^ \t\r\n\b\012/<>]+ {
/* actually accept a lot of tag chars, which may be illegal,
but we dont care, its the browsers job */
UPDATE_BUFPOS;
but we don't care, it's the browser's job */
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
}
<S_TAGSTART>{RX_WHITE_SPACE}+ {
UPDATE_BUFPOS;
UPDATE_LINE;
LOWER_TMP;
PYSTRING_TMP(yyextra->tmp_tag);
@ -362,7 +354,6 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_TAGSTART>\/> {
UPDATE_BUFPOS;
UPDATE_COLUMN;
BEGIN(INITIAL);
if (!strlen(yyextra->tmp_buf)) {
@ -381,11 +372,10 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_TAGSTART>> {
UPDATE_BUFPOS;
UPDATE_COLUMN;
BEGIN(INITIAL);
if (!strlen(yyextra->tmp_buf)) {
/* the tag name was empty, assume a stray "</>" */
/* the tag name was empty, assume a stray "<>" */
RESIZE_BUF(yyextra->tmp_buf, 3);
strcpy(yyextra->tmp_buf, "<>");
yyextra->tmp_attrs = NULL;
@ -406,7 +396,6 @@ RX_DATA [-a-zA-Z0-9_:]+
/*********************** SCRIPT ************************/
<S_SCRIPT><\/{RX_WHITE_SPACE}*[Ss][Cc][Rr][Ii][Pp][Tt]{RX_WHITE_SPACE}*> {
UPDATE_BUFPOS;
UPDATE_LINE;
SETLVAL;
BEGIN(INITIAL);
@ -414,86 +403,76 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_SCRIPT>[^/'"<]+ {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_SCRIPT>\' {
UPDATE_BUFPOS;
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
BEGIN(S_SCRIPT_APOS);
}
<S_SCRIPT>\" {
UPDATE_BUFPOS;
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
BEGIN(S_SCRIPT_STRING);
}
<S_SCRIPT>\/\/ {
UPDATE_BUFPOS;
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
BEGIN(S_SCRIPT_COMMENT);
}
<S_SCRIPT>\/\* {
UPDATE_BUFPOS;
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
BEGIN(S_SCRIPT_MCOMMENT);
}
<S_SCRIPT>\/[^/*] {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
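/* XXX ugly: one rule per partial "</script" prefix, so that anything short of the full end tag is appended like ordinary script text */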
<S_SCRIPT></[^/] {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_SCRIPT><\/{RX_WHITE_SPACE}*/[^Ss] {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_SCRIPT><\/{RX_WHITE_SPACE}*[Ss]/[^Cc] {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_SCRIPT><\/{RX_WHITE_SPACE}*[Ss][Cc]/[^Rr] {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_SCRIPT><\/{RX_WHITE_SPACE}*[Ss][Cc][Rr]/[^Ii] {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_SCRIPT><\/{RX_WHITE_SPACE}*[Ss][Cc][Rr][Ii]/[^Pp] {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_SCRIPT><\/{RX_WHITE_SPACE}*[Ss][Cc][Rr][Ii][Pp]/[^Tt] {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_SCRIPT><\/{RX_WHITE_SPACE}*[Ss][Cc][Rr][Ii][Pp][Tt]{RX_WHITE_SPACE}*/[^>] {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
@ -503,75 +482,69 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_SCRIPT_APOS>\\ {
UPDATE_BUFPOS;
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
BEGIN(S_SCRIPT_APOS_ESC);
}
<S_SCRIPT_APOS>[^\\']+ {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_SCRIPT_APOS>\' {
UPDATE_BUFPOS;
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
BEGIN(S_SCRIPT);
}
<S_SCRIPT_APOS_ESC>. {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
BEGIN(S_SCRIPT_APOS);
}
<S_SCRIPT_STRING>\\ {
UPDATE_BUFPOS;
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
BEGIN(S_SCRIPT_STRING_ESC);
}
<S_SCRIPT_STRING>[^\\"]+ {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_SCRIPT_STRING>\" {
UPDATE_BUFPOS;
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
BEGIN(S_SCRIPT);
}
<S_SCRIPT_STRING_ESC>. {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
BEGIN(S_SCRIPT_STRING);
}
<S_SCRIPT_COMMENT>[^\-\r\n]+ {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_SCRIPT_COMMENT>[\r\n] {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
BEGIN(S_SCRIPT);
}
<S_SCRIPT_COMMENT>-([^-\r\n]+|-[^>\r\n]+) {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_SCRIPT_COMMENT>--> {
UPDATE_BUFPOS;
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
BEGIN(S_SCRIPT);
}
@ -581,20 +554,18 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_SCRIPT_MCOMMENT>[^*]+|\* {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_SCRIPT_MCOMMENT>\*\/ {
UPDATE_BUFPOS;
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
BEGIN(S_SCRIPT);
}
/*********************** STYLE ************************/
<S_STYLE><\/{RX_WHITE_SPACE}*[Ss][Tt][Yy][Ll][Ee]{RX_WHITE_SPACE}*> {
UPDATE_BUFPOS;
UPDATE_LINE;
SETLVAL;
BEGIN(INITIAL);
@ -602,50 +573,42 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_STYLE>[^<]+ {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
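/* XXX ugly: one rule per partial "</style" prefix, so that anything short of the full end tag is appended like ordinary style text */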
<S_STYLE></[^/] {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_STYLE><\/{RX_WHITE_SPACE}*/[^Ss] {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_STYLE><\/{RX_WHITE_SPACE}*[Ss]/[^Tt] {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_STYLE><\/{RX_WHITE_SPACE}*[Ss][Tt]/[^Yy] {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_STYLE><\/{RX_WHITE_SPACE}*[Ss][Tt][Yy]/[^Ll] {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_STYLE><\/{RX_WHITE_SPACE}*[Ss][Tt][Yy][Ll]/[^Ee] {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_STYLE><\/{RX_WHITE_SPACE}*[Ss][Tt][Yy][Ll][Ee]{RX_WHITE_SPACE}*/[^>] {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
@ -656,14 +619,12 @@ RX_DATA [-a-zA-Z0-9_:]+
/*********************** ATTRS ************************/
<S_ATTR1>{RX_NAME} {
UPDATE_BUFPOS;
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
BEGIN(S_ATTR2);
}
<S_ATTR1,S_ATTR2,S_ATTR3>\/> {
UPDATE_BUFPOS;
UPDATE_COLUMN;
FLUSH_ATTRS;
BEGIN(INITIAL);
@ -672,7 +633,6 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_ATTR1,S_ATTR2,S_ATTR3>> {
UPDATE_BUFPOS;
UPDATE_COLUMN;
FLUSH_ATTRS;
SCRIPT_CHECK;
@ -681,19 +641,16 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_ATTR2>{RX_DATA} {
UPDATE_BUFPOS;
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
}
<S_ATTR2>{RX_WHITE_SPACE}+ {
UPDATE_BUFPOS;
UPDATE_LINE;
BEGIN(S_ATTR3);
}
<S_ATTR2,S_ATTR3>{RX_WHITE_SPACE}*{RX_EQUAL}{RX_WHITE_SPACE}* {
UPDATE_BUFPOS;
UPDATE_LINE;
LOWER_TMP;
PYSTRING_TMP(yyextra->tmp_attrname);
@ -702,7 +659,6 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_ATTR3>{RX_NAME} {
UPDATE_BUFPOS;
UPDATE_COLUMN;
LOWER_TMP;
PYSTRING_TMP(yyextra->tmp_attrname);
@ -710,9 +666,10 @@ RX_DATA [-a-zA-Z0-9_:]+
if (yyextra->tmp_attrval!=NULL) return T_ERROR;
Py_INCREF(Py_None);
yyextra->tmp_attrval = Py_None;
if (PyDict_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname, yyextra->tmp_attrval)==-1) return T_ERROR;
Py_DECREF(yyextra->tmp_attrname);
Py_DECREF(yyextra->tmp_attrval);
if (PyDict_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
yyextra->tmp_attrval)==-1) return T_ERROR;
/*Py_DECREF(yyextra->tmp_attrname);*/
/*Py_DECREF(yyextra->tmp_attrval);*/
yyextra->tmp_attrname = yyextra->tmp_attrval = NULL;
APPEND_TO_TMP(yyleng);
BEGIN(S_ATTR2);
@ -720,40 +677,34 @@ RX_DATA [-a-zA-Z0-9_:]+
<S_ATTR1,S_ATTR2,S_ATTR3>.|\n {
/* this also skips whitespace! */
UPDATE_BUFPOS;
UPDATE_LINE;
}
<S_ATTR4>\\\" {
/* backslash escapes seen at freshmeat.net */
UPDATE_BUFPOS;
UPDATE_COLUMN;
BEGIN(S_STRING);
}
<S_ATTR4>\" {
UPDATE_BUFPOS;
UPDATE_COLUMN;
BEGIN(S_STRING);
}
<S_ATTR4>\' {
UPDATE_BUFPOS;
UPDATE_COLUMN;
BEGIN(S_APOSSTRING);
}
<S_ATTR4>[^\012 \t\b\r\n>\'\"]+ {
UPDATE_BUFPOS;
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
BEGIN(S_ATTR5);
}
<S_ATTR4>> {
UPDATE_BUFPOS;
UPDATE_COLUMN;
PYSTRING_TMP(yyextra->tmp_attrval);
RESIZE_BUF(yyextra->tmp_buf, 1);
@ -771,7 +722,6 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_ATTR4>\/> {
UPDATE_BUFPOS;
UPDATE_COLUMN;
PYSTRING_TMP(yyextra->tmp_attrval);
RESIZE_BUF(yyextra->tmp_buf, 1);
@ -780,8 +730,8 @@ RX_DATA [-a-zA-Z0-9_:]+
if (PyDict_SetItem(yyextra->tmp_attrs,
yyextra->tmp_attrname,
yyextra->tmp_attrval)==-1) return T_ERROR;
Py_DECREF(yyextra->tmp_attrname);
Py_DECREF(yyextra->tmp_attrval);
/*Py_DECREF(yyextra->tmp_attrname);*/
/*Py_DECREF(yyextra->tmp_attrval);*/
yyextra->tmp_attrname = yyextra->tmp_attrval = NULL;
BEGIN(INITIAL);
SET_ATTR_LVAL;
@ -789,18 +739,15 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_ATTR4>{RX_WHITE_SPACE}+ {
UPDATE_BUFPOS;
UPDATE_LINE;
}
<S_ATTR5>[^\012 \t\b\r\n>]+ {
UPDATE_BUFPOS;
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
}
<S_ATTR5>> {
UPDATE_BUFPOS;
UPDATE_COLUMN;
PYSTRING_TMP(yyextra->tmp_attrval);
RESIZE_BUF(yyextra->tmp_buf, 1);
@ -818,7 +765,6 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_ATTR5>\/> {
UPDATE_BUFPOS;
UPDATE_COLUMN;
PYSTRING_TMP(yyextra->tmp_attrval);
RESIZE_BUF(yyextra->tmp_buf, 1);
@ -836,7 +782,6 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_ATTR5>{RX_WHITE_SPACE}+ {
UPDATE_BUFPOS;
UPDATE_LINE;
PYSTRING_TMP(yyextra->tmp_attrval);
RESIZE_BUF(yyextra->tmp_buf, 1);
@ -852,14 +797,12 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_APOSSTRING>\\ {
UPDATE_BUFPOS;
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
BEGIN(S_APOSSTRING_ESC);
}
<S_APOSSTRING>\' {
UPDATE_BUFPOS;
UPDATE_COLUMN;
PYSTRING_TMP(yyextra->tmp_attrval);
RESIZE_BUF(yyextra->tmp_buf, 1);
@ -875,28 +818,24 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_APOSSTRING>[^']+ {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_APOSSTRING_ESC>. {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
BEGIN(S_APOSSTRING);
}
<S_STRING>\\ {
UPDATE_BUFPOS;
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
BEGIN(S_STRING_ESC);
}
<S_STRING>\" {
UPDATE_BUFPOS;
UPDATE_COLUMN;
PYSTRING_TMP(yyextra->tmp_attrval);
RESIZE_BUF(yyextra->tmp_buf, 1);
@ -912,13 +851,11 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_STRING>[^"]+ {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_STRING_ESC>. {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
BEGIN(S_STRING);
@ -927,19 +864,16 @@ RX_DATA [-a-zA-Z0-9_:]+
/*********************** TAGEND ************************/
<INITIAL><{RX_WHITE_SPACE}*\/{RX_WHITE_SPACE}*/[A-Za-z] {
UPDATE_BUFPOS;
UPDATE_LINE;
BEGIN(S_TAGEND);
}
<S_TAGEND>[^<>\r\n \t\b\012]+ {
UPDATE_BUFPOS;
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
}
<S_TAGEND>{RX_WHITE_SPACE}*> {
UPDATE_BUFPOS;
UPDATE_LINE;
LOWER_TMP;
SETLVAL;
@ -948,7 +882,6 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_TAGEND><{RX_WHITE_SPACE}*\/{RX_WHITE_SPACE}*/[A-Za-z] {
UPDATE_BUFPOS;
UPDATE_LINE;
LOWER_TMP;
yyextra->error = PyString_FromFormat("missing > in end tag `%s'", yyextra->tmp_buf);
@ -958,7 +891,6 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_TAGEND><{RX_WHITE_SPACE}*/[A-Za-z] {
UPDATE_BUFPOS;
UPDATE_LINE;
LOWER_TMP;
yyextra->error = PyString_FromFormat("missing > in end tag `%s'", yyextra->tmp_buf);
@ -969,14 +901,12 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_TAGEND>{RX_WHITE_SPACE}+ {
UPDATE_BUFPOS;
UPDATE_LINE;
/* ignore any trailing garbage of this end tag */
BEGIN(S_TAGEND2);
}
<S_TAGEND2>> {
UPDATE_BUFPOS;
UPDATE_COLUMN;
LOWER_TMP;
SETLVAL;
@ -985,12 +915,10 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_TAGEND2>[^<>]+ {
UPDATE_BUFPOS;
UPDATE_COLUMN;
UPDATE_LINE;
}
<S_TAGEND2><{RX_WHITE_SPACE}*\/{RX_WHITE_SPACE}*/[A-Za-z] {
UPDATE_BUFPOS;
UPDATE_LINE;
LOWER_TMP;
yyextra->error = PyString_FromFormat("missing > in end tag `%s'", yyextra->tmp_buf);
@ -1000,12 +928,13 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<S_TAGEND2><{RX_WHITE_SPACE}*/[A-Za-z] {
UPDATE_BUFPOS;
UPDATE_LINE;
LOWER_TMP;
yyextra->error = PyString_FromFormat("missing > in end tag `%s'", yyextra->tmp_buf);
SETLVAL;
if (!(yyextra->tmp_attrs = PyDict_New())) return T_ERROR;
if ((yyextra->tmp_attrs = PyDict_New())==NULL) {
return T_ERROR;
}
BEGIN(S_TAGSTART);
RETURN(T_ELEMENT_END);
}
@ -1013,10 +942,8 @@ RX_DATA [-a-zA-Z0-9_:]+
<S_TAGEND2>. {
return T_WAIT;
}
/*********************** TEXT ************************/
<INITIAL>[^<]+ {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
SETLVAL;
@ -1024,7 +951,6 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<INITIAL><[^\012 \t\b\r\nA-Za-z!?/] {
UPDATE_BUFPOS;
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
SETLVAL;
@ -1032,7 +958,6 @@ RX_DATA [-a-zA-Z0-9_:]+
}
<INITIAL><{RX_WHITE_SPACE}+[^A-Za-z/] {
UPDATE_BUFPOS;
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
SETLVAL;

View file

@ -84,10 +84,9 @@
/* Copy the first part of user declarations. */
#line 2 "htmlparse.y"
#line 1 "htmlparse.y"
/* SAX parser, optimized for WebCleaner
Copyright (C) 2000-2004 Bastian Kleineidam
/* Copyright (C) 2000-2004 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -162,7 +161,6 @@ static PyObject* resolve_entities;
if (b==NULL) { Py_DECREF(self); return NULL; } \
(b)[0] = '\0'
/* call error handler if error object is not NULL */
#define CHECK_ERROR(ud, label) \
if (ud->error && PyObject_HasAttrString(ud->handler, "error")==1) { \
callback = PyObject_GetAttrString(ud->handler, "error"); \
@ -231,7 +229,7 @@ typedef int YYSTYPE;
/* Line 214 of yacc.c. */
#line 235 "htmlparse.c"
#line 233 "htmlparse.c"
#if ! defined (yyoverflow) || YYERROR_VERBOSE
@ -401,8 +399,8 @@ static const yysigned_char yyrhs[] =
/* YYRLINE[YYN] -- source line where rule number YYN was defined. */
static const unsigned short yyrline[] =
{
0, 146, 146, 147, 150, 151, 158, 192, 238, 268,
288, 308, 328, 348, 369, 390
0, 143, 143, 144, 147, 148, 155, 190, 237, 268,
289, 310, 331, 352, 374, 396
};
#endif
@ -1107,22 +1105,22 @@ yyreduce:
switch (yyn)
{
case 2:
#line 146 "htmlparse.y"
#line 143 "htmlparse.y"
{;}
break;
case 3:
#line 147 "htmlparse.y"
#line 144 "htmlparse.y"
{;}
break;
case 4:
#line 150 "htmlparse.y"
#line 147 "htmlparse.y"
{ YYACCEPT; /* wait for more lexer input */ ;}
break;
case 5:
#line 152 "htmlparse.y"
#line 149 "htmlparse.y"
{
/* an error occurred in the scanner, the Python exception must be set */
UserData* ud = yyget_extra(scanner);
@ -1132,9 +1130,10 @@ yyreduce:
break;
case 6:
#line 159 "htmlparse.y"
#line 156 "htmlparse.y"
{
/* $1 is a tuple (<tag>, <attrs>); <attrs> is a dictionary */
/* $1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyString, <attrs> is a PyDict */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1169,9 +1168,10 @@ finish_start:
break;
case 7:
#line 193 "htmlparse.y"
#line 191 "htmlparse.y"
{
/* $1 is a tuple (<tag>, <attrs>); <attrs> is a dictionary */
/* $1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyString, <attrs> is a PyDict */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1200,7 +1200,7 @@ finish_start:
Py_DECREF(result);
callback=result=NULL;
}
CHECK_ERROR(ud, finish_start);
CHECK_ERROR(ud, finish_start_end);
finish_start_end:
Py_XDECREF(ud->error);
ud->error = NULL;
@ -1218,8 +1218,9 @@ finish_start_end:
break;
case 8:
#line 239 "htmlparse.y"
#line 238 "htmlparse.y"
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1253,6 +1254,7 @@ finish_end:
case 9:
#line 269 "htmlparse.y"
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1274,8 +1276,9 @@ finish_comment:
break;
case 10:
#line 289 "htmlparse.y"
#line 290 "htmlparse.y"
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1297,8 +1300,9 @@ finish_pi:
break;
case 11:
#line 309 "htmlparse.y"
#line 311 "htmlparse.y"
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1320,8 +1324,9 @@ finish_cdata:
break;
case 12:
#line 329 "htmlparse.y"
#line 332 "htmlparse.y"
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1343,8 +1348,9 @@ finish_doctype:
break;
case 13:
#line 349 "htmlparse.y"
#line 353 "htmlparse.y"
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1367,8 +1373,9 @@ finish_script:
break;
case 14:
#line 370 "htmlparse.y"
#line 375 "htmlparse.y"
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -1391,8 +1398,9 @@ finish_style:
break;
case 15:
#line 391 "htmlparse.y"
#line 397 "htmlparse.y"
{
/* $1 is a PyString */
/* Remember this is also called as a lexer error fallback */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1418,7 +1426,7 @@ finish_characters:
}
/* Line 999 of yacc.c. */
#line 1422 "htmlparse.c"
#line 1430 "htmlparse.c"
yyvsp -= yylen;
yyssp -= yylen;
@ -1612,7 +1620,7 @@ yyreturn:
}
#line 413 "htmlparse.y"
#line 420 "htmlparse.y"
/* disable python memory interface */
@ -1627,7 +1635,6 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
{
return NULL;
}
self->handler = NULL;
/* reset userData */
self->userData = PyMem_New(UserData, sizeof(UserData));
@ -1894,7 +1901,7 @@ static PyMemberDef parser_members[] = {
};
static PyMethodDef parser_methods[] = {
{"feed", (PyCFunction)parser_feed, METH_VARARGS, "feed data to parse incremental"},
{"feed", (PyCFunction)parser_feed, METH_VARARGS, "feed data to parse incremental"},
{"reset", (PyCFunction)parser_reset, METH_VARARGS, "reset the parser (no flushing)"},
{"flush", (PyCFunction)parser_flush, METH_VARARGS, "flush parser buffers"},
{"debug", (PyCFunction)parser_debug, METH_VARARGS, "set debug level"},

View file

@ -1,7 +1,5 @@
/* the beginning */
%{
/* SAX parser, optimized for WebCleaner
Copyright (C) 2000-2004 Bastian Kleineidam
/* Copyright (C) 2000-2004 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -76,7 +74,6 @@ static PyObject* resolve_entities;
if (b==NULL) { Py_DECREF(self); return NULL; } \
(b)[0] = '\0'
/* call error handler if error object is not NULL */
#define CHECK_ERROR(ud, label) \
if (ud->error && PyObject_HasAttrString(ud->handler, "error")==1) { \
callback = PyObject_GetAttrString(ud->handler, "error"); \
@ -157,7 +154,8 @@ element: T_WAIT { YYACCEPT; /* wait for more lexer input */ }
}
| T_ELEMENT_START
{
/* $1 is a tuple (<tag>, <attrs>); <attrs> is a dictionary */
/* $1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyString, <attrs> is a PyDict */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -191,7 +189,8 @@ finish_start:
}
| T_ELEMENT_START_END
{
/* $1 is a tuple (<tag>, <attrs>); <attrs> is a dictionary */
/* $1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyString, <attrs> is a PyDict */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -220,7 +219,7 @@ finish_start:
Py_DECREF(result);
callback=result=NULL;
}
CHECK_ERROR(ud, finish_start);
CHECK_ERROR(ud, finish_start_end);
finish_start_end:
Py_XDECREF(ud->error);
ud->error = NULL;
@ -237,6 +236,7 @@ finish_start_end:
}
| T_ELEMENT_END
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -267,6 +267,7 @@ finish_end:
}
| T_COMMENT
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -287,6 +288,7 @@ finish_comment:
}
| T_PI
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -307,6 +309,7 @@ finish_pi:
}
| T_CDATA
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -327,6 +330,7 @@ finish_cdata:
}
| T_DOCTYPE
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -347,6 +351,7 @@ finish_doctype:
}
| T_SCRIPT
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -368,6 +373,7 @@ finish_script:
}
| T_STYLE
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
@ -389,6 +395,7 @@ finish_style:
}
| T_TEXT
{
/* $1 is a PyString */
/* Remember this is also called as a lexer error fallback */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -424,7 +431,6 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
{
return NULL;
}
self->handler = NULL;
/* reset userData */
self->userData = PyMem_New(UserData, sizeof(UserData));
@ -691,7 +697,7 @@ static PyMemberDef parser_members[] = {
};
static PyMethodDef parser_methods[] = {
{"feed", (PyCFunction)parser_feed, METH_VARARGS, "feed data to parse incremental"},
{"feed", (PyCFunction)parser_feed, METH_VARARGS, "feed data to parse incremental"},
{"reset", (PyCFunction)parser_reset, METH_VARARGS, "reset the parser (no flushing)"},
{"flush", (PyCFunction)parser_flush, METH_VARARGS, "flush parser buffers"},
{"debug", (PyCFunction)parser_debug, METH_VARARGS, "set debug level"},

View file

@ -1,5 +1,4 @@
/*
Copyright (C) 2000-2004 Bastian Kleineidam
/* Copyright (C) 2000-2004 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -36,7 +35,7 @@
/* user_data type for SAX calls */
typedef struct {
/* the Python SAX class instance to issue callbacks */
/* the Python SAX object to issue callbacks */
PyObject* handler;
/* Buffer to store still-to-be-scanned characters. After recognizing
* a complete syntax element, all data up to bufpos will be removed.