mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-01 03:24:43 +00:00
parser added
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@615 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
02d4f9135e
commit
bd1e7c158a
7 changed files with 1615 additions and 0 deletions
5
linkcheck/parser/.cvsignore
Normal file
5
linkcheck/parser/.cvsignore
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
*.pyc
|
||||
*.so
|
||||
*.pyo
|
||||
*.output
|
||||
*.o
|
||||
22
linkcheck/parser/Makefile
Normal file
22
linkcheck/parser/Makefile
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
# use beta flex from ftp://ftp.uncg.edu/people/wlestes/ for reentrant
|
||||
# bison parser support
|
||||
FLEX=/home/calvin/src/flex-2.5.23/flex
|
||||
|
||||
all: htmllex.c htmlparse.c
|
||||
|
||||
%.o: %.c
|
||||
gcc -g -O -Wall -pedantic -Wstrict-prototypes -fPIC -I. -I/usr/include/python2.1 -c $< -o $@
|
||||
|
||||
htmlparse.h htmlparse.c: htmlparse.y htmlsax.h
|
||||
bison htmlparse.y
|
||||
|
||||
htmllex.l: htmlparse.h
|
||||
|
||||
htmllex.c: htmllex.l htmlsax.h
|
||||
$(FLEX) htmllex.l
|
||||
|
||||
test: testsax
|
||||
cat test.html | ./testsax
|
||||
|
||||
clean:
|
||||
rm -f htmlparse.c htmlparse.h htmllex.c *.o *.output
|
||||
17
linkcheck/parser/__init__.py
Normal file
17
linkcheck/parser/__init__.py
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
"""Fast HTML parser module written in C"""
|
||||
# Copyright (C) 2000-2002 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
781
linkcheck/parser/htmllex.l
Normal file
781
linkcheck/parser/htmllex.l
Normal file
|
|
@ -0,0 +1,781 @@
|
|||
/* Find recognizable tokens in (probably badly formatted) HTML streams.
|
||||
Unrecognizable character data is passed on as a TEXT token.
|
||||
*/
|
||||
|
||||
%{
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include "htmlsax.h"
|
||||
|
||||
#define YYSTYPE PyObject*
|
||||
#define YY_EXTRA_TYPE UserData*
|
||||
|
||||
/* reset buffer a to empty string */
|
||||
#define CLEAR_BUF(a) \
|
||||
a = PyMem_Resize(a, char, 1); \
|
||||
if (a==NULL) return T_ERROR; \
|
||||
a[0] = '\0'
|
||||
|
||||
/* make python string from tmp_buf and assign it to a */
|
||||
#define PYSTRING_TMP(a) \
|
||||
a = PyString_FromString(yyextra->tmp_buf); \
|
||||
if (a==NULL) return T_ERROR
|
||||
|
||||
/* set return value from tmp_buf */
|
||||
#define SETLVAL {\
|
||||
PyObject* s; \
|
||||
PYSTRING_TMP(s); \
|
||||
CLEAR_BUF(yyextra->tmp_buf); \
|
||||
*yylval = s; \
|
||||
}
|
||||
|
||||
/* Append the first n bytes of yytext to tmp_buf, growing the buffer to fit.
 * The buffer is grown through a temporary so that tmp_buf stays valid when
 * the allocation fails; in that case MemoryError is set and the macro
 * expands to `return T_ERROR', so it may only be used inside a scanner
 * action. n is evaluated more than once. */
#define APPEND_TO_TMP(n) {\
    size_t len = strlen(yyextra->tmp_buf); \
    char* grown = yyextra->tmp_buf; \
    grown = PyMem_Resize(grown, char, len+(n)+1); \
    if (grown==NULL) { PyErr_NoMemory(); return T_ERROR; } \
    yyextra->tmp_buf = grown; \
    strncat(yyextra->tmp_buf, yytext, (n)); \
}
|
||||
|
||||
/* Lowercase tmp_buf in place.
 * The scanner is built with `%option 8bit', so bytes above 0x7f can end up
 * in tmp_buf; tolower() is only defined for unsigned char values (and EOF),
 * hence the cast. */
#define LOWER_TMP {\
    char* p = yyextra->tmp_buf; \
    while (*p) { *p = (char)tolower((unsigned char)*p); p++; } \
}
|
||||
|
||||
/* check for JavaScript or CSS tags; must be before SET_ATTR_LVAL */
|
||||
#define SCRIPT_CHECK \
|
||||
if (strcmp("script", PyString_AS_STRING(yyextra->tmp_tag))==0) \
|
||||
BEGIN(S_SCRIPT); \
|
||||
else if (strcmp("style", PyString_AS_STRING(yyextra->tmp_tag))==0) \
|
||||
BEGIN(S_STYLE); \
|
||||
else \
|
||||
BEGIN(INITIAL)
|
||||
|
||||
/* set return value from tag with attributes */
|
||||
#define SET_ATTR_LVAL \
|
||||
if (yyextra->tmp_tag==NULL || yyextra->tmp_attrs==NULL) { \
|
||||
PyErr_SetString(PyExc_TypeError, "tmp_tag or tmp_attrs is NULL"); \
|
||||
return T_ERROR; \
|
||||
} \
|
||||
*yylval = Py_BuildValue("(OO)", yyextra->tmp_tag, yyextra->tmp_attrs); \
|
||||
if ((*yylval)==NULL) return T_ERROR; \
|
||||
yyextra->tmp_tag = yyextra->tmp_attrs = NULL
|
||||
|
||||
/* store collected name as attribute in dictionary
|
||||
* tmp_attrname and tmp_attrval must be NULL
|
||||
*/
|
||||
#define FLUSH_ATTRS \
|
||||
if (strlen(yyextra->tmp_buf) > 0) { \
|
||||
PYSTRING_TMP(yyextra->tmp_attrname); \
|
||||
CLEAR_BUF(yyextra->tmp_buf); \
|
||||
if (PyDict_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname, Py_None)==-1) return T_ERROR; \
|
||||
Py_DECREF(yyextra->tmp_attrname); \
|
||||
yyextra->tmp_attrname = NULL; \
|
||||
}
|
||||
|
||||
/* update the buffer position */
|
||||
#define UPDATE_BUFPOS yyextra->bufpos += yyleng
|
||||
/* return a token, adjusting the nextpos value */
|
||||
#define RETURN(tok) yyextra->nextpos = yyextra->bufpos; return tok
|
||||
|
||||
/* XXX todo */
|
||||
#define SET_ERROR(s)
|
||||
|
||||
/* use Python's memory management */
|
||||
#define malloc PyMem_Malloc
|
||||
#define realloc PyMem_Realloc
|
||||
#define free PyMem_Free
|
||||
|
||||
#include "htmlparse.h"
|
||||
|
||||
/* Find out if and how we must quote the value as an HTML attribute.
|
||||
- quote if it contains white space or <>
|
||||
- quote with " if it contains '
|
||||
- quote with ' if it contains "
|
||||
|
||||
val is a Python String object
|
||||
*/
|
||||
static PyObject* quote_string (PyObject* val) {
|
||||
char* quote = NULL;
|
||||
int len = PyString_GET_SIZE(val);
|
||||
char* internal = PyString_AS_STRING(val);
|
||||
int i;
|
||||
PyObject* prefix;
|
||||
for (i=0; i<len; i++) {
|
||||
if (!quote && (isspace(internal[i]) ||
|
||||
internal[i]=='<' ||
|
||||
internal[i]=='>')) {
|
||||
quote = "\"";
|
||||
}
|
||||
else if (internal[i]=='\'') {
|
||||
quote = "\"";
|
||||
break;
|
||||
}
|
||||
else if (internal[i]=='"') {
|
||||
quote = "'";
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (quote==NULL) {
|
||||
return val;
|
||||
}
|
||||
/* quote suffix */
|
||||
if ((prefix = PyString_FromString(quote))==NULL) return NULL;
|
||||
PyString_Concat(&val, prefix);
|
||||
if (val==NULL) {
|
||||
Py_DECREF(prefix);
|
||||
return NULL;
|
||||
}
|
||||
/* quote prefix */
|
||||
PyString_ConcatAndDel(&prefix, val);
|
||||
if (prefix==NULL) {
|
||||
Py_DECREF(val);
|
||||
return NULL;
|
||||
}
|
||||
return prefix;
|
||||
}
|
||||
%}
|
||||
|
||||
%option 8bit outfile="htmllex.c"
|
||||
%option align full
|
||||
/* uncomment the next line for debugging */
|
||||
/*%option debug*/
|
||||
%option nounput nomain noyywrap noyymore noreject
|
||||
%option bison-bridge reentrant never-interactive
|
||||
%option warn
|
||||
|
||||
%x S_PI
|
||||
%x S_COMMENT
|
||||
%x S_DOCTYPE
|
||||
%x S_CDATA
|
||||
%x S_TAGSTART
|
||||
%x S_TAGEND
|
||||
%x S_SCRIPT
|
||||
%x S_STYLE
|
||||
%x S_ATTR1
|
||||
%x S_ATTR2
|
||||
%x S_ATTR3
|
||||
%x S_ATTR4
|
||||
%x S_ATTR5
|
||||
%x S_APOSSTRING
|
||||
%x S_STRING
|
||||
|
||||
RX_WHITE_SPACE [\n\r\ \t\b\012]
|
||||
RX_EQUAL =
|
||||
RX_NAME [a-zA-Z]([-a-zA-Z0-9_])*
|
||||
RX_DATA [-a-zA-Z0-9_]+
|
||||
|
||||
%%
|
||||
|
||||
/*********************** EOF ************************/
|
||||
<<EOF>> {
|
||||
/* wait for more data */
|
||||
return T_WAIT;
|
||||
}
|
||||
|
||||
/*********************** COMMENT ************************/
|
||||
<INITIAL><!-- {
|
||||
UPDATE_BUFPOS;
|
||||
BEGIN(S_COMMENT);
|
||||
}
|
||||
|
||||
<S_COMMENT>-*--> {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng-3);
|
||||
SETLVAL;
|
||||
BEGIN(INITIAL);
|
||||
RETURN(T_COMMENT);
|
||||
}
|
||||
|
||||
<S_COMMENT>[^-]+ {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_COMMENT>-+[^->]+ {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_COMMENT>. {
|
||||
return T_WAIT;
|
||||
}
|
||||
|
||||
/*********************** DOCTYPE ************************/
|
||||
<INITIAL><![Dd][Oo][Cc][Tt][Yy][Pp][Ee] {
|
||||
UPDATE_BUFPOS;
|
||||
BEGIN(S_DOCTYPE);
|
||||
}
|
||||
|
||||
<S_DOCTYPE>> {
|
||||
UPDATE_BUFPOS;
|
||||
SETLVAL;
|
||||
BEGIN(INITIAL);
|
||||
RETURN(T_DOCTYPE);
|
||||
}
|
||||
|
||||
<S_DOCTYPE>[^>]+ {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
/*********************** CDATA ************************/
|
||||
<INITIAL><!\[CDATA\[ {
|
||||
UPDATE_BUFPOS;
|
||||
BEGIN(S_CDATA);
|
||||
}
|
||||
|
||||
<S_CDATA>\]*\]\]> {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng-3);
|
||||
SETLVAL;
|
||||
BEGIN(INITIAL);
|
||||
RETURN(T_CDATA);
|
||||
}
|
||||
|
||||
<S_CDATA>[^\]]+ {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_CDATA>\]+[^>\]]+ {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_CDATA>. {
|
||||
return T_WAIT;
|
||||
}
|
||||
|
||||
/*********************** PI ************************/
|
||||
<INITIAL><\? {
|
||||
UPDATE_BUFPOS;
|
||||
BEGIN(S_PI);
|
||||
}
|
||||
|
||||
<S_PI>\?*\?> {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng-2);
|
||||
SETLVAL;
|
||||
BEGIN(INITIAL);
|
||||
RETURN(T_PI);
|
||||
}
|
||||
|
||||
<S_PI>[^?]+ {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_PI>\?+[^?>]+ {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_PI>. {
|
||||
return T_WAIT;
|
||||
}
|
||||
|
||||
|
||||
/*********************** TAGSTART ************************/
|
||||
<INITIAL><{RX_WHITE_SPACE}*/[A-Za-z] {
|
||||
UPDATE_BUFPOS;
|
||||
yyextra->tmp_attrs = PyDict_New();
|
||||
if (yyextra->tmp_attrs==NULL) return T_ERROR;
|
||||
BEGIN(S_TAGSTART);
|
||||
}
|
||||
|
||||
<S_TAGSTART>[^ \t\r\n\b\012/<>]+ {
|
||||
/* actually accept a lot of tag chars, which may be illegal,
|
||||
but we dont care, its the browsers job */
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_TAGSTART>{RX_WHITE_SPACE}+ {
|
||||
UPDATE_BUFPOS;
|
||||
LOWER_TMP;
|
||||
PYSTRING_TMP(yyextra->tmp_tag);
|
||||
CLEAR_BUF(yyextra->tmp_buf);
|
||||
BEGIN(S_ATTR1);
|
||||
}
|
||||
|
||||
<S_TAGSTART>\/> {
    /* "/>" directly after the tag name: an element without attributes
       that is opened and closed at once */
    UPDATE_BUFPOS;
    BEGIN(INITIAL);
    if (!strlen(yyextra->tmp_buf)) {
        /* no tag name was collected: pass the markup on as text */
        yyextra->tmp_buf = PyMem_Resize(yyextra->tmp_buf, char, 4);
        if (!yyextra->tmp_buf) {return T_ERROR; }
        strcpy(yyextra->tmp_buf, "</>");
        /* the attribute dictionary created when S_TAGSTART was entered
           is not needed for a text token; release it instead of just
           forgetting the pointer */
        Py_XDECREF(yyextra->tmp_attrs);
        yyextra->tmp_attrs = NULL;
        SETLVAL;
        RETURN(T_TEXT);
    }
    LOWER_TMP;
    PYSTRING_TMP(yyextra->tmp_tag);
    CLEAR_BUF(yyextra->tmp_buf);
    SET_ATTR_LVAL;
    RETURN(T_ELEMENT_START_END);
}
|
||||
|
||||
<S_TAGSTART>> {
    /* ">" directly after the tag name: a start tag without attributes */
    UPDATE_BUFPOS;
    if (!strlen(yyextra->tmp_buf)) {
        /* no tag name was collected: pass the markup on as text */
        yyextra->tmp_buf = PyMem_Resize(yyextra->tmp_buf, char, 3);
        if (!yyextra->tmp_buf) {return T_ERROR; }
        strcpy(yyextra->tmp_buf, "<>");
        /* the attribute dictionary created when S_TAGSTART was entered
           is not needed for a text token; release it instead of just
           forgetting the pointer */
        Py_XDECREF(yyextra->tmp_attrs);
        yyextra->tmp_attrs = NULL;
        SETLVAL;
        RETURN(T_TEXT);
    }
    LOWER_TMP;
    PYSTRING_TMP(yyextra->tmp_tag);
    CLEAR_BUF(yyextra->tmp_buf);
    /* SCRIPT_CHECK reads tmp_tag, so it has to run before SET_ATTR_LVAL
       resets it */
    SCRIPT_CHECK;
    SET_ATTR_LVAL;
    RETURN(T_ELEMENT_START);
}
|
||||
|
||||
<S_TAGSTART>. {
|
||||
return T_WAIT;
|
||||
}
|
||||
|
||||
/*********************** SCRIPT ************************/
|
||||
<S_SCRIPT><\/{RX_WHITE_SPACE}*[Ss][Cc][Rr][Ii][Pp][Tt]{RX_WHITE_SPACE}*> {
|
||||
UPDATE_BUFPOS;
|
||||
SETLVAL;
|
||||
BEGIN(INITIAL);
|
||||
RETURN(T_SCRIPT);
|
||||
}
|
||||
|
||||
<S_SCRIPT>[^<]+ {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
/* this is so shitty */
|
||||
<S_SCRIPT></[^/] {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_SCRIPT><\/{RX_WHITE_SPACE}*/[^Ss] {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_SCRIPT><\/{RX_WHITE_SPACE}*[Ss]/[^Cc] {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_SCRIPT><\/{RX_WHITE_SPACE}*[Ss][Cc]/[^Rr] {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_SCRIPT><\/{RX_WHITE_SPACE}*[Ss][Cc][Rr]/[^Ii] {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_SCRIPT><\/{RX_WHITE_SPACE}*[Ss][Cc][Rr][Ii]/[^Pp] {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_SCRIPT><\/{RX_WHITE_SPACE}*[Ss][Cc][Rr][Ii][Pp]/[^Tt] {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_SCRIPT><\/{RX_WHITE_SPACE}*[Ss][Cc][Rr][Ii][Pp][Tt]{RX_WHITE_SPACE}*/[^>] {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_SCRIPT>. {
|
||||
return T_WAIT;
|
||||
}
|
||||
|
||||
/*********************** STYLE ************************/
|
||||
<S_STYLE><\/{RX_WHITE_SPACE}*[Ss][Tt][Yy][Ll][Ee]{RX_WHITE_SPACE}*> {
|
||||
UPDATE_BUFPOS;
|
||||
SETLVAL;
|
||||
BEGIN(INITIAL);
|
||||
RETURN(T_STYLE);
|
||||
}
|
||||
|
||||
<S_STYLE>[^<]+ {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
/* this is so shitty */
|
||||
<S_STYLE></[^/] {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_STYLE><\/{RX_WHITE_SPACE}*/[^Ss] {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_STYLE><\/{RX_WHITE_SPACE}*[Ss]/[^Tt] {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_STYLE><\/{RX_WHITE_SPACE}*[Ss][Tt]/[^Yy] {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_STYLE><\/{RX_WHITE_SPACE}*[Ss][Tt][Yy]/[^Ll] {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_STYLE><\/{RX_WHITE_SPACE}*[Ss][Tt][Yy][Ll]/[^Ee] {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_STYLE><\/{RX_WHITE_SPACE}*[Ss][Tt][Yy][Ll][Ee]{RX_WHITE_SPACE}*/[^>] {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_STYLE>. {
|
||||
return T_WAIT;
|
||||
}
|
||||
|
||||
/*********************** ATTRS ************************/
|
||||
<S_ATTR1>{RX_NAME} {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
BEGIN(S_ATTR2);
|
||||
}
|
||||
|
||||
<S_ATTR1,S_ATTR2,S_ATTR3>\/> {
|
||||
UPDATE_BUFPOS;
|
||||
FLUSH_ATTRS;
|
||||
BEGIN(INITIAL);
|
||||
SET_ATTR_LVAL;
|
||||
RETURN(T_ELEMENT_START_END);
|
||||
}
|
||||
|
||||
<S_ATTR1,S_ATTR2,S_ATTR3>> {
|
||||
UPDATE_BUFPOS;
|
||||
FLUSH_ATTRS;
|
||||
SCRIPT_CHECK;
|
||||
SET_ATTR_LVAL;
|
||||
RETURN(T_ELEMENT_START);
|
||||
}
|
||||
|
||||
<S_ATTR2>{RX_DATA} {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_ATTR2>{RX_WHITE_SPACE}+ {
|
||||
UPDATE_BUFPOS;
|
||||
BEGIN(S_ATTR3);
|
||||
}
|
||||
|
||||
<S_ATTR2,S_ATTR3>{RX_WHITE_SPACE}*{RX_EQUAL}{RX_WHITE_SPACE}* {
|
||||
UPDATE_BUFPOS;
|
||||
LOWER_TMP;
|
||||
PYSTRING_TMP(yyextra->tmp_attrname);
|
||||
CLEAR_BUF(yyextra->tmp_buf);
|
||||
BEGIN(S_ATTR4);
|
||||
}
|
||||
|
||||
<S_ATTR1,S_ATTR2>.|\n {
|
||||
/* this also skips whitespace! */
|
||||
UPDATE_BUFPOS;
|
||||
}
|
||||
|
||||
<S_ATTR3>{RX_NAME} {
|
||||
UPDATE_BUFPOS;
|
||||
LOWER_TMP;
|
||||
PYSTRING_TMP(yyextra->tmp_attrname);
|
||||
CLEAR_BUF(yyextra->tmp_buf);
|
||||
if (yyextra->tmp_attrval!=NULL) return T_ERROR;
|
||||
Py_INCREF(Py_None);
|
||||
yyextra->tmp_attrval = Py_None;
|
||||
if (PyDict_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname, yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
Py_DECREF(yyextra->tmp_attrname);
|
||||
Py_DECREF(yyextra->tmp_attrval);
|
||||
yyextra->tmp_attrname = yyextra->tmp_attrval = NULL;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
BEGIN(S_ATTR2);
|
||||
}
|
||||
|
||||
<S_ATTR4>\" {
|
||||
UPDATE_BUFPOS;
|
||||
BEGIN(S_STRING);
|
||||
}
|
||||
|
||||
|
||||
<S_ATTR4>\' {
|
||||
UPDATE_BUFPOS;
|
||||
BEGIN(S_APOSSTRING);
|
||||
}
|
||||
|
||||
|
||||
<S_ATTR4>[^\012 \t\b\r\n>\'\"]+ {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
BEGIN(S_ATTR5);
|
||||
}
|
||||
|
||||
<S_ATTR4>> {
|
||||
UPDATE_BUFPOS;
|
||||
PYSTRING_TMP(yyextra->tmp_attrval);
|
||||
CLEAR_BUF(yyextra->tmp_buf);
|
||||
if (PyDict_SetItem(yyextra->tmp_attrs,
|
||||
yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
Py_DECREF(yyextra->tmp_attrname);
|
||||
Py_DECREF(yyextra->tmp_attrval);
|
||||
yyextra->tmp_attrname = yyextra->tmp_attrval = NULL;
|
||||
SCRIPT_CHECK;
|
||||
SET_ATTR_LVAL;
|
||||
RETURN(T_ELEMENT_START);
|
||||
}
|
||||
|
||||
<S_ATTR4>\/> {
|
||||
UPDATE_BUFPOS;
|
||||
PYSTRING_TMP(yyextra->tmp_attrval);
|
||||
CLEAR_BUF(yyextra->tmp_buf);
|
||||
if (PyDict_SetItem(yyextra->tmp_attrs,
|
||||
yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
Py_DECREF(yyextra->tmp_attrname);
|
||||
Py_DECREF(yyextra->tmp_attrval);
|
||||
yyextra->tmp_attrname = yyextra->tmp_attrval = NULL;
|
||||
BEGIN(INITIAL);
|
||||
SET_ATTR_LVAL;
|
||||
RETURN(T_ELEMENT_START_END);
|
||||
}
|
||||
|
||||
<S_ATTR4>{RX_WHITE_SPACE}+ {
|
||||
UPDATE_BUFPOS;
|
||||
}
|
||||
|
||||
<S_ATTR5>[^\012 \t\b\r\n>]+ {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_ATTR5>> {
|
||||
UPDATE_BUFPOS;
|
||||
PYSTRING_TMP(yyextra->tmp_attrval);
|
||||
CLEAR_BUF(yyextra->tmp_buf);
|
||||
if (PyDict_SetItem(yyextra->tmp_attrs,
|
||||
yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
Py_DECREF(yyextra->tmp_attrname);
|
||||
Py_DECREF(yyextra->tmp_attrval);
|
||||
yyextra->tmp_attrname = yyextra->tmp_attrval = NULL;
|
||||
SCRIPT_CHECK;
|
||||
SET_ATTR_LVAL;
|
||||
RETURN(T_ELEMENT_START);
|
||||
}
|
||||
|
||||
<S_ATTR5>\/> {
|
||||
UPDATE_BUFPOS;
|
||||
PYSTRING_TMP(yyextra->tmp_attrval);
|
||||
CLEAR_BUF(yyextra->tmp_buf);
|
||||
if (PyDict_SetItem(yyextra->tmp_attrs,
|
||||
yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
Py_DECREF(yyextra->tmp_attrname);
|
||||
Py_DECREF(yyextra->tmp_attrval);
|
||||
yyextra->tmp_attrname = yyextra->tmp_attrval = NULL;
|
||||
BEGIN(INITIAL);
|
||||
SET_ATTR_LVAL;
|
||||
RETURN(T_ELEMENT_START_END);
|
||||
}
|
||||
|
||||
<S_ATTR5>{RX_WHITE_SPACE}+ {
|
||||
UPDATE_BUFPOS;
|
||||
PYSTRING_TMP(yyextra->tmp_attrval);
|
||||
CLEAR_BUF(yyextra->tmp_buf);
|
||||
if (PyDict_SetItem(yyextra->tmp_attrs,
|
||||
yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
Py_DECREF(yyextra->tmp_attrname);
|
||||
Py_DECREF(yyextra->tmp_attrval);
|
||||
yyextra->tmp_attrname = yyextra->tmp_attrval = NULL;
|
||||
BEGIN(S_ATTR1);
|
||||
}
|
||||
|
||||
<S_APOSSTRING>\' {
|
||||
UPDATE_BUFPOS;
|
||||
PYSTRING_TMP(yyextra->tmp_attrval);
|
||||
CLEAR_BUF(yyextra->tmp_buf);
|
||||
yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval);
|
||||
if (!yyextra->tmp_attrval) return T_ERROR;
|
||||
if (PyDict_SetItem(yyextra->tmp_attrs,
|
||||
yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
Py_DECREF(yyextra->tmp_attrname);
|
||||
Py_DECREF(yyextra->tmp_attrval);
|
||||
yyextra->tmp_attrname = yyextra->tmp_attrval = NULL;
|
||||
BEGIN(S_ATTR1);
|
||||
}
|
||||
|
||||
<S_APOSSTRING>[^']+ {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
|
||||
<S_STRING>\" {
|
||||
UPDATE_BUFPOS;
|
||||
PYSTRING_TMP(yyextra->tmp_attrval);
|
||||
CLEAR_BUF(yyextra->tmp_buf);
|
||||
yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval);
|
||||
if (!yyextra->tmp_attrval) { return T_ERROR; }
|
||||
if (PyDict_SetItem(yyextra->tmp_attrs,
|
||||
yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
Py_DECREF(yyextra->tmp_attrname);
|
||||
Py_DECREF(yyextra->tmp_attrval);
|
||||
yyextra->tmp_attrval = yyextra->tmp_attrname = NULL;
|
||||
BEGIN(S_ATTR1);
|
||||
}
|
||||
|
||||
<S_STRING>[^"]+ {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
/*********************** TAGEND ************************/
|
||||
<INITIAL><{RX_WHITE_SPACE}*\/{RX_WHITE_SPACE}*/[A-Za-z] {
|
||||
UPDATE_BUFPOS;
|
||||
BEGIN(S_TAGEND);
|
||||
}
|
||||
|
||||
<S_TAGEND>[^<>\r\n \t\b\012]+ {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
}
|
||||
|
||||
<S_TAGEND>> {
|
||||
UPDATE_BUFPOS;
|
||||
LOWER_TMP;
|
||||
SETLVAL;
|
||||
BEGIN(INITIAL);
|
||||
RETURN(T_ELEMENT_END);
|
||||
}
|
||||
|
||||
<S_TAGEND><{RX_WHITE_SPACE}* {
|
||||
UPDATE_BUFPOS;
|
||||
LOWER_TMP;
|
||||
SETLVAL;
|
||||
SET_ERROR("Missing > in end tag.");
|
||||
yyextra->tmp_attrs = PyDict_New();
|
||||
if (!yyextra->tmp_attrs) return T_ERROR;
|
||||
BEGIN(S_TAGSTART);
|
||||
RETURN(T_ELEMENT_END);
|
||||
}
|
||||
|
||||
<S_TAGEND>{RX_WHITE_SPACE}+ {
|
||||
/* delete whitespace in or around tag names */
|
||||
UPDATE_BUFPOS;
|
||||
}
|
||||
|
||||
/*********************** TEXT ************************/
|
||||
<INITIAL>[^<]+ {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
SETLVAL;
|
||||
RETURN(T_TEXT);
|
||||
}
|
||||
|
||||
<INITIAL><[^\012 \t\b\r\nA-Za-z!?/] {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
SETLVAL;
|
||||
RETURN(T_TEXT);
|
||||
}
|
||||
|
||||
<INITIAL><{RX_WHITE_SPACE}+[^A-Za-z/] {
|
||||
UPDATE_BUFPOS;
|
||||
APPEND_TO_TMP(yyleng);
|
||||
SETLVAL;
|
||||
RETURN(T_TEXT);
|
||||
}
|
||||
|
||||
<INITIAL>. {
|
||||
return T_WAIT;
|
||||
}
|
||||
|
||||
%%
|
||||
|
||||
#undef malloc
|
||||
#undef realloc
|
||||
#undef free
|
||||
|
||||
/* Create a scanner, store its handle in *scanner and register data as
   the scanner's extra (user) data. Always returns 0. */
int htmllexInit (void** scanner, UserData* data) {
    void* created;
    yylex_init(scanner);
    created = *scanner;
    yyset_extra(data, created);
    return 0;
}
|
||||
|
||||
/* prepare scanner for calls to yylex() */
|
||||
int htmllexStart (void* scanner, UserData* data, const char* s, int slen) {
|
||||
/* append s to data buffer and scan those bytes.
|
||||
As Flex does not distinguish between '\0' and EOF characters,
|
||||
we must replace '\0' with ' '. */
|
||||
int len = strlen(data->buf);
|
||||
int i;
|
||||
data->buf = PyMem_Resize(data->buf, char, len+slen+1);
|
||||
if (!data->buf) return -1;
|
||||
for (i=0; i<slen; i++) {
|
||||
if (s[i]=='\0')
|
||||
data->buf[len+i] = ' ';
|
||||
else
|
||||
data->buf[len+i] = s[i];
|
||||
}
|
||||
data->buf[len+slen] = '\0';
|
||||
if (len > data->bufpos) {
|
||||
int rewind = len - data->bufpos;
|
||||
slen += rewind;
|
||||
len -= rewind;
|
||||
}
|
||||
/* reset userdata */
|
||||
data->bufpos = len;
|
||||
data->exc_type = NULL;
|
||||
data->exc_val = NULL;
|
||||
data->exc_tb = NULL;
|
||||
/*fprintf(stderr, "SCANNING '%s'\n", data->buf+len);*/
|
||||
data->lexbuf = yy_scan_bytes(data->buf+len, slen, scanner);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* delete scanned buffer data */
|
||||
int htmllexStop (void* scanner, UserData* data) {
|
||||
yy_delete_buffer(data->lexbuf, scanner);
|
||||
if (data->nextpos > 0) {
|
||||
int len = strlen(data->buf);
|
||||
int i, j;
|
||||
for (i=data->nextpos,j=0; i<len; i++,j++) {
|
||||
data->buf[j] = data->buf[i];
|
||||
}
|
||||
data->buf[j] = '\0';
|
||||
data->buf = PyMem_Resize(data->buf, char, len-data->nextpos+1);
|
||||
data->bufpos -= data->nextpos;
|
||||
data->nextpos = 0;
|
||||
if (!data->buf) return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Hand the scanner to yylex_destroy(). Always returns 0. */
int htmllexDestroy (void* scanner) {
    void* doomed = scanner;
    yylex_destroy(doomed);
    return 0;
}
|
||||
101
linkcheck/parser/htmllib.py
Normal file
101
linkcheck/parser/htmllib.py
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
"""A parser for HTML"""
|
||||
# Copyright (C) 2000,2001 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
import sys
|
||||
try:
|
||||
import htmlsax
|
||||
except ImportError:
|
||||
sys.stderr.write("""Could not import the `htmlsax' parser module.""")
|
||||
sys.exit(1)
|
||||
|
||||
class HtmlParser:
    """Thin wrapper around the C SAX parser of the htmlsax module.

    The parser is created with this object as its handler. For
    compatibility no callbacks are defined here. Currently recognized
    callbacks are:
    comment(data): <!--data-->
    startElement(tag, attrs): <tag {attr1:value1,attr2:value2,..}>
    endElement(tag): </tag>
    doctype(data): <!DOCTYPE data?>
    pi(name, data=None): <?name data?>
    cdata(data): <![CDATA[data]]>
    characters(data): data

    additionally, there are error and warning callbacks:
    error(msg)
    warning(msg)
    fatalError(msg)
    """

    def __init__ (self):
        """create the internal htmlsax parser, passing self as handler"""
        internal = htmlsax.parser(self)
        self.parser = internal

    def feed (self, data):
        """hand data over to the internal parser"""
        internal = self.parser
        internal.feed(data)

    def flush (self):
        """tell the internal parser to flush all data"""
        internal = self.parser
        internal.flush()

    def reset (self):
        """tell the internal parser to reset itself (without flushing)"""
        internal = self.parser
        internal.reset()
|
||||
|
||||
|
||||
class HtmlPrinter(HtmlParser):
    """Parser whose missing attributes all resolve to one method that
    prints the requested attribute name and the call arguments"""

    def __getattr__ (self, name):
        """remember the requested name and answer with the printing method"""
        self.mem = name
        return self._print

    def _print (self, *attrs):
        """write the remembered name and the argument tuple on one line"""
        sys.stdout.write("%s %s\n" % (self.mem, attrs))
|
||||
|
||||
|
||||
def _test():
    """feed a series of small markup snippets to an HtmlPrinter, one
    feed() call per snippet, then flush it"""
    snippets = (
        "<hTml>",
        "<a href>",
        "<a href=''>",
        '<a href="">',
        "<a href='a'>",
        '<a href="a">',
        "<a href=a>",
        "<a href='\"'>",
        "<a href=\"'\">",
        "<a href=' '>",
        "<a href=a href=b>",
        "<a/>",
        "<a href/>",
        "<a href=a />",
        "</a>",
        "<?bla foo?>",
        "<?bla?>",
        "<!-- - comment -->",
        "<!---->",
        "<!DOCTYPE \"vla foo>",
    )
    printer = HtmlPrinter()
    for snippet in snippets:
        printer.feed(snippet)
    printer.flush()
|
||||
|
||||
def _broken ():
    """feed an <img> tag whose attribute name is interrupted by a
    backslash followed by a newline, then flush"""
    snippet = "<img bo\\\nrder=0>"
    printer = HtmlPrinter()
    printer.feed(snippet)
    printer.flush()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
#_test()
|
||||
_broken()
|
||||
645
linkcheck/parser/htmlparse.y
Normal file
645
linkcheck/parser/htmlparse.y
Normal file
|
|
@ -0,0 +1,645 @@
|
|||
/* the beginning */
|
||||
%{
|
||||
/* SAX parser, optimized for WebCleaner */
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "htmlsax.h"
|
||||
|
||||
/* bison type definitions */
|
||||
#define YYSTYPE PyObject*
|
||||
#define YYPARSE_PARAM scanner
|
||||
#define YYLEX_PARAM scanner
|
||||
extern int yylex(YYSTYPE* yylvalp, void* scanner);
|
||||
extern int htmllexInit (void** scanner, UserData* data);
|
||||
extern int htmllexStart (void* scanner, UserData* data, const char* s, int slen);
|
||||
extern int htmllexStop (void* scanner, UserData* data);
|
||||
extern int htmllexDestroy (void* scanner);
|
||||
extern void* yyget_extra(void*);
|
||||
#define YYERROR_VERBOSE 1
|
||||
/* standard error reporting, indicating an internal error */
|
||||
|
||||
/* Error hook: write msg, prefixed as an internal htmlsax parse error,
   to stderr. Always returns 0. */
static int yyerror (char* msg) {
    const int status = 0;
    (void) fprintf(stderr, "htmlsax: internal parse error: %s\n", msg);
    return status;
}
|
||||
|
||||
/* macros for easier scanner state manipulation */
|
||||
|
||||
/* Evaluates to 1 unless tag (a C string, compared case-sensitively) is one
   of the elements listed below, i.e. the HTML elements written without an
   end tag; for those it evaluates to 0. Note that, despite its name, the
   macro is therefore TRUE for ordinary tags such as "a". tag is evaluated
   several times. */
#define NO_HTML_END_TAG(tag) (strcmp((tag), "area")!=0 && \
                              strcmp((tag), "base")!=0 && \
                              strcmp((tag), "basefont")!=0 && \
                              strcmp((tag), "br")!=0 && \
                              strcmp((tag), "col")!=0 && \
                              strcmp((tag), "frame")!=0 && \
                              strcmp((tag), "hr")!=0 && \
                              strcmp((tag), "img")!=0 && \
                              strcmp((tag), "input")!=0 && \
                              strcmp((tag), "isindex")!=0 && \
                              strcmp((tag), "link")!=0 && \
                              strcmp((tag), "meta")!=0 && \
                              strcmp((tag), "param")!=0)
|
||||
|
||||
/* resize buf to an empty string */
|
||||
#define RESIZE_BUF(buf) \
|
||||
buf = PyMem_Resize(buf, char, 1); \
|
||||
if (buf==NULL) return NULL; \
|
||||
buf[0] = '\0'
|
||||
|
||||
/* set buf to an empty string */
|
||||
#define NEW_BUF(buf) \
|
||||
buf = PyMem_New(char, 1); \
|
||||
if (buf==NULL) return NULL; \
|
||||
buf[0] = '\0'
|
||||
|
||||
/* parser type definition */
|
||||
typedef struct {
|
||||
PyObject_HEAD
|
||||
UserData* userData;
|
||||
void* scanner;
|
||||
} parser_object;
|
||||
|
||||
staticforward PyTypeObject parser_type;
|
||||
|
||||
/* use Python's memory management */
|
||||
#define malloc PyMem_Malloc
|
||||
#define realloc PyMem_Realloc
|
||||
#define free PyMem_Free
|
||||
|
||||
%}
|
||||
|
||||
/* parser options */
|
||||
/*%verbose*/
|
||||
/*%debug*/
|
||||
%defines
|
||||
%output="htmlparse.c"
|
||||
%pure_parser
|
||||
|
||||
%token T_WAIT
|
||||
%token T_ERROR
|
||||
%token T_TEXT
|
||||
%token T_ELEMENT_START
|
||||
%token T_ELEMENT_START_END
|
||||
%token T_ELEMENT_END
|
||||
%token T_SCRIPT
|
||||
%token T_STYLE
|
||||
%token T_PI
|
||||
%token T_COMMENT
|
||||
%token T_CDATA
|
||||
%token T_DOCTYPE
|
||||
|
||||
/* the finish_ labels are for error recovery */
|
||||
%%
|
||||
|
||||
elements: element {}
|
||||
| elements element {}
|
||||
;
|
||||
|
||||
element: T_WAIT { YYACCEPT; /* wait for more lexer input */ }
|
||||
| T_ERROR
|
||||
{
|
||||
/* an error occurred in the scanner, the python exception must be set */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
|
||||
YYABORT;
|
||||
}
|
||||
| T_ELEMENT_START
{
    /* $1 is a tuple (<tag>, <attrs>); <attrs> is a dictionary.
       Calls handler.startElement(tag, attrs) if the handler has such an
       attribute, then reports a pending ud->error through handler.error.
       On any failure the Python exception is moved into ud and parsing
       is aborted. */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
    PyObject* tag = PyTuple_GET_ITEM($1, 0);
    PyObject* attrs = PyTuple_GET_ITEM($1, 1);
    int error = 0;
    if (!tag || !attrs) { error = 1; goto finish_start; }
    if (PyObject_HasAttrString(ud->handler, "startElement")==1) {
        callback = PyObject_GetAttrString(ud->handler, "startElement");
        if (!callback) { error=1; goto finish_start; }
        result = PyObject_CallFunction(callback, "OO", tag, attrs);
        if (!result) { error=1; goto finish_start; }
        Py_DECREF(callback);
        Py_DECREF(result);
        callback=result=NULL;
    }
    if (ud->error && PyObject_HasAttrString(ud->handler, "error")==1) {
        callback = PyObject_GetAttrString(ud->handler, "error");
        if (!callback) { ud->error = NULL; error=1; goto finish_start; }
        /* pass the message on BEFORE clearing it; clearing it first
           would hand a NULL pointer to the "s" format */
        result = PyObject_CallFunction(callback, "s", ud->error);
        ud->error = NULL;
        if (!result) { error=1; goto finish_start; }
    }
finish_start:
    Py_XDECREF(callback);
    Py_XDECREF(result);
    /* tag and attrs are released here in addition to the tuple: the
       scanner builds $1 with Py_BuildValue("(OO)", ...) and then drops
       its own pointers to both objects without releasing them */
    Py_XDECREF(tag);
    Py_XDECREF(attrs);
    Py_DECREF($1);
    if (error) {
        PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
        YYABORT;
    }
}
|
||||
| T_ELEMENT_START_END
{
    /* $1 is a tuple (<tag>, <attrs>); <attrs> is a dictionary.
       Calls handler.startElement(tag, attrs) and, unless the tag is one
       of those excluded by NO_HTML_END_TAG, handler.endElement(tag) -
       each only if the handler has that attribute. Afterwards a pending
       ud->error is reported through handler.error. On any failure the
       Python exception is moved into ud and parsing is aborted. */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
    PyObject* tag = PyTuple_GET_ITEM($1, 0);
    PyObject* attrs = PyTuple_GET_ITEM($1, 1);
    int error = 0;
    char* tagname;
    if (!tag || !attrs) { error = 1; goto finish_start_end; }
    if (PyObject_HasAttrString(ud->handler, "startElement")==1) {
        callback = PyObject_GetAttrString(ud->handler, "startElement");
        if (!callback) { error=1; goto finish_start_end; }
        result = PyObject_CallFunction(callback, "OO", tag, attrs);
        if (!result) { error=1; goto finish_start_end; }
        Py_DECREF(callback);
        Py_DECREF(result);
        callback=result=NULL;
    }
    tagname = PyString_AS_STRING(tag);
    if (PyObject_HasAttrString(ud->handler, "endElement")==1 &&
        NO_HTML_END_TAG(tagname)) {
        callback = PyObject_GetAttrString(ud->handler, "endElement");
        if (callback==NULL) { error=1; goto finish_start_end; }
        result = PyObject_CallFunction(callback, "O", tag);
        if (result==NULL) { error=1; goto finish_start_end; }
        Py_DECREF(callback);
        Py_DECREF(result);
        callback=result=NULL;
    }
    if (ud->error && PyObject_HasAttrString(ud->handler, "error")==1) {
        callback = PyObject_GetAttrString(ud->handler, "error");
        if (!callback) { ud->error = NULL; error=1; goto finish_start_end; }
        /* pass the message on BEFORE clearing it; clearing it first
           would hand a NULL pointer to the "s" format */
        result = PyObject_CallFunction(callback, "s", ud->error);
        ud->error = NULL;
        if (!result) { error=1; goto finish_start_end; }
    }
finish_start_end:
    Py_XDECREF(callback);
    Py_XDECREF(result);
    /* tag and attrs are released here in addition to the tuple: the
       scanner builds $1 with Py_BuildValue("(OO)", ...) and then drops
       its own pointers to both objects without releasing them */
    Py_XDECREF(tag);
    Py_XDECREF(attrs);
    Py_DECREF($1);
    if (error) {
        PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
        YYABORT;
    }
}
|
||||
| T_ELEMENT_END
{
    /* $1 is the tag name as a Python string.
       Calls handler.endElement(tag) when the handler has that method and
       NO_HTML_END_TAG(tagname) holds, then forwards a pending scanner
       error message to handler.error(msg). If a Python call fails, the
       exception is stored in the user data and the parse is aborted. */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
    int error = 0;
    char* tagname = PyString_AS_STRING($1);
    if (PyObject_HasAttrString(ud->handler, "endElement")==1 &&
        NO_HTML_END_TAG(tagname)) {
        callback = PyObject_GetAttrString(ud->handler, "endElement");
        if (callback==NULL) { error=1; goto finish_end; }
        result = PyObject_CallFunction(callback, "O", $1);
        if (result==NULL) { error=1; goto finish_end; }
        Py_DECREF(callback);
        Py_DECREF(result);
        callback=result=NULL;
    }
    if (ud->error && PyObject_HasAttrString(ud->handler, "error")==1) {
        /* copy the pointer before clearing the field, otherwise the
           handler would be called with NULL instead of the message */
        const char* msg = ud->error;
        ud->error = NULL;
        callback = PyObject_GetAttrString(ud->handler, "error");
        if (!callback) { error=1; goto finish_end; }
        result = PyObject_CallFunction(callback, "s", msg);
        if (!result) { error=1; goto finish_end; }
    }
finish_end:
    Py_XDECREF(callback);
    Py_XDECREF(result);
    Py_DECREF($1);
    if (error) {
        PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
        YYABORT;
    }
}
|
||||
| T_COMMENT
{
    /* Passes $1 to handler.comment() if the handler has that method, then
       forwards a pending scanner error message to handler.error(msg).
       If a Python call fails, the exception is stored in the user data
       and the parse is aborted. */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
    int error = 0;
    if (PyObject_HasAttrString(ud->handler, "comment")==1) {
        callback = PyObject_GetAttrString(ud->handler, "comment");
        if (callback==NULL) { error=1; goto finish_comment; }
        result = PyObject_CallFunction(callback, "O", $1);
        if (result==NULL) { error=1; goto finish_comment; }
        Py_DECREF(callback);
        Py_DECREF(result);
        callback=result=NULL;
    }
    if (ud->error && PyObject_HasAttrString(ud->handler, "error")==1) {
        /* copy the pointer before clearing the field, otherwise the
           handler would be called with NULL instead of the message */
        const char* msg = ud->error;
        ud->error = NULL;
        callback = PyObject_GetAttrString(ud->handler, "error");
        if (!callback) { error=1; goto finish_comment; }
        result = PyObject_CallFunction(callback, "s", msg);
        if (!result) { error=1; goto finish_comment; }
    }
finish_comment:
    Py_XDECREF(callback);
    Py_XDECREF(result);
    Py_DECREF($1);
    if (error) {
        PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
        YYABORT;
    }
}
|
||||
| T_PI
{
    /* Passes $1 to handler.pi() if the handler has that method, then
       forwards a pending scanner error message to handler.error(msg).
       If a Python call fails, the exception is stored in the user data
       and the parse is aborted. */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
    int error = 0;
    if (PyObject_HasAttrString(ud->handler, "pi")==1) {
        callback = PyObject_GetAttrString(ud->handler, "pi");
        if (callback==NULL) { error=1; goto finish_pi; }
        result = PyObject_CallFunction(callback, "O", $1);
        if (result==NULL) { error=1; goto finish_pi; }
        Py_DECREF(callback);
        Py_DECREF(result);
        callback=result=NULL;
    }
    if (ud->error && PyObject_HasAttrString(ud->handler, "error")==1) {
        /* copy the pointer before clearing the field, otherwise the
           handler would be called with NULL instead of the message */
        const char* msg = ud->error;
        ud->error = NULL;
        callback = PyObject_GetAttrString(ud->handler, "error");
        if (!callback) { error=1; goto finish_pi; }
        result = PyObject_CallFunction(callback, "s", msg);
        if (!result) { error=1; goto finish_pi; }
    }
finish_pi:
    Py_XDECREF(callback);
    Py_XDECREF(result);
    Py_DECREF($1);
    if (error) {
        PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
        YYABORT;
    }
}
|
||||
| T_CDATA
{
    /* Passes $1 to handler.cdata() if the handler has that method, then
       forwards a pending scanner error message to handler.error(msg).
       If a Python call fails, the exception is stored in the user data
       and the parse is aborted. */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
    int error = 0;
    if (PyObject_HasAttrString(ud->handler, "cdata")==1) {
        callback = PyObject_GetAttrString(ud->handler, "cdata");
        if (callback==NULL) { error=1; goto finish_cdata; }
        result = PyObject_CallFunction(callback, "O", $1);
        if (result==NULL) { error=1; goto finish_cdata; }
        Py_DECREF(callback);
        Py_DECREF(result);
        callback=result=NULL;
    }
    if (ud->error && PyObject_HasAttrString(ud->handler, "error")==1) {
        /* copy the pointer before clearing the field, otherwise the
           handler would be called with NULL instead of the message */
        const char* msg = ud->error;
        ud->error = NULL;
        callback = PyObject_GetAttrString(ud->handler, "error");
        if (!callback) { error=1; goto finish_cdata; }
        result = PyObject_CallFunction(callback, "s", msg);
        if (!result) { error=1; goto finish_cdata; }
    }
finish_cdata:
    Py_XDECREF(callback);
    Py_XDECREF(result);
    Py_DECREF($1);
    if (error) {
        PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
        YYABORT;
    }
}
|
||||
| T_DOCTYPE
{
    /* Passes $1 to handler.doctype() if the handler has that method, then
       forwards a pending scanner error message to handler.error(msg).
       If a Python call fails, the exception is stored in the user data
       and the parse is aborted. */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
    int error = 0;
    if (PyObject_HasAttrString(ud->handler, "doctype")==1) {
        callback = PyObject_GetAttrString(ud->handler, "doctype");
        if (callback==NULL) { error=1; goto finish_doctype; }
        result = PyObject_CallFunction(callback, "O", $1);
        if (result==NULL) { error=1; goto finish_doctype; }
        Py_DECREF(callback);
        Py_DECREF(result);
        callback=result=NULL;
    }
    if (ud->error && PyObject_HasAttrString(ud->handler, "error")==1) {
        /* copy the pointer before clearing the field, otherwise the
           handler would be called with NULL instead of the message */
        const char* msg = ud->error;
        ud->error = NULL;
        callback = PyObject_GetAttrString(ud->handler, "error");
        if (!callback) { error=1; goto finish_doctype; }
        result = PyObject_CallFunction(callback, "s", msg);
        if (!result) { error=1; goto finish_doctype; }
    }
finish_doctype:
    Py_XDECREF(callback);
    Py_XDECREF(result);
    Py_DECREF($1);
    if (error) {
        PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
        YYABORT;
    }
}
|
||||
| T_SCRIPT
{
    /* Passes $1 (presumably the script body -- confirm against the lexer)
       to handler.characters(), then calls handler.endElement("script"),
       and finally forwards a pending scanner error message to
       handler.error(msg). Each callback is only used if the handler
       provides it. If a Python call fails, the exception is stored in
       the user data and the parse is aborted. */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
    int error = 0;
    if (PyObject_HasAttrString(ud->handler, "characters")==1) {
        callback = PyObject_GetAttrString(ud->handler, "characters");
        if (callback==NULL) { error=1; goto finish_script; }
        result = PyObject_CallFunction(callback, "O", $1);
        if (result==NULL) { error=1; goto finish_script; }
        Py_DECREF(callback);
        Py_DECREF(result);
        callback=result=NULL;
    }
    if (PyObject_HasAttrString(ud->handler, "endElement")==1) {
        callback = PyObject_GetAttrString(ud->handler, "endElement");
        if (callback==NULL) { error=1; goto finish_script; }
        result = PyObject_CallFunction(callback, "s", "script");
        if (result==NULL) { error=1; goto finish_script; }
        Py_DECREF(callback);
        Py_DECREF(result);
        callback=result=NULL;
    }
    if (ud->error && PyObject_HasAttrString(ud->handler, "error")==1) {
        /* copy the pointer before clearing the field, otherwise the
           handler would be called with NULL instead of the message */
        const char* msg = ud->error;
        ud->error = NULL;
        callback = PyObject_GetAttrString(ud->handler, "error");
        if (!callback) { error=1; goto finish_script; }
        result = PyObject_CallFunction(callback, "s", msg);
        if (!result) { error=1; goto finish_script; }
    }
finish_script:
    Py_XDECREF(callback);
    Py_XDECREF(result);
    Py_DECREF($1);
    if (error) {
        PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
        YYABORT;
    }
}
|
||||
| T_STYLE
{
    /* Passes $1 (presumably the style sheet text -- confirm against the
       lexer) to handler.characters(), then calls
       handler.endElement("style"), and finally forwards a pending scanner
       error message to handler.error(msg). Each callback is only used if
       the handler provides it. If a Python call fails, the exception is
       stored in the user data and the parse is aborted. */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
    int error = 0;
    if (PyObject_HasAttrString(ud->handler, "characters")==1) {
        callback = PyObject_GetAttrString(ud->handler, "characters");
        if (callback==NULL) { error=1; goto finish_style; }
        result = PyObject_CallFunction(callback, "O", $1);
        if (result==NULL) { error=1; goto finish_style; }
        Py_DECREF(callback);
        Py_DECREF(result);
        callback=result=NULL;
    }
    if (PyObject_HasAttrString(ud->handler, "endElement")==1) {
        callback = PyObject_GetAttrString(ud->handler, "endElement");
        if (callback==NULL) { error=1; goto finish_style; }
        result = PyObject_CallFunction(callback, "s", "style");
        if (result==NULL) { error=1; goto finish_style; }
        Py_DECREF(callback);
        Py_DECREF(result);
        callback=result=NULL;
    }
    if (ud->error && PyObject_HasAttrString(ud->handler, "error")==1) {
        /* copy the pointer before clearing the field, otherwise the
           handler would be called with NULL instead of the message */
        const char* msg = ud->error;
        ud->error = NULL;
        callback = PyObject_GetAttrString(ud->handler, "error");
        if (!callback) { error=1; goto finish_style; }
        result = PyObject_CallFunction(callback, "s", msg);
        if (!result) { error=1; goto finish_style; }
    }
finish_style:
    Py_XDECREF(callback);
    Py_XDECREF(result);
    Py_DECREF($1);
    if (error) {
        PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
        YYABORT;
    }
}
|
||||
| T_TEXT
{
    /* Remember this is also called as a lexer error fallback.
       Passes $1 to handler.characters() if the handler has that method,
       then forwards a pending scanner error message to
       handler.error(msg). If a Python call fails, the exception is
       stored in the user data and the parse is aborted. */
    UserData* ud = yyget_extra(scanner);
    PyObject* callback = NULL;
    PyObject* result = NULL;
    int error = 0;
    if (PyObject_HasAttrString(ud->handler, "characters")==1) {
        callback = PyObject_GetAttrString(ud->handler, "characters");
        if (callback==NULL) { error=1; goto finish_characters; }
        result = PyObject_CallFunction(callback, "O", $1);
        if (result==NULL) { error=1; goto finish_characters; }
        Py_DECREF(callback);
        Py_DECREF(result);
        callback=result=NULL;
    }
    if (ud->error && PyObject_HasAttrString(ud->handler, "error")==1) {
        /* copy the pointer before clearing the field, otherwise the
           handler would be called with NULL instead of the message */
        const char* msg = ud->error;
        ud->error = NULL;
        callback = PyObject_GetAttrString(ud->handler, "error");
        if (!callback) { error=1; goto finish_characters; }
        result = PyObject_CallFunction(callback, "s", msg);
        if (!result) { error=1; goto finish_characters; }
    }
finish_characters:
    Py_XDECREF(callback);
    Py_XDECREF(result);
    Py_DECREF($1);
    if (error) {
        PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
        YYABORT;
    }
}
|
||||
;
|
||||
|
||||
%%
|
||||
|
||||
/* disable python memory interface */
|
||||
#undef malloc
|
||||
#undef realloc
|
||||
#undef free
|
||||
|
||||
/* create parser */
|
||||
static PyObject* htmlsax_parser(PyObject* self, PyObject* args) {
|
||||
PyObject* handler;
|
||||
parser_object* p;
|
||||
if (!PyArg_ParseTuple(args, "O", &handler)) {
|
||||
PyErr_SetString(PyExc_TypeError, "SAX2 handler object arg required");
|
||||
return NULL;
|
||||
}
|
||||
Py_INCREF(handler);
|
||||
if (!(p=PyObject_NEW(parser_object, &parser_type))) {
|
||||
PyErr_SetString(PyExc_TypeError, "Allocating parser object failed");
|
||||
return NULL;
|
||||
}
|
||||
/* reset userData */
|
||||
p->userData = PyMem_New(UserData, sizeof(UserData));
|
||||
p->userData->handler = handler;
|
||||
NEW_BUF(p->userData->buf);
|
||||
p->userData->nextpos = 0;
|
||||
p->userData->bufpos = 0;
|
||||
NEW_BUF(p->userData->tmp_buf);
|
||||
p->userData->tmp_tag = p->userData->tmp_attrname =
|
||||
p->userData->tmp_attrval = p->userData->tmp_attrs =
|
||||
p->userData->lexbuf = NULL;
|
||||
p->userData->exc_type = NULL;
|
||||
p->userData->exc_val = NULL;
|
||||
p->userData->exc_tb = NULL;
|
||||
p->userData->error = NULL;
|
||||
p->scanner = NULL;
|
||||
htmllexInit(&(p->scanner), p->userData);
|
||||
return (PyObject*) p;
|
||||
}
|
||||
|
||||
|
||||
/* Destructor of the parser type: destroys the scanner, drops the handler
 * reference, frees both buffers and the user data, and finally releases
 * the object itself.
 */
static void parser_dealloc(parser_object* self) {
    htmllexDestroy(self->scanner);
    Py_DECREF(self->userData->handler);
    PyMem_Del(self->userData->buf);
    PyMem_Del(self->userData->tmp_buf);
    PyMem_Del(self->userData);
    /* the object was allocated with PyObject_NEW, so it has to be
       released through the object allocator, not the raw memory one */
    PyObject_Del((PyObject*) self);
}
|
||||
|
||||
|
||||
/* Flush the parser buffers.
 *
 * Any text still held in the scan buffer is passed to
 * handler.characters() (if the handler has that method), then the
 * buffers and the temporary tag state are reset.
 * Takes no arguments. Returns the integer 0, or NULL with a Python
 * exception set if argument parsing or the callback fails; in the
 * callback failure case the buffers are left untouched.
 */
static PyObject* parser_flush(parser_object* self, PyObject* args) {
    int res=0;
    if (!PyArg_ParseTuple(args, "")) {
        PyErr_SetString(PyExc_TypeError, "no args required");
        return NULL;
    }
    if (strlen(self->userData->buf)) {
        PyObject* s = PyString_FromString(self->userData->buf);
        PyObject* callback = NULL;
        PyObject* result = NULL;
        int failed = 0;
        if (s==NULL) {
            return NULL;
        }
        if (PyObject_HasAttrString(self->userData->handler, "characters")==1) {
            callback = PyObject_GetAttrString(self->userData->handler, "characters");
            if (callback!=NULL) {
                result = PyObject_CallFunction(callback, "O", s);
            }
            failed = (result==NULL);
        }
        /* callback and result are still NULL when the handler has no
           characters() method or a call failed, hence the NULL-safe
           variant; releasing here also covers the error path */
        Py_XDECREF(callback);
        Py_XDECREF(result);
        Py_DECREF(s);
        if (failed) {
            return NULL;
        }
        /* reset buffer */
        RESIZE_BUF(self->userData->buf);
        self->userData->bufpos = 0;
    }
    RESIZE_BUF(self->userData->tmp_buf);
    self->userData->tmp_tag = self->userData->tmp_attrs =
        self->userData->tmp_attrval = self->userData->tmp_attrname = NULL;
    return Py_BuildValue("i", res);
}
|
||||
|
||||
|
||||
/* feed a chunk of data to the parser */
|
||||
static PyObject* parser_feed(parser_object* self, PyObject* args) {
|
||||
/* set up the parse string */
|
||||
int slen = 0;
|
||||
char* s = NULL;
|
||||
if (!PyArg_ParseTuple(args, "t#", &s, &slen)) {
|
||||
PyErr_SetString(PyExc_TypeError, "string arg required");
|
||||
return NULL;
|
||||
}
|
||||
/* parse */
|
||||
if (htmllexStart(self->scanner, self->userData, s, slen)!=0) {
|
||||
PyErr_SetString(PyExc_MemoryError, "could not start scanner");
|
||||
return NULL;
|
||||
}
|
||||
if (yyparse(self->scanner)!=0) {
|
||||
if (self->userData->exc_type!=NULL) {
|
||||
/* note: we give away these objects, so dont decref */
|
||||
PyErr_Restore(self->userData->exc_type,
|
||||
self->userData->exc_val,
|
||||
self->userData->exc_tb);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
if (htmllexStop(self->scanner, self->userData)!=0) {
|
||||
PyErr_SetString(PyExc_MemoryError, "could not stop scanner");
|
||||
return NULL;
|
||||
}
|
||||
Py_INCREF(Py_None);
|
||||
return Py_None;
|
||||
}
|
||||
|
||||
|
||||
/* reset the parser. This will erase all buffered data! */
|
||||
static PyObject* parser_reset(parser_object* self, PyObject* args) {
|
||||
if (!PyArg_ParseTuple(args, "")) {
|
||||
PyErr_SetString(PyExc_TypeError, "no args required");
|
||||
return NULL;
|
||||
}
|
||||
if (htmllexDestroy(self->scanner)!=0) {
|
||||
PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data");
|
||||
return NULL;
|
||||
}
|
||||
/* reset buffer */
|
||||
RESIZE_BUF(self->userData->buf);
|
||||
RESIZE_BUF(self->userData->tmp_buf);
|
||||
self->userData->bufpos = 0;
|
||||
self->userData->tmp_tag = self->userData->tmp_attrs =
|
||||
self->userData->tmp_attrval = self->userData->tmp_attrname = NULL;
|
||||
self->scanner = NULL;
|
||||
if (htmllexInit(&(self->scanner), self->userData)!=0) {
|
||||
PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data");
|
||||
return NULL;
|
||||
}
|
||||
Py_INCREF(Py_None);
|
||||
return Py_None;
|
||||
}
|
||||
|
||||
|
||||
/* type interface */
|
||||
static PyMethodDef parser_methods[] = {
|
||||
/* incremental parsing */
|
||||
{"feed", (PyCFunction) parser_feed, METH_VARARGS},
|
||||
/* reset the parser (no flushing) */
|
||||
{"reset", (PyCFunction) parser_reset, METH_VARARGS},
|
||||
/* flush the parser buffers */
|
||||
{"flush", (PyCFunction) parser_flush, METH_VARARGS},
|
||||
{NULL, NULL}
|
||||
};
|
||||
|
||||
|
||||
/* Attribute lookup for parser objects: resolves `name` against the
 * parser method table. */
static PyObject* parser_getattr(parser_object* self, char* name) {
    PyObject* obj = (PyObject*) self;
    return Py_FindMethod(parser_methods, obj, name);
}
|
||||
|
||||
|
||||
/* Type object of the parser; only deallocation and attribute lookup are
 * provided, every other slot keeps its default. */
statichere PyTypeObject parser_type = {
    PyObject_HEAD_INIT(NULL)
    0,                              /* ob_size */
    "parser",                       /* tp_name */
    sizeof(parser_object),          /* tp_basicsize */
    0,                              /* tp_itemsize */
    (destructor) parser_dealloc,    /* tp_dealloc */
    0,                              /* tp_print */
    (getattrfunc) parser_getattr,   /* tp_getattr */
    0                               /* tp_setattr */
};
|
||||
|
||||
|
||||
/* python module interface */
|
||||
static PyMethodDef htmlsax_methods[] = {
|
||||
{"parser", htmlsax_parser, METH_VARARGS},
|
||||
{NULL, NULL}
|
||||
};
|
||||
|
||||
|
||||
/* initialization of the htmlsaxhtmlop module */
|
||||
void inithtmlsax(void) {
|
||||
Py_InitModule("htmlsax", htmlsax_methods);
|
||||
/*yydebug = 1;*/
|
||||
}
|
||||
44
linkcheck/parser/htmlsax.h
Normal file
44
linkcheck/parser/htmlsax.h
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
#ifndef HTMLSAX_H
|
||||
#define HTMLSAX_H
|
||||
|
||||
#include "Python.h"
|
||||
|
||||
/* require Python >= 2.0 */
|
||||
#ifndef PY_VERSION_HEX
|
||||
#error please install Python >= 2.0
|
||||
#endif
|
||||
|
||||
#if PY_VERSION_HEX < 0x02000000
|
||||
#error please install Python >= 2.0
|
||||
#endif
|
||||
|
||||
/* user_data type for SAX calls */
|
||||
typedef struct {
|
||||
/* the Python SAX class instance to issue callbacks */
|
||||
PyObject* handler;
|
||||
/* Buffer to store still-to-be-scanned characters. After recognizing
|
||||
* a complete syntax element, all data up to bufpos will be removed.
|
||||
* Before scanning you should append new data to this buffer.
|
||||
*/
|
||||
char* buf;
|
||||
/* current position in the buffer counting from zero */
|
||||
int bufpos;
|
||||
/* current position of next syntax element */
|
||||
int nextpos;
|
||||
/* temporary vars */
|
||||
void* lexbuf;
|
||||
char* tmp_buf;
|
||||
PyObject* tmp_tag;
|
||||
PyObject* tmp_attrname;
|
||||
PyObject* tmp_attrval;
|
||||
PyObject* tmp_attrs;
|
||||
/* stored Python exception (if error occurred in scanner) */
|
||||
PyObject* exc_type;
|
||||
PyObject* exc_val;
|
||||
PyObject* exc_tb;
|
||||
/* error string */
|
||||
char* error;
|
||||
} UserData;
|
||||
/* stpcpy copies src to dest; the first argument is the destination */
extern char* stpcpy(char* dest, const char* src);
|
||||
|
||||
#endif
|
||||
Loading…
Reference in a new issue