git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1353 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2004-07-06 20:34:00 +00:00
parent c6ba035b2b
commit 3bbfac47c7
9 changed files with 0 additions and 2285 deletions

View file

@ -1,2 +0,0 @@
*.so
*.output

View file

@ -1,27 +0,0 @@
# Build helper for the htmlsax scanner/parser sources.
# parser needs flex >= 2.5.xx from http://lex.sf.net/
# for reentrant bison parser support!
FLEX = flex
BISON = bison
CC = gcc
PYVER = 2.3
PYTHON = python$(PYVER)
# Tunable diagnostics/optimisation flags; override with `make CFLAGS=...`.
CFLAGS = -g -O3 -Wall -pedantic -Wstrict-prototypes
# Flags the compile step cannot work without.  They are kept apart from
# CFLAGS so that a command-line CFLAGS override does not drop them.
REQUIRED_FLAGS = -fPIC -I. -I/usr/include/$(PYTHON)

# Remove a half-written generated file when its recipe fails, so it is not
# mistaken for an up-to-date target on the next run.
.DELETE_ON_ERROR:

# These names are commands, not files.
.PHONY: all test clean splint

all: htmllex.c htmlparse.c

%.o: %.c
	$(CC) $(CFLAGS) $(REQUIRED_FLAGS) -c $< -o $@

# A single bison run writes htmlparse.c (named by %output in the grammar) and
# htmlparse.h (requested by %defines).  A pattern rule with two targets tells
# make that one recipe execution produces both files, so `make -j` does not
# start bison twice as it would with two explicit targets on one rule.
%.c %.h: %.y htmlsax.h
	$(BISON) $<

# Regenerate the scanner whenever the bison-generated header changes
# (presumably the scanner uses the token definitions written there).
htmllex.l: htmlparse.h

htmllex.c: htmllex.l htmlsax.h
	$(FLEX) htmllex.l

# Redirect instead of `cat |` so that a missing test.html fails the target.
test: testsax
	./testsax < test.html

clean:
	$(RM) htmlparse.c htmlparse.h htmllex.c *.o *.output

splint:
	splint -initallelements +posixlib -I/usr/include/linux -I. -I/usr/include/$(PYTHON) htmllex.c | less

View file

@ -1,118 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2004 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""Fast HTML parser module written in C with the following features:
1. Reentrant
As soon as any HTML string data is available, we try to feed it
to the HTML parser. This means that the parser has to scan possibly
incomplete data, recognizing as much as it can. Incomplete trailing
data is saved for subsequent calls (or it is just flushed away with the
flush() function).
A reset() brings the parser back to its initial state, throwing away all
buffered data.
2. Coping with HTML syntax errors
The parser recognizes as much as it can and passes the rest
of the data as TEXT tokens.
The scanner only passes complete recognized HTML syntax elements to
the parser. Invalid syntax elements are passed as TEXT. This way we do
not need the bison error recovery.
Incomplete data is rescanned the next time the parser calls yylex() or
when it is being flush()ed.
The following syntax errors will be recognized correctly:
a) missing quotes around attribute values
b) "</...>" end tags in script mode
c) missing ">" in tags
d) invalid tag names
e) invalid characters inside tags or tag attributes
Additionally the parser has the following features:
a) NULL bytes are changed into spaces
b) <!-- ... --> inside a <script> or <style> are not treated as
comments, so you can safely turn on the comment delete rule
3. Speed
The FLEX code has options to generate a large but fast scanner.
The parser ignores forbidden or unnecessary HTML end tags.
The parser converts tag and attribute names to lower case for easier
matching.
The parser quotes all attribute values with minimal necessity (this is
not standard compliant, but who cares when the browsers understand it).
The Python memory management interface is being used.
"""
__version__ = "$Revision$"[11:-2]
__date__ = "$Date$"[7:-2]
import re
import htmlentitydefs
def _resolve_entity (mo):
    """Resolve one numeric character reference (&#NNN; or &#xHHH;).

    mo is a match object whose whole match is the reference text and whose
    named group "num" holds the digits.  Returns the character for code
    points in the 7-bit ASCII range; any other reference is returned
    unchanged.
    """
    ent = mo.group()
    num = mo.group("num")
    # The pattern is case-insensitive, so "&#x" and "&#X" both mark hex.
    if ent[:3].lower() == '&#x':
        radix = 16
    else:
        radix = 10
    try:
        num = int(num, radix)
    except ValueError:
        # hex digits without the x marker (e.g. "&#4a;") are no valid
        # reference: leave the text alone
        return ent
    # check 7-bit ASCII char range
    if 0 <= num <= 127:
        return chr(num)
    # not in range
    return ent


def resolve_entities (s):
    """Resolve numeric character references in the 7-bit ASCII range.

    Decimal (&#106;) and hexadecimal (&#x6A; / &#X6a;) references are
    replaced by the character they denote, to eliminate obfuscation.
    References outside the ASCII range are left untouched.
    """
    # [0-9a-f] (instead of \d) lets hexadecimal references with letter
    # digits match at all
    return re.sub(r'(?i)&#x?(?P<num>[0-9a-f]+);', _resolve_entity, s)
# Replacement table for named HTML entities: one ("&name;", replacement)
# pair for every entry of htmlentitydefs.entitydefs.
entities = htmlentitydefs.entitydefs.items()
UnHtmlTable = [("&"+x[0]+";", x[1]) for x in entities]
# order matters!
# The table ends up in descending sort order; applyTable() performs the
# replacements one after another in exactly this order.
UnHtmlTable.sort()
UnHtmlTable.reverse()
def applyTable (table, s):
    """Apply every (old, new) replacement pair of table to s, in order.

    Each pair is applied to the result of the previous replacement; the
    final string is returned.
    """
    result = s
    for pair in table:
        old = pair[0]
        new = pair[1]
        result = result.replace(old, new)
    return result
def resolve_html_entities (s):
    """Return s with the named HTML entities of UnHtmlTable replaced."""
    resolved = applyTable(UnHtmlTable, s)
    return resolved
def strip_quotes (s):
    """Return s without its first and last character when s both starts
    and ends with the same quote character (single or double); otherwise
    return s unchanged."""
    for quote in ("'", '"'):
        if s.startswith(quote) and s.endswith(quote):
            return s[1:-1]
    return s

File diff suppressed because it is too large Load diff

View file

@ -1,115 +0,0 @@
# -*- coding: iso-8859-1 -*-
"""Default handler classes"""
# Copyright (C) 2000-2004 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
__version__ = "$Revision$"[11:-2]
__date__ = "$Date$"[7:-2]
import sys
class HtmlPrinter (object):
    """Debugging handler that prints every callback it receives.

    Any attribute not defined on the class is treated as a handler
    callback: looking it up yields a callable that writes the callback
    name followed by the tuple of positional arguments to the output file.
    """

    def __init__ (self, fd=sys.stdout):
        """Store the file object fd that callback output is written to."""
        self.fd = fd

    def _write_call (self, name, attrs):
        """Write one line: the callback name and its argument tuple."""
        self.fd.write("%s %s\n" % (name, attrs))

    def _print (self, *attrs):
        """Print attrs under the most recently looked-up callback name."""
        self._write_call(self.mem, attrs)

    def _errorfun (self, msg, name):
        """print msg to stderr with name prefix"""
        print >> sys.stderr, name, msg

    def error (self, msg):
        """signal a filter/parser error"""
        self._errorfun(msg, "error:")

    def warning (self, msg):
        """signal a filter/parser warning"""
        self._errorfun(msg, "warning:")

    def fatalError (self, msg):
        """signal a fatal filter/parser error"""
        self._errorfun(msg, "fatal error:")

    def __getattr__ (self, name):
        """Return a printer bound to the looked-up callback name.

        The name is captured per lookup, so a callable that was fetched
        earlier still prints its own name after other attributes have been
        looked up.  self.mem is still updated for code that reads it.
        """
        self.mem = name
        def print_call (*attrs):
            self._write_call(name, attrs)
        return print_call
class HtmlPrettyPrinter (object):
    """Handler that writes the parsed HTML data back out as markup."""

    def __init__ (self, fd=sys.stdout):
        """Remember the file object fd all output is written to."""
        self.fd = fd

    def comment (self, data):
        """Write data as an HTML comment."""
        out = self.fd
        out.write("<!--%s-->" % data)

    def startElement (self, tag, attrs):
        """Write a start tag with its attributes.

        Slashes are removed from the tag name.  Attributes whose value is
        None are written as a bare name, all others as name="value" with
        the value passed through quote_attrval().
        """
        out = self.fd
        out.write("<%s" % tag.replace("/", ""))
        for name, value in attrs.iteritems():
            if value is not None:
                out.write(" %s=\"%s\"" % (name, quote_attrval(value)))
            else:
                out.write(" %s" % name)
        out.write(">")

    def endElement (self, tag):
        """Write the end tag for tag."""
        out = self.fd
        out.write("</%s>" % tag)

    def doctype (self, data):
        """Write a document type declaration."""
        out = self.fd
        out.write("<!DOCTYPE%s>" % data)

    def pi (self, data):
        """Write a processing instruction."""
        out = self.fd
        out.write("<?%s?>" % data)

    def cdata (self, data):
        """Write a CDATA section."""
        out = self.fd
        out.write("<![CDATA[%s]]>" % data)

    def characters (self, data):
        """Write character data unchanged."""
        out = self.fd
        out.write(data)
def quote_attrval (val):
    """Return val with every double quote replaced by &quot; so the value
    can be wrapped in double quotes."""
    pieces = val.split('"')
    return '&quot;'.join(pieces)

View file

@ -1,840 +0,0 @@
%{
/* Copyright (C) 2000-2004 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/* Python module definition of a SAX html parser */
#include "htmlsax.h"
#include "structmember.h"
#include <string.h>
#include <stdio.h>
/* bison type definitions */
#define YYSTYPE PyObject*
#define YYPARSE_PARAM scanner
#define YYLEX_PARAM scanner
/* extern functions found in htmllex.l */
extern int yylex(YYSTYPE* yylvalp, void* scanner);
extern int htmllexInit (void** scanner, UserData* data);
extern int htmllexDebug (void** scanner, int debug);
extern int htmllexStart (void* scanner, UserData* data, const char* s, int slen);
extern int htmllexStop (void* scanner, UserData* data);
extern int htmllexDestroy (void* scanner);
extern void* yyget_extra(void*);
extern int yyget_lineno(void*);
#define YYERROR_VERBOSE 1
/* Standard bison error reporting hook.  A call indicates an internal error;
   the message is written to stderr.  The parameter is const because the
   generated parser passes string literals.  Always returns 0. */
static int yyerror (const char* msg) {
    fprintf(stderr, "htmlsax: internal parse error: %s\n", msg);
    return 0;
}
/* parser.resolve_entities */
static PyObject* resolve_entities;
static PyObject* list_dict;
/* macros for easier scanner state manipulation */
/* Evaluates to true when tag is NOT one of the elements listed below
   (area, base, basefont, br, col, frame, hr, img, input, isindex, link,
   meta, param).  The grammar actions invoke the handler's endElement
   callback only when this macro is true, so the listed elements never
   produce an endElement call.  tag must be a NUL-terminated C string. */
#define NO_HTML_END_TAG(tag) !(strcmp(tag, "area")==0 || \
strcmp(tag, "base")==0 || \
strcmp(tag, "basefont")==0 || \
strcmp(tag, "br")==0 || \
strcmp(tag, "col")==0 || \
strcmp(tag, "frame")==0 || \
strcmp(tag, "hr")==0 || \
strcmp(tag, "img")==0 || \
strcmp(tag, "input")==0 || \
strcmp(tag, "isindex")==0 || \
strcmp(tag, "link")==0 || \
strcmp(tag, "meta")==0 || \
strcmp(tag, "param")==0)
/* Shrink buffer b to a single byte holding the empty string.
   On allocation failure b keeps its previous memory, a MemoryError is set
   and the enclosing function returns NULL.  Wrapped in do/while so the
   macro behaves as one statement. */
#define CLEAR_BUF(b) \
    do { \
        char* clear_buf_new = (char*) PyMem_Realloc((b), 1); \
        if (clear_buf_new == NULL) return PyErr_NoMemory(); \
        (b) = clear_buf_new; \
        (b)[0] = '\0'; \
    } while (0)
/* Same as CLEAR_BUF, but additionally releases one reference to self before
   returning NULL on allocation failure. */
#define CLEAR_BUF_DECREF(self, b) \
    do { \
        char* clear_buf_new = (char*) PyMem_Realloc((b), 1); \
        if (clear_buf_new == NULL) { \
            Py_DECREF(self); \
            return PyErr_NoMemory(); \
        } \
        (b) = clear_buf_new; \
        (b)[0] = '\0'; \
    } while (0)
/* If ud->error is set and the handler has an "error" attribute, call
   handler.error(ud->error).  Expects the locals `callback`, `result` and
   `error` in the enclosing scope; on failure sets error=1 and jumps to
   label.  callback and result are left set on success.
   NOTE(review): every use visible in this file is directly followed by a
   finish_ label that Py_XDECREFs both - keep it that way. */
#define CHECK_ERROR(ud, label) \
if (ud->error && PyObject_HasAttrString(ud->handler, "error")==1) { \
callback = PyObject_GetAttrString(ud->handler, "error"); \
if (!callback) { error=1; goto label; } \
result = PyObject_CallFunction(callback, "O", ud->error); \
if (!result) { error=1; goto label; } \
}
/* generic callback macro */
/* If the handler has attribute attr, call it with the single argument arg
   converted according to the Py_BuildValue-style format.  Expects the locals
   `callback`, `result` and `error` in the enclosing scope.  On failure sets
   error=1 and jumps to label with callback/result possibly still set; on
   success both are released and reset to NULL. */
#define CALLBACK(ud, attr, format, arg, label) \
if (PyObject_HasAttrString(ud->handler, attr)==1) { \
callback = PyObject_GetAttrString(ud->handler, attr); \
if (callback==NULL) { error=1; goto label; } \
result = PyObject_CallFunction(callback, format, arg); \
if (result==NULL) { error=1; goto label; } \
Py_DECREF(callback); \
Py_DECREF(result); \
callback=result=NULL; \
}
/* set old line and column */
/* Copies the current line/column counters of the local `ud` into its
   last_lineno/last_column fields.  Expands to two statements without a
   trailing semicolon, so use it only as a full statement. */
#define SET_OLD_LINECOL \
ud->last_lineno = ud->lineno; \
ud->last_column = ud->column
/* parser type definition */
typedef struct {
PyObject_HEAD
/* handler object receiving the callbacks; set to Py_None by parser_new */
PyObject* handler;
/* scanner/parser state shared with the lexer; allocated in parser_new */
UserData* userData;
/* opaque scanner handle created by htmllexInit */
void* scanner;
} parser_object;
staticforward PyTypeObject parser_type;
/* use Pythons memory management */
#define malloc PyMem_Malloc
#define realloc PyMem_Realloc
#define free PyMem_Free
%}
/* parser options */
%verbose
%debug
%defines
%output="htmlparse.c"
%pure_parser
/* parser tokens */
%token T_WAIT
%token T_ERROR
%token T_TEXT
%token T_ELEMENT_START
%token T_ELEMENT_START_END
%token T_ELEMENT_END
%token T_SCRIPT
%token T_STYLE
%token T_PI
%token T_COMMENT
%token T_CDATA
%token T_DOCTYPE
/* the finish_ labels are for error recovery */
%%
/* A document is a non-empty sequence of elements. */
elements: element {}
| elements element {}
;
/* Each alternative handles one scanner token.  For tokens that carry data,
   the semantic value $1 is a Python object which the action releases. */
element: T_WAIT { YYACCEPT; /* wait for more lexer input */ }
| T_ERROR
{
/* an error occurred in the scanner, the python exception must be set */
/* Stash the pending exception in the user data and abort the parse;
   parser_feed() restores it after yyparse() has returned. */
UserData* ud = yyget_extra(scanner);
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
/* Start tag: call handler.startElement(tag, attrs) if the handler has it,
   then report a pending scanner error message via CHECK_ERROR. */
| T_ELEMENT_START
{
/* $1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyString, <attrs> is a PyDict */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
PyObject* tag = PyTuple_GET_ITEM($1, 0);
PyObject* attrs = PyTuple_GET_ITEM($1, 1);
int error = 0;
if (!tag || !attrs) { error = 1; goto finish_start; }
if (PyObject_HasAttrString(ud->handler, "startElement")==1) {
callback = PyObject_GetAttrString(ud->handler, "startElement");
if (!callback) { error=1; goto finish_start; }
result = PyObject_CallFunction(callback, "OO", tag, attrs);
if (!result) { error=1; goto finish_start; }
Py_DECREF(callback);
Py_DECREF(result);
callback=result=NULL;
}
CHECK_ERROR(ud, finish_start);
finish_start:
/* common cleanup for the success and the error path */
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(result);
/* NOTE(review): PyTuple_GET_ITEM returns borrowed references, yet tag and
   attrs are released here in addition to $1 - confirm that the scanner
   hands over extra references for them. */
Py_XDECREF(tag);
Py_XDECREF(attrs);
Py_DECREF($1);
if (error) {
/* keep the exception for parser_feed() and stop parsing */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
SET_OLD_LINECOL;
}
/* Start tag that is closed immediately: call handler.startElement(tag, attrs)
   and then handler.endElement(tag); the latter is skipped for the elements
   listed in NO_HTML_END_TAG. */
| T_ELEMENT_START_END
{
/* $1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyString, <attrs> is a PyDict */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
PyObject* tag = PyTuple_GET_ITEM($1, 0);
PyObject* attrs = PyTuple_GET_ITEM($1, 1);
int error = 0;
char* tagname;
if (!tag || !attrs) { error = 1; goto finish_start_end; }
if (PyObject_HasAttrString(ud->handler, "startElement")==1) {
callback = PyObject_GetAttrString(ud->handler, "startElement");
if (!callback) { error=1; goto finish_start_end; }
result = PyObject_CallFunction(callback, "OO", tag, attrs);
if (!result) { error=1; goto finish_start_end; }
Py_DECREF(callback);
Py_DECREF(result);
callback=result=NULL;
}
tagname = PyString_AS_STRING(tag);
if (PyObject_HasAttrString(ud->handler, "endElement")==1 &&
NO_HTML_END_TAG(tagname)) {
callback = PyObject_GetAttrString(ud->handler, "endElement");
if (callback==NULL) { error=1; goto finish_start_end; }
result = PyObject_CallFunction(callback, "O", tag);
if (result==NULL) { error=1; goto finish_start_end; }
Py_DECREF(callback);
Py_DECREF(result);
callback=result=NULL;
}
CHECK_ERROR(ud, finish_start_end);
finish_start_end:
/* common cleanup for the success and the error path */
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(result);
/* NOTE(review): tag and attrs come from PyTuple_GET_ITEM (borrowed
   references) but are released here - confirm the scanner's ownership
   contract. */
Py_XDECREF(tag);
Py_XDECREF(attrs);
Py_DECREF($1);
if (error) {
/* keep the exception for parser_feed() and stop parsing */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
SET_OLD_LINECOL;
}
/* End tag: call handler.endElement(tag) unless the tag is one of the
   elements listed in NO_HTML_END_TAG. */
| T_ELEMENT_END
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
char* tagname = PyString_AS_STRING($1);
if (PyObject_HasAttrString(ud->handler, "endElement")==1 &&
NO_HTML_END_TAG(tagname)) {
callback = PyObject_GetAttrString(ud->handler, "endElement");
if (callback==NULL) { error=1; goto finish_end; }
result = PyObject_CallFunction(callback, "O", $1);
if (result==NULL) { error=1; goto finish_end; }
Py_DECREF(callback);
Py_DECREF(result);
callback=result=NULL;
}
CHECK_ERROR(ud, finish_end);
finish_end:
/* common cleanup for the success and the error path */
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(result);
Py_DECREF($1);
if (error) {
/* keep the exception for parser_feed() and stop parsing */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
SET_OLD_LINECOL;
}
/* Comment: call handler.comment(data) if the handler defines it. */
| T_COMMENT
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
CALLBACK(ud, "comment", "O", $1, finish_comment);
CHECK_ERROR(ud, finish_comment);
finish_comment:
/* common cleanup for the success and the error path */
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(result);
Py_DECREF($1);
if (error) {
/* keep the exception for parser_feed() and stop parsing */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
SET_OLD_LINECOL;
}
/* Processing instruction: call handler.pi(data) if the handler defines it. */
| T_PI
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
CALLBACK(ud, "pi", "O", $1, finish_pi);
CHECK_ERROR(ud, finish_pi);
finish_pi:
/* common cleanup for the success and the error path */
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(result);
Py_DECREF($1);
if (error) {
/* keep the exception for parser_feed() and stop parsing */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
SET_OLD_LINECOL;
}
/* CDATA section: call handler.cdata(data) if the handler defines it. */
| T_CDATA
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
CALLBACK(ud, "cdata", "O", $1, finish_cdata);
CHECK_ERROR(ud, finish_cdata);
finish_cdata:
/* common cleanup for the success and the error path */
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(result);
Py_DECREF($1);
if (error) {
/* keep the exception for parser_feed() and stop parsing */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
SET_OLD_LINECOL;
}
/* Document type declaration: call handler.doctype(data) if defined. */
| T_DOCTYPE
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
CALLBACK(ud, "doctype", "O", $1, finish_doctype);
CHECK_ERROR(ud, finish_doctype);
finish_doctype:
/* common cleanup for the success and the error path */
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(result);
Py_DECREF($1);
if (error) {
/* keep the exception for parser_feed() and stop parsing */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
SET_OLD_LINECOL;
}
/* Script content: passed to handler.characters(data), followed by
   handler.endElement("script"). */
| T_SCRIPT
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
CALLBACK(ud, "characters", "O", $1, finish_script);
CALLBACK(ud, "endElement", "s", "script", finish_script);
CHECK_ERROR(ud, finish_script);
finish_script:
/* common cleanup for the success and the error path */
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(result);
Py_DECREF($1);
if (error) {
/* keep the exception for parser_feed() and stop parsing */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
SET_OLD_LINECOL;
}
/* Style content: passed to handler.characters(data), followed by
   handler.endElement("style"). */
| T_STYLE
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
CALLBACK(ud, "characters", "O", $1, finish_style);
CALLBACK(ud, "endElement", "s", "style", finish_style);
CHECK_ERROR(ud, finish_style);
finish_style:
/* common cleanup for the success and the error path */
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(result);
Py_DECREF($1);
if (error) {
/* keep the exception for parser_feed() and stop parsing */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
SET_OLD_LINECOL;
}
/* Plain text: call handler.characters(data) if the handler defines it. */
| T_TEXT
{
/* $1 is a PyString */
/* Remember this is also called as a lexer error fallback */
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
CALLBACK(ud, "characters", "O", $1, finish_characters);
CHECK_ERROR(ud, finish_characters);
finish_characters:
/* common cleanup for the success and the error path */
Py_XDECREF(ud->error);
ud->error = NULL;
Py_XDECREF(callback);
Py_XDECREF(result);
Py_DECREF($1);
if (error) {
/* keep the exception for parser_feed() and stop parsing */
PyErr_Fetch(&(ud->exc_type), &(ud->exc_val), &(ud->exc_tb));
YYABORT;
}
SET_OLD_LINECOL;
}
;
%%
/* disable python memory interface */
#undef malloc
#undef realloc
#undef free
/* Create a parser object (tp_new).
   Allocates the object and its UserData, sets the handler to None,
   initialises every UserData field and creates the scanner.
   Returns a new reference, or NULL on failure. */
static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds) {
    parser_object* self;
    UserData* ud;
    if ((self = (parser_object*) type->tp_alloc(type, 0)) == NULL) {
        return NULL;
    }
    Py_INCREF(Py_None);
    self->handler = Py_None;
    self->scanner = NULL;
    /* PyMem_New takes an element count, not a byte size: one UserData */
    self->userData = ud = PyMem_New(UserData, 1);
    if (ud == NULL) {
        Py_DECREF(self);
        return PyErr_NoMemory();
    }
    /* Initialise every field before the first operation that can fail, so
       that the deallocator never sees uninitialised pointers. */
    ud->handler = self->handler;
    ud->buf = NULL;
    ud->tmp_buf = NULL;
    ud->nextpos = 0;
    ud->bufpos = 0;
    ud->pos = 0;
    ud->column = 1;
    ud->last_column = 1;
    ud->lineno = 1;
    ud->last_lineno = 1;
    ud->tmp_tag = NULL;
    ud->tmp_attrname = NULL;
    ud->tmp_attrval = NULL;
    ud->tmp_attrs = NULL;
    ud->lexbuf = NULL;
    ud->resolve_entities = resolve_entities;
    ud->list_dict = list_dict;
    ud->exc_type = NULL;
    ud->exc_val = NULL;
    ud->exc_tb = NULL;
    ud->error = NULL;
    /* both buffers start out as empty strings */
    CLEAR_BUF_DECREF(self, self->userData->buf);
    CLEAR_BUF_DECREF(self, self->userData->tmp_buf);
    if (htmllexInit(&(self->scanner), self->userData)!=0) {
        Py_DECREF(self);
        return NULL;
    }
    return (PyObject*) self;
}
/* Initialise a parser object (tp_init).
   Accepts one optional argument "handler"; when given it replaces the
   current handler.  Returns 0 on success, -1 on argument errors. */
static int parser_init (parser_object* self, PyObject* args, PyObject* kwds) {
    PyObject* handler = NULL;
    PyObject* old_handler;
    static char *kwlist[] = {"handler", NULL};
    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwlist, &handler)) {
        return -1;
    }
    if (handler==NULL) {
        return 0;
    }
    /* Install the new handler before releasing the old one: releasing may
       run arbitrary code, which must never see a stale handler pointer. */
    old_handler = self->handler;
    Py_INCREF(handler);
    self->handler = handler;
    self->userData->handler = handler;
    Py_XDECREF(old_handler);
    return 0;
}
/* Traverse all subobjects participating in reference cycles (tp_traverse).
   The handler is the only such object.  It may be NULL once parser_clear()
   has run, in which case there is nothing to visit.  A non-zero result of
   visit is returned unchanged. */
static int parser_traverse (parser_object* self, visitproc visit, void* arg) {
    if (self->handler != NULL) {
        int res = visit(self->handler, arg);
        if (res != 0) {
            return res;
        }
    }
    return 0;
}
/* Drop all subobjects participating in reference cycles (tp_clear).
   The handler fields are reset before the reference is released, because
   releasing may run arbitrary code that looks at this object again.
   Always returns 0. */
static int parser_clear (parser_object* self) {
    PyObject* handler = self->handler;
    self->handler = NULL;
    /* userData is NULL if its allocation failed in parser_new */
    if (self->userData != NULL) {
        self->userData->handler = NULL;
    }
    Py_XDECREF(handler);
    return 0;
}
/* Free all resources of a parser object (tp_dealloc). */
static void parser_dealloc (parser_object* self) {
    /* the type has Py_TPFLAGS_HAVE_GC: stop the collector from looking at
       the object while its fields are torn down */
    PyObject_GC_UnTrack((PyObject*)self);
    if (self->scanner != NULL) {
        htmllexDestroy(self->scanner);
        self->scanner = NULL;
    }
    parser_clear(self);
    /* userData is NULL if its allocation failed in parser_new */
    if (self->userData != NULL) {
        PyMem_Del(self->userData->buf);
        PyMem_Del(self->userData->tmp_buf);
        PyMem_Del(self->userData);
        self->userData = NULL;
    }
    self->ob_type->tp_free((PyObject*)self);
}
/* Feed a chunk of data to the parser.
   Takes one string argument, hands it to the scanner and runs the parser
   over it.  Returns None, or NULL with an exception set on failure. */
static PyObject* parser_feed (parser_object* self, PyObject* args) {
    /* set up the parse string */
    int slen = 0;
    char* s = NULL;
    UserData* ud = self->userData;
    if (!PyArg_ParseTuple(args, "t#", &s, &slen)) {
        PyErr_SetString(PyExc_TypeError, "string arg required");
        return NULL;
    }
    /* parse */
    if (htmllexStart(self->scanner, ud, s, slen)!=0) {
        PyErr_SetString(PyExc_MemoryError, "could not start scanner");
        return NULL;
    }
    if (yyparse(self->scanner)!=0) {
        if (ud->exc_type!=NULL) {
            /* note: we give away these objects, so don't decref */
            PyErr_Restore(ud->exc_type, ud->exc_val, ud->exc_tb);
            /* the references are gone: do not keep stale pointers that a
               later failure could hand to PyErr_Restore again */
            ud->exc_type = NULL;
            ud->exc_val = NULL;
            ud->exc_tb = NULL;
        }
        htmllexStop(self->scanner, ud);
        if (!PyErr_Occurred()) {
            /* the parser failed without a stored Python exception; never
               return NULL with no exception set */
            PyErr_SetString(PyExc_RuntimeError, "htmlsax: internal parse error");
        }
        return NULL;
    }
    if (htmllexStop(self->scanner, ud)!=0) {
        PyErr_SetString(PyExc_MemoryError, "could not stop scanner");
        return NULL;
    }
    Py_RETURN_NONE;
}
/* flush all parser buffers */
/* Takes no arguments.  Drops the temporary scanner state, passes any data
   still waiting in the buffer to handler.characters() (if the handler
   defines it) and recreates the scanner.  Returns the integer res, which is
   never changed from 0 in this function, or NULL on error. */
static PyObject* parser_flush (parser_object* self, PyObject* args) {
int res = 0;
if (!PyArg_ParseTuple(args, "")) {
PyErr_SetString(PyExc_TypeError, "no args required");
return NULL;
}
/* reset parser variables */
CLEAR_BUF(self->userData->tmp_buf);
Py_XDECREF(self->userData->tmp_tag);
Py_XDECREF(self->userData->tmp_attrs);
Py_XDECREF(self->userData->tmp_attrval);
Py_XDECREF(self->userData->tmp_attrname);
self->userData->tmp_tag = self->userData->tmp_attrs =
self->userData->tmp_attrval = self->userData->tmp_attrname = NULL;
self->userData->bufpos = 0;
/* anything left in the buffer is delivered as character data */
if (strlen(self->userData->buf)) {
/* XXX set line, col */
int error = 0;
/* copy the buffer content before the buffer is cleared below */
PyObject* s = PyString_FromString(self->userData->buf);
PyObject* callback = NULL;
PyObject* result = NULL;
/* reset buffer */
/* NOTE(review): CLEAR_BUF returns from the function on failure, which
   would skip the release of s at finish_flush - confirm. */
CLEAR_BUF(self->userData->buf);
if (s==NULL) { error=1; goto finish_flush; }
if (PyObject_HasAttrString(self->handler, "characters")==1) {
callback = PyObject_GetAttrString(self->handler, "characters");
if (callback==NULL) { error=1; goto finish_flush; }
result = PyObject_CallFunction(callback, "O", s);
if (result==NULL) { error=1; goto finish_flush; }
}
finish_flush:
/* common cleanup for the success and the error path */
Py_XDECREF(callback);
Py_XDECREF(result);
Py_XDECREF(s);
if (error==1) {
return NULL;
}
}
/* replace the scanner with a fresh one */
if (htmllexDestroy(self->scanner)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data");
return NULL;
}
self->scanner = NULL;
if (htmllexInit(&(self->scanner), self->userData)!=0) {
PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data");
return NULL;
}
return Py_BuildValue("i", res);
}
/* Return the current parser line number as a Python integer.
   Takes no arguments; raises TypeError otherwise. */
static PyObject* parser_lineno (parser_object* self, PyObject* args) {
    if (PyArg_ParseTuple(args, "")) {
        return Py_BuildValue("i", self->userData->lineno);
    }
    PyErr_SetString(PyExc_TypeError, "no args required");
    return NULL;
}
/* Return the last parser line number as a Python integer.
   Takes no arguments; raises TypeError otherwise. */
static PyObject* parser_last_lineno (parser_object* self, PyObject* args) {
    if (PyArg_ParseTuple(args, "")) {
        return Py_BuildValue("i", self->userData->last_lineno);
    }
    PyErr_SetString(PyExc_TypeError, "no args required");
    return NULL;
}
/* Return the current parser column number as a Python integer.
   Takes no arguments; raises TypeError otherwise. */
static PyObject* parser_column (parser_object* self, PyObject* args) {
    if (PyArg_ParseTuple(args, "")) {
        return Py_BuildValue("i", self->userData->column);
    }
    PyErr_SetString(PyExc_TypeError, "no args required");
    return NULL;
}
/* Return the last parser column number as a Python integer.
   Takes no arguments; raises TypeError otherwise. */
static PyObject* parser_last_column (parser_object* self, PyObject* args) {
    if (PyArg_ParseTuple(args, "")) {
        return Py_BuildValue("i", self->userData->last_column);
    }
    PyErr_SetString(PyExc_TypeError, "no args required");
    return NULL;
}
/* Return the parser position in the data stream as a Python integer.
   Takes no arguments; raises TypeError otherwise. */
static PyObject* parser_pos (parser_object* self, PyObject* args) {
    if (PyArg_ParseTuple(args, "")) {
        return Py_BuildValue("i", self->userData->pos);
    }
    PyErr_SetString(PyExc_TypeError, "no args required");
    return NULL;
}
/* Reset the parser.  This will erase all buffered data!
   Takes no arguments.  Clears both buffers, resets all position counters,
   releases the temporary tag/attribute objects and recreates the scanner.
   Returns None, or NULL with an exception set on failure. */
static PyObject* parser_reset (parser_object* self, PyObject* args) {
    UserData* ud = self->userData;
    if (!PyArg_ParseTuple(args, "")) {
        PyErr_SetString(PyExc_TypeError, "no args required");
        return NULL;
    }
    /* Reset the buffers first: this step can fail, and failing here leaves
       the scanner untouched instead of destroyed-but-still-referenced. */
    CLEAR_BUF(self->userData->buf);
    CLEAR_BUF(self->userData->tmp_buf);
    ud->bufpos = 0;
    ud->pos = 0;
    ud->nextpos = 0;
    ud->column = 1;
    ud->last_column = 1;
    ud->lineno = 1;
    ud->last_lineno = 1;
    /* release the temporary objects the same way parser_flush does instead
       of just overwriting the pointers */
    Py_XDECREF(ud->tmp_tag);
    Py_XDECREF(ud->tmp_attrs);
    Py_XDECREF(ud->tmp_attrval);
    Py_XDECREF(ud->tmp_attrname);
    ud->tmp_tag = NULL;
    ud->tmp_attrs = NULL;
    ud->tmp_attrval = NULL;
    ud->tmp_attrname = NULL;
    /* replace the scanner with a fresh one */
    if (htmllexDestroy(self->scanner)!=0) {
        PyErr_SetString(PyExc_MemoryError, "could not destroy scanner data");
        return NULL;
    }
    self->scanner = NULL;
    if (htmllexInit(&(self->scanner), self->userData)!=0) {
        PyErr_SetString(PyExc_MemoryError, "could not initialize scanner data");
        return NULL;
    }
    Py_RETURN_NONE;
}
/* Set the debug level from one integer argument: the value is stored in
   yydebug and passed on to htmllexDebug, whose result is returned as a
   Python integer. */
static PyObject* parser_debug (parser_object* self, PyObject* args) {
    int level;
    int lexer_result;
    if (!PyArg_ParseTuple(args, "i", &level)) {
        return NULL;
    }
    yydebug = level;
    lexer_result = htmllexDebug(&(self->scanner), level);
    return PyInt_FromLong((long)lexer_result);
}
/* Getter of the "handler" attribute: returns a new reference to the
   current handler object. */
static PyObject* parser_gethandler (parser_object* self, void* closure) {
    PyObject* handler = self->handler;
    Py_INCREF(handler);
    return handler;
}
/* Setter of the "handler" attribute.
   Deleting the attribute (value == NULL) raises TypeError.
   Returns 0 on success and -1 on error. */
static int parser_sethandler (parser_object* self, PyObject* value, void* closure) {
    PyObject* old_handler;
    if (value == NULL) {
        PyErr_SetString(PyExc_TypeError, "Cannot delete parser handler");
        return -1;
    }
    /* Install the new handler before releasing the old one: releasing may
       run arbitrary code, which must never see a stale handler pointer. */
    old_handler = self->handler;
    Py_INCREF(value);
    self->handler = value;
    self->userData->handler = value;
    Py_XDECREF(old_handler);
    return 0;
}
/* type interface */
/* no plain struct members are exposed */
static PyMemberDef parser_members[] = {
{NULL} /* Sentinel */
};
/* computed attributes: "handler" is read and written through
   parser_gethandler / parser_sethandler */
static PyGetSetDef parser_getset[] = {
{"handler", (getter)parser_gethandler, (setter)parser_sethandler,
"handler object", NULL},
{NULL} /* Sentinel */
};
/* methods of parser objects; all are registered as METH_VARARGS, the ones
   without parameters check for an empty argument tuple themselves */
static PyMethodDef parser_methods[] = {
{"feed", (PyCFunction)parser_feed, METH_VARARGS, "feed data to parse incremental"},
{"reset", (PyCFunction)parser_reset, METH_VARARGS, "reset the parser (no flushing)"},
{"flush", (PyCFunction)parser_flush, METH_VARARGS, "flush parser buffers"},
{"debug", (PyCFunction)parser_debug, METH_VARARGS, "set debug level"},
{"lineno", (PyCFunction)parser_lineno, METH_VARARGS, "get the current line number"},
{"last_lineno", (PyCFunction)parser_last_lineno, METH_VARARGS, "get the last line number"},
{"column", (PyCFunction)parser_column, METH_VARARGS, "get the current column"},
{"last_column", (PyCFunction)parser_last_column, METH_VARARGS, "get the last column"},
{"pos", (PyCFunction)parser_pos, METH_VARARGS, "get the current scanner position"},
{NULL} /* Sentinel */
};
/* Type object of the parser: a subclassable type that takes part in cyclic
   garbage collection (tp_traverse/tp_clear are provided for the handler). */
static PyTypeObject parser_type = {
PyObject_HEAD_INIT(NULL)
0, /* ob_size */
"linkcheck.parser.htmlsax.parser", /* tp_name */
sizeof(parser_object), /* tp_basicsize */
0, /* tp_itemsize */
/* methods */
(destructor)parser_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_compare */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Py_TPFLAGS_HAVE_GC, /* tp_flags */
"HTML parser object", /* tp_doc */
(traverseproc)parser_traverse, /* tp_traverse */
(inquiry)parser_clear, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
parser_methods, /* tp_methods */
parser_members, /* tp_members */
parser_getset, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)parser_init, /* tp_init */
0, /* tp_alloc */
parser_new, /* tp_new */
};
/* python module interface
"Create a new HTML parser object with handler (which may be None).\n"
"\n"
"Used callbacks (they don't have to be defined) of a handler are:\n"
"comment(data): <!--data-->\n"
"startElement(tag, attrs): <tag {attr1:value1,attr2:value2,..}>\n"
"endElement(tag): </tag>\n"
"doctype(data): <!DOCTYPE data?>\n"
"pi(name, data=None): <?name data?>\n"
"cdata(data): <![CDATA[data]]>\n"
"characters(data): data\n"
"\n"
"Additionally, there are error and warning callbacks:\n"
"error(msg)\n"
"warning(msg)\n"
"fatalError(msg)\n"},
*/
/* The module has no module-level functions; the table only holds the
   terminating sentinel required by Py_InitModule3. */
static PyMethodDef htmlsax_methods[] = {
{NULL} /* Sentinel */
};
#ifndef PyMODINIT_FUNC /* declarations for DLL import/export */
#define PyMODINIT_FUNC void
#endif
/* Initialization of the htmlsax module.
   Registers the parser type as "parser" and looks up the two helpers the
   parser needs: linkcheck.parser.resolve_entities and
   linkcheck.containers.ListDict.  On failure the function returns early
   with the Python exception left set. */
PyMODINIT_FUNC inithtmlsax (void) {
    PyObject* m;
    PyObject* helper_module;
    if (PyType_Ready(&parser_type) < 0) {
        return;
    }
    if ((m = Py_InitModule3("htmlsax", htmlsax_methods, "SAX HTML parser routines"))==NULL) {
        return;
    }
    Py_INCREF(&parser_type);
    if (PyModule_AddObject(m, "parser", (PyObject *)&parser_type)==-1) {
        /* init error */
        PyErr_Print();
    }
    /* PyImport_ImportModule returns a new reference; release it as soon as
       the needed attribute has been fetched */
    if ((helper_module = PyImport_ImportModule("linkcheck.parser"))==NULL) {
        return;
    }
    resolve_entities = PyObject_GetAttrString(helper_module, "resolve_entities");
    Py_DECREF(helper_module);
    if (resolve_entities==NULL) {
        return;
    }
    if ((helper_module = PyImport_ImportModule("linkcheck.containers"))==NULL) {
        return;
    }
    list_dict = PyObject_GetAttrString(helper_module, "ListDict");
    Py_DECREF(helper_module);
    if (list_dict==NULL) {
        return;
    }
}

View file

@ -1,83 +0,0 @@
/* Copyright (C) 2000-2004 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef HTMLSAX_H
#define HTMLSAX_H
#include "Python.h"
/* require Python >= 2.3 */
#ifndef PY_VERSION_HEX
#error please install Python >= 2.3
#endif
#if PY_VERSION_HEX < 0x02030000
#error please install Python >= 2.3
#endif
/* this will be in Python 2.4 */
#ifndef Py_RETURN_NONE
#define Py_RETURN_NONE do {Py_INCREF(Py_None); return Py_None;} while (0)
#endif
/* user_data type for SAX calls */
/* State shared between the parser object, the bison grammar actions and the
   scanner; one instance is allocated per parser object. */
typedef struct {
/* the Python SAX object to issue callbacks */
PyObject* handler;
/* Buffer to store still-to-be-scanned characters. After recognizing
* a complete syntax element, all data up to bufpos will be removed.
* Before scanning you should append new data to this buffer.
*/
char* buf;
/* current position in the buffer counting from zero */
unsigned int bufpos;
/* current position of next syntax element */
unsigned int nextpos;
/* position in the stream of data already seen, counting from zero */
unsigned int pos;
/* line counter, counting from one */
unsigned int lineno;
/* last value of line counter */
unsigned int last_lineno;
/* column counter; the parser object initialises and resets it to 1 */
unsigned int column;
/* last value of column counter */
unsigned int last_column;
/* input buffer of lexer, must be deleted when the parsing stops */
void* lexbuf;
/* temporary character buffer */
char* tmp_buf;
/* temporary HTML start or end tag name */
PyObject* tmp_tag;
/* temporary HTML start tag attribute name */
PyObject* tmp_attrname;
/* temporary HTML start tag attribute value */
PyObject* tmp_attrval;
/* temporary HTML start tag attribute list
   (presumably an instance of list_dict - TODO confirm in the scanner) */
PyObject* tmp_attrs;
/* the resolve_entities callable of the linkcheck.parser module */
PyObject* resolve_entities;
/* the ListDict class of the linkcheck.containers module */
PyObject* list_dict;
/* stored Python exception (if error occurred in scanner) */
PyObject* exc_type;
PyObject* exc_val;
PyObject* exc_tb;
/* error string */
PyObject* error;
} UserData;
#endif

View file

@ -1,54 +0,0 @@
/*
* linux/lib/string.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
#include <string.h>
#if !defined(HAVE_STRLCPY)
/**
 * strlcpy - Copy a %NUL terminated string into a sized buffer
 * @dst: Where to copy the string to
 * @src: Where to copy the string from
 * @count: size of destination buffer
 *
 * Compatible with *BSD: the result is always a valid
 * NUL-terminated string that fits in the buffer (unless,
 * of course, the buffer size is zero). It does not pad
 * out the result like strncpy() does.
 *
 * Returns the length of @src.
 */
size_t strlcpy (char *dst, const char *src, size_t count)
{
    const size_t srclen = strlen(src);
    if (count != 0) {
        /* copy as much as fits while leaving room for the terminator */
        size_t ncopy = srclen;
        if (ncopy >= count) {
            ncopy = count - 1;
        }
        memcpy(dst, src, ncopy);
        dst[ncopy] = '\0';
    }
    return srclen;
}
#endif /* !HAVE_STRLCPY */
#if !defined(HAVE_STRLCAT)
/**
 * strlcat - Append a length-limited, %NUL-terminated string to another
 * @dest: The string to be appended to
 * @src: The string to append to it
 * @count: The size of the destination buffer.
 *
 * Returns the length of the string it tried to create. If @dest holds no
 * NUL within its first @count bytes, nothing is written and
 * @count + strlen(@src) is returned (the *BSD behaviour), instead of
 * running past the end of the buffer.
 */
size_t strlcat (char *dest, const char *src, size_t count)
{
    size_t len = strlen(src);
    /* look for the terminator only inside the buffer */
    const char *end = (count != 0) ? memchr(dest, '\0', count) : NULL;
    size_t dsize;
    size_t room;
    if (end == NULL) {
        /* dest already fills the whole buffer: no room to append */
        return count + len;
    }
    dsize = (size_t)(end - dest);
    /* room includes the byte for the terminator, so it is at least 1 */
    room = count - dsize;
    if (len >= room) {
        len = room - 1;
    }
    memcpy(dest + dsize, src, len);
    dest[dsize + len] = '\0';
    return dsize + strlen(src);
}
#endif /* !HAVE_STRLCAT */

View file

@ -1,14 +0,0 @@
/*
* linux/lib/string.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
#ifndef STRLCPY_H
#define STRLCPY_H
/* size_t is used by the prototypes below; include its definition so the
   header can be included on its own */
#include <stddef.h>
/* Fallback prototypes for platforms without strlcpy()/strlcat(); each one
   is skipped when the matching HAVE_ macro is defined. */
#if !defined(HAVE_STRLCPY)
size_t strlcpy(char *dst, const char *src, size_t size);
#endif /* !HAVE_STRLCPY */
#if !defined(HAVE_STRLCAT)
size_t strlcat(char *dst, const char *src, size_t size);
#endif /* !HAVE_STRLCAT */
#endif /* STRLCPY_H */