git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1296 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2004-04-04 08:30:48 +00:00
parent 672e118d9b
commit 93253954a8
4 changed files with 1744 additions and 1756 deletions

File diff suppressed because it is too large Load diff

View file

@ -20,19 +20,16 @@ __version__ = "$Revision$"[11:-2]
__date__ = "$Date$"[7:-2]
import sys
try:
import htmlsax
except ImportError, msg:
exctype, value = sys.exc_info()[:2]
print >>sys.stderr, "Could not import the parser module `htmlsax':", value
print >>sys.stderr, "Please check your installation of LinkChecker."
sys.exit(1)
class HtmlPrinter (object):
"""handles all functions by printing the function name and attributes"""
def __init__ (self, fd=sys.stdout):
self.fd = fd
def _print (self, *attrs):
print self.mem, attrs
print >> self.fd, self.mem, attrs
def _errorfun (self, msg, name):
@ -61,43 +58,46 @@ class HtmlPrinter (object):
return self._print
class HtmlPrettyPrinter (object):
    """Handler that writes every parser event back out as markup.

    Each callback formats its event and writes the result to the
    file-like object supplied at construction time.
    """

    def __init__ (self, fd=sys.stdout):
        """Remember fd, the writable object that receives all output."""
        self.fd = fd

    def comment (self, data):
        """Write data enclosed in comment delimiters."""
        markup = "<!--%s-->" % data
        self.fd.write(markup)

    def startElement (self, tag, attrs):
        """Write a start tag.

        Every "/" is removed from tag. Attributes whose value is None
        are written as a bare name; all others are written as
        name="value" with the value passed through quote_attrval.
        """
        self.fd.write("<%s" % tag.replace("/", ""))
        for name, value in attrs.iteritems():
            if value is not None:
                self.fd.write(" %s=\"%s\"" % (name, quote_attrval(value)))
            else:
                self.fd.write(" %s" % name)
        self.fd.write(">")

    def endElement (self, tag):
        """Write the end tag for tag."""
        markup = "</%s>" % tag
        self.fd.write(markup)

    def doctype (self, data):
        """Write data as a DOCTYPE declaration."""
        markup = "<!DOCTYPE%s>" % data
        self.fd.write(markup)

    def pi (self, data):
        """Write data as a processing instruction."""
        markup = "<?%s?>" % data
        self.fd.write(markup)

    def cdata (self, data):
        """Write data as a CDATA section."""
        markup = "<![CDATA[%s]]>" % data
        self.fd.write(markup)

    def characters (self, data):
        """Write data unchanged."""
        self.fd.write(data)
def quote_attrval (val):
    """Return val with every double quote replaced by &quot; so the
    result can be wrapped in double quotes as an attribute value."""
    pieces = val.split('"')
    return "&quot;".join(pieces)
def _test():
    """Feed a fixed series of markup snippets, one feed() call each, to a
    parser driven by an HtmlPrinter handler, then flush the parser."""
    snippets = (
        "<hTml>",
        "<a href>",
        "<a href=''>",
        '<a href="">',
        "<a href='a'>",
        '<a href="a">',
        "<a href=a>",
        "<a href='\"'>",
        "<a href=\"'\">",
        "<a href=' '>",
        "<a href=a href=b>",
        "<a/>",
        "<a href/>",
        "<a href=a />",
        "</a>",
        "<?bla foo?>",
        "<?bla?>",
        "<!-- - comment -->",
        "<!---->",
        "<!DOCTYPE \"vla foo>",
    )
    parser = htmlsax.parser(HtmlPrinter())
    for snippet in snippets:
        parser.feed(snippet)
    parser.flush()
def _broken ():
    """Parse a single <base> tag with parser debugging switched on."""
    markup = """<base href="http://www.msnbc.com/news/">"""
    parser = htmlsax.parser(HtmlPrinter())
    # turn on debugging
    parser.debug(1)
    parser.feed(markup)
    parser.flush()
if __name__ == '__main__':
    # Script entry point: only the debugging case runs; the call to the
    # larger snippet series stays commented out.
    #_test()
    _broken()

View file

@ -131,6 +131,7 @@ static int yyerror (char* msg) {
/* parser.resolve_entities */
static PyObject* resolve_entities;
static PyObject* sorted_dict;
/* macros for easier scanner state manipulation */
@ -229,7 +230,7 @@ typedef int YYSTYPE;
/* Line 214 of yacc.c. */
#line 233 "htmlparse.c"
#line 234 "htmlparse.c"
#if ! defined (yyoverflow) || YYERROR_VERBOSE
@ -399,8 +400,8 @@ static const yysigned_char yyrhs[] =
/* YYRLINE[YYN] -- source line where rule number YYN was defined. */
static const unsigned short yyrline[] =
{
0, 143, 143, 144, 147, 148, 155, 190, 237, 268,
289, 310, 331, 352, 374, 396
0, 144, 144, 145, 148, 149, 156, 191, 238, 269,
290, 311, 332, 353, 375, 397
};
#endif
@ -1105,22 +1106,22 @@ yyreduce:
switch (yyn)
{
case 2:
#line 143 "htmlparse.y"
{;}
break;
case 3:
#line 144 "htmlparse.y"
{;}
break;
case 3:
#line 145 "htmlparse.y"
{;}
break;
case 4:
#line 147 "htmlparse.y"
#line 148 "htmlparse.y"
{ YYACCEPT; /* wait for more lexer input */ ;}
break;
case 5:
#line 149 "htmlparse.y"
#line 150 "htmlparse.y"
{
/* an error occurred in the scanner, the python exception must be set */
UserData* ud = yyget_extra(scanner);
@ -1130,7 +1131,7 @@ yyreduce:
break;
case 6:
#line 156 "htmlparse.y"
#line 157 "htmlparse.y"
{
/* $1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyString, <attrs> is a PyDict */
@ -1168,7 +1169,7 @@ finish_start:
break;
case 7:
#line 191 "htmlparse.y"
#line 192 "htmlparse.y"
{
/* $1 is a PyTuple (<tag>, <attrs>)
<tag> is a PyString, <attrs> is a PyDict */
@ -1218,7 +1219,7 @@ finish_start_end:
break;
case 8:
#line 238 "htmlparse.y"
#line 239 "htmlparse.y"
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
@ -1252,7 +1253,7 @@ finish_end:
break;
case 9:
#line 269 "htmlparse.y"
#line 270 "htmlparse.y"
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
@ -1276,7 +1277,7 @@ finish_comment:
break;
case 10:
#line 290 "htmlparse.y"
#line 291 "htmlparse.y"
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
@ -1300,7 +1301,7 @@ finish_pi:
break;
case 11:
#line 311 "htmlparse.y"
#line 312 "htmlparse.y"
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
@ -1324,7 +1325,7 @@ finish_cdata:
break;
case 12:
#line 332 "htmlparse.y"
#line 333 "htmlparse.y"
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
@ -1348,7 +1349,7 @@ finish_doctype:
break;
case 13:
#line 353 "htmlparse.y"
#line 354 "htmlparse.y"
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
@ -1373,7 +1374,7 @@ finish_script:
break;
case 14:
#line 375 "htmlparse.y"
#line 376 "htmlparse.y"
{
/* $1 is a PyString */
UserData* ud = yyget_extra(scanner);
@ -1398,7 +1399,7 @@ finish_style:
break;
case 15:
#line 397 "htmlparse.y"
#line 398 "htmlparse.y"
{
/* $1 is a PyString */
/* Remember this is also called as a lexer error fallback */
@ -1426,7 +1427,7 @@ finish_characters:
}
/* Line 999 of yacc.c. */
#line 1430 "htmlparse.c"
#line 1431 "htmlparse.c"
yyvsp -= yylen;
yyssp -= yylen;
@ -1620,7 +1621,7 @@ yyreturn:
}
#line 420 "htmlparse.y"
#line 421 "htmlparse.y"
/* disable python memory interface */
@ -1635,7 +1636,8 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
{
return NULL;
}
self->handler = NULL;
Py_INCREF(Py_None);
self->handler = Py_None;
/* reset userData */
self->userData = PyMem_New(UserData, sizeof(UserData));
if (self->userData == NULL)
@ -1643,7 +1645,7 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
Py_DECREF(self);
return NULL;
}
self->userData->handler = NULL;
self->userData->handler = self->handler;
self->userData->buf = NULL;
CLEAR_BUF_DECREF(self, self->userData->buf);
self->userData->nextpos = 0;
@ -1659,6 +1661,7 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
self->userData->tmp_attrval = self->userData->tmp_attrs =
self->userData->lexbuf = NULL;
self->userData->resolve_entities = resolve_entities;
self->userData->sorted_dict = sorted_dict;
self->userData->exc_type = NULL;
self->userData->exc_val = NULL;
self->userData->exc_tb = NULL;
@ -1675,10 +1678,15 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
/* initialize parser object */
static int parser_init (parser_object* self, PyObject* args, PyObject* kwds) {
PyObject* handler;
if (!PyArg_ParseTuple(args, "O", &handler)) {
return -1;
PyObject* handler = NULL;
static char *kwlist[] = {"handler", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwlist, &handler)) {
return -1;
}
if (handler==NULL) {
return 0;
}
Py_DECREF(self->handler);
Py_INCREF(handler);
self->handler = handler;
self->userData->handler = self->handler;
@ -1688,7 +1696,7 @@ static int parser_init (parser_object* self, PyObject* args, PyObject* kwds) {
/* traverse all used subobjects participating in reference cycles */
static int parser_traverse (parser_object* self, visitproc visit, void* arg) {
if (self->handler && visit(self->handler, arg) < 0) {
if (visit(self->handler, arg) < 0) {
return -1;
}
return 0;
@ -1764,7 +1772,7 @@ static PyObject* parser_flush (parser_object* self, PyObject* args) {
self->userData->tmp_attrval = self->userData->tmp_attrname = NULL;
self->userData->bufpos = 0;
if (strlen(self->userData->buf)) {
// XXX set line, col
/* XXX set line, col */
int error = 0;
PyObject* s = PyString_FromString(self->userData->buf);
PyObject* callback = NULL;
@ -1892,11 +1900,32 @@ static PyObject* parser_debug (parser_object* self, PyObject* args) {
}
/* Getter for the "handler" attribute: returns a new reference to the
   handler object currently stored on the parser. The closure argument
   is not used. */
static PyObject* parser_gethandler (parser_object* self, void* closure) {
    PyObject* current = self->handler;
    Py_INCREF(current);
    return current;
}
/* Setter for the "handler" attribute.
   A NULL value (attribute deletion) is rejected: a TypeError is set and
   -1 is returned. Otherwise a new reference to value is stored on both
   the parser object and its user data, the previous handler reference
   is released, and 0 is returned. The closure argument is not used. */
static int parser_sethandler (parser_object* self, PyObject* value, void* closure) {
    PyObject* old_handler;
    if (value == NULL) {
        PyErr_SetString(PyExc_TypeError, "Cannot delete parser handler");
        return -1;
    }
    /* Install the new handler before releasing the old one: dropping the
       last reference can run arbitrary code, and that code must never
       find a pointer to an already released object in the parser. */
    old_handler = self->handler;
    Py_INCREF(value);
    self->handler = value;
    self->userData->handler = value;
    Py_DECREF(old_handler);
    return 0;
}
/* type interface */
/* Member table for the parser type: one entry describing the "handler"
   slot of parser_object, followed by the terminating sentinel entry. */
static PyMemberDef parser_members[] = {
{"handler", T_OBJECT_EX, offsetof(parser_object, handler), 0,
"handler class"},
{NULL} /* Sentinel */
};
/* Get/set table for the parser type: the "handler" attribute is read
   through parser_gethandler and written through parser_sethandler;
   the table ends with the terminating sentinel entry. */
static PyGetSetDef parser_getset[] = {
{"handler", (getter)parser_gethandler, (setter)parser_sethandler,
"handler object", NULL},
{NULL} /* Sentinel */
};
@ -1947,7 +1976,7 @@ static PyTypeObject parser_type = {
0, /* tp_iternext */
parser_methods, /* tp_methods */
parser_members, /* tp_members */
0, /* tp_getset */
parser_getset, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
@ -1960,7 +1989,7 @@ static PyTypeObject parser_type = {
/* python module interface
"Create a new HTML parser object with given handler.\n"
"Create a new HTML parser object with handler (which may be None).\n"
"\n"
"Used callbacks (they don't have to be defined) of a handler are:\n"
"comment(data): <!--data-->\n"
@ -2000,11 +2029,14 @@ PyMODINIT_FUNC inithtmlsax (void) {
/* init error */
PyErr_Print();
}
if ((m = PyImport_ImportModule("linkcheck.parser"))==NULL) {
if ((m = PyImport_ImportModule("wc.parser"))==NULL) {
return;
}
if ((resolve_entities = PyObject_GetAttrString(m, "resolve_entities"))==NULL) {
return;
}
if ((sorted_dict = PyObject_GetAttrString(m, "SortedDict"))==NULL) {
return;
}
}

View file

@ -66,10 +66,12 @@ typedef struct {
PyObject* tmp_attrname;
/* temporary HTML start tag attribute value */
PyObject* tmp_attrval;
/* temporary HTML start tag attribute list */
/* temporary HTML start tag attribute list (a SortedDict) */
PyObject* tmp_attrs;
/* parser.resolve_entities */
PyObject* resolve_entities;
/* parser.SortedDict */
PyObject* sorted_dict;
/* stored Python exception (if error occurred in scanner) */
PyObject* exc_type;
PyObject* exc_val;