mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-17 02:51:07 +00:00
updated
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1296 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
672e118d9b
commit
93253954a8
4 changed files with 1744 additions and 1756 deletions
File diff suppressed because it is too large
Load diff
|
|
@ -20,19 +20,16 @@ __version__ = "$Revision$"[11:-2]
|
|||
__date__ = "$Date$"[7:-2]
|
||||
|
||||
import sys
|
||||
try:
|
||||
import htmlsax
|
||||
except ImportError, msg:
|
||||
exctype, value = sys.exc_info()[:2]
|
||||
print >>sys.stderr, "Could not import the parser module `htmlsax':", value
|
||||
print >>sys.stderr, "Please check your installation of LinkChecker."
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
class HtmlPrinter (object):
|
||||
"""handles all functions by printing the function name and attributes"""
|
||||
def __init__ (self, fd=sys.stdout):
|
||||
self.fd = fd
|
||||
|
||||
|
||||
def _print (self, *attrs):
|
||||
print self.mem, attrs
|
||||
print >> self.fd, self.mem, attrs
|
||||
|
||||
|
||||
def _errorfun (self, msg, name):
|
||||
|
|
@ -61,43 +58,46 @@ class HtmlPrinter (object):
|
|||
return self._print
|
||||
|
||||
|
||||
class HtmlPrettyPrinter (object):
    """Handler that writes every parser callback back out as HTML markup."""

    def __init__ (self, fd=sys.stdout):
        """Store fd, the file-like object all markup is written to."""
        self.fd = fd

    def comment (self, data):
        """Write data wrapped in HTML comment delimiters."""
        markup = "<!--%s-->" % data
        self.fd.write(markup)

    def startElement (self, tag, attrs):
        """Write a start tag followed by its attributes.

        Slashes are stripped from the tag name. An attribute whose value
        is None is written as a bare name; any other value is passed
        through quote_attrval() and wrapped in double quotes.
        """
        emit = self.fd.write
        emit("<%s"%tag.replace("/", ""))
        for key, val in attrs.iteritems():
            if val is not None:
                emit(" %s=\"%s\"" % (key, quote_attrval(val)))
            else:
                emit(" %s"%key)
        emit(">")

    def endElement (self, tag):
        """Write the end tag for tag."""
        markup = "</%s>" % tag
        self.fd.write(markup)

    def doctype (self, data):
        """Write data as a DOCTYPE declaration."""
        markup = "<!DOCTYPE%s>" % data
        self.fd.write(markup)

    def pi (self, data):
        """Write data as a processing instruction."""
        markup = "<?%s?>" % data
        self.fd.write(markup)

    def cdata (self, data):
        """Write data as a CDATA section."""
        markup = "<![CDATA[%s]]>"%data
        self.fd.write(markup)

    def characters (self, data):
        """Write character data unchanged."""
        self.fd.write(data)
|
||||
|
||||
|
||||
def quote_attrval (val):
    """Quote an HTML attribute value so it can be wrapped in double quotes.

    Every double quote character in val is replaced by the &quot;
    character entity; all other characters are returned unchanged.
    """
    # Replacing '"' with itself is a no-op and would let an embedded
    # quote terminate the surrounding attribute early.
    return val.replace('"', '&quot;')
|
||||
|
||||
|
||||
def _test():
    """Feed a fixed series of markup snippets to a parser, then flush it.

    The parser is created with an HtmlPrinter handler; the snippets are
    fed one by one in the order listed below.
    """
    snippets = (
        "<hTml>",
        "<a href>",
        "<a href=''>",
        '<a href="">',
        "<a href='a'>",
        '<a href="a">',
        "<a href=a>",
        "<a href='\"'>",
        "<a href=\"'\">",
        "<a href=' '>",
        "<a href=a href=b>",
        "<a/>",
        "<a href/>",
        "<a href=a />",
        "</a>",
        "<?bla foo?>",
        "<?bla?>",
        "<!-- - comment -->",
        "<!---->",
        "<!DOCTYPE \"vla foo>",
    )
    parser = htmlsax.parser(HtmlPrinter())
    for snippet in snippets:
        parser.feed(snippet)
    parser.flush()
|
||||
|
||||
def _broken ():
    """Feed a single <base> tag to a parser with debugging switched on."""
    markup = """<base href="http://www.msnbc.com/news/">"""
    parser = htmlsax.parser(HtmlPrinter())
    # turn on debugging
    parser.debug(1)
    parser.feed(markup)
    parser.flush()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
#_test()
|
||||
_broken()
|
||||
|
|
|
|||
|
|
@ -131,6 +131,7 @@ static int yyerror (char* msg) {
|
|||
|
||||
/* parser.resolve_entities */
|
||||
static PyObject* resolve_entities;
|
||||
static PyObject* sorted_dict;
|
||||
|
||||
/* macros for easier scanner state manipulation */
|
||||
|
||||
|
|
@ -229,7 +230,7 @@ typedef int YYSTYPE;
|
|||
|
||||
|
||||
/* Line 214 of yacc.c. */
|
||||
#line 233 "htmlparse.c"
|
||||
#line 234 "htmlparse.c"
|
||||
|
||||
#if ! defined (yyoverflow) || YYERROR_VERBOSE
|
||||
|
||||
|
|
@ -399,8 +400,8 @@ static const yysigned_char yyrhs[] =
|
|||
/* YYRLINE[YYN] -- source line where rule number YYN was defined. */
|
||||
static const unsigned short yyrline[] =
|
||||
{
|
||||
0, 143, 143, 144, 147, 148, 155, 190, 237, 268,
|
||||
289, 310, 331, 352, 374, 396
|
||||
0, 144, 144, 145, 148, 149, 156, 191, 238, 269,
|
||||
290, 311, 332, 353, 375, 397
|
||||
};
|
||||
#endif
|
||||
|
||||
|
|
@ -1105,22 +1106,22 @@ yyreduce:
|
|||
switch (yyn)
|
||||
{
|
||||
case 2:
|
||||
#line 143 "htmlparse.y"
|
||||
{;}
|
||||
break;
|
||||
|
||||
case 3:
|
||||
#line 144 "htmlparse.y"
|
||||
{;}
|
||||
break;
|
||||
|
||||
case 3:
|
||||
#line 145 "htmlparse.y"
|
||||
{;}
|
||||
break;
|
||||
|
||||
case 4:
|
||||
#line 147 "htmlparse.y"
|
||||
#line 148 "htmlparse.y"
|
||||
{ YYACCEPT; /* wait for more lexer input */ ;}
|
||||
break;
|
||||
|
||||
case 5:
|
||||
#line 149 "htmlparse.y"
|
||||
#line 150 "htmlparse.y"
|
||||
{
|
||||
/* an error occurred in the scanner, the python exception must be set */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
|
|
@ -1130,7 +1131,7 @@ yyreduce:
|
|||
break;
|
||||
|
||||
case 6:
|
||||
#line 156 "htmlparse.y"
|
||||
#line 157 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyTuple (<tag>, <attrs>)
|
||||
<tag> is a PyString, <attrs> is a PyDict */
|
||||
|
|
@ -1168,7 +1169,7 @@ finish_start:
|
|||
break;
|
||||
|
||||
case 7:
|
||||
#line 191 "htmlparse.y"
|
||||
#line 192 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyTuple (<tag>, <attrs>)
|
||||
<tag> is a PyString, <attrs> is a PyDict */
|
||||
|
|
@ -1218,7 +1219,7 @@ finish_start_end:
|
|||
break;
|
||||
|
||||
case 8:
|
||||
#line 238 "htmlparse.y"
|
||||
#line 239 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
|
|
@ -1252,7 +1253,7 @@ finish_end:
|
|||
break;
|
||||
|
||||
case 9:
|
||||
#line 269 "htmlparse.y"
|
||||
#line 270 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
|
|
@ -1276,7 +1277,7 @@ finish_comment:
|
|||
break;
|
||||
|
||||
case 10:
|
||||
#line 290 "htmlparse.y"
|
||||
#line 291 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
|
|
@ -1300,7 +1301,7 @@ finish_pi:
|
|||
break;
|
||||
|
||||
case 11:
|
||||
#line 311 "htmlparse.y"
|
||||
#line 312 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
|
|
@ -1324,7 +1325,7 @@ finish_cdata:
|
|||
break;
|
||||
|
||||
case 12:
|
||||
#line 332 "htmlparse.y"
|
||||
#line 333 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
|
|
@ -1348,7 +1349,7 @@ finish_doctype:
|
|||
break;
|
||||
|
||||
case 13:
|
||||
#line 353 "htmlparse.y"
|
||||
#line 354 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
|
|
@ -1373,7 +1374,7 @@ finish_script:
|
|||
break;
|
||||
|
||||
case 14:
|
||||
#line 375 "htmlparse.y"
|
||||
#line 376 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
|
|
@ -1398,7 +1399,7 @@ finish_style:
|
|||
break;
|
||||
|
||||
case 15:
|
||||
#line 397 "htmlparse.y"
|
||||
#line 398 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
/* Remember this is also called as a lexer error fallback */
|
||||
|
|
@ -1426,7 +1427,7 @@ finish_characters:
|
|||
}
|
||||
|
||||
/* Line 999 of yacc.c. */
|
||||
#line 1430 "htmlparse.c"
|
||||
#line 1431 "htmlparse.c"
|
||||
|
||||
yyvsp -= yylen;
|
||||
yyssp -= yylen;
|
||||
|
|
@ -1620,7 +1621,7 @@ yyreturn:
|
|||
}
|
||||
|
||||
|
||||
#line 420 "htmlparse.y"
|
||||
#line 421 "htmlparse.y"
|
||||
|
||||
|
||||
/* disable python memory interface */
|
||||
|
|
@ -1635,7 +1636,8 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
|
|||
{
|
||||
return NULL;
|
||||
}
|
||||
self->handler = NULL;
|
||||
Py_INCREF(Py_None);
|
||||
self->handler = Py_None;
|
||||
/* reset userData */
|
||||
self->userData = PyMem_New(UserData, sizeof(UserData));
|
||||
if (self->userData == NULL)
|
||||
|
|
@ -1643,7 +1645,7 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
|
|||
Py_DECREF(self);
|
||||
return NULL;
|
||||
}
|
||||
self->userData->handler = NULL;
|
||||
self->userData->handler = self->handler;
|
||||
self->userData->buf = NULL;
|
||||
CLEAR_BUF_DECREF(self, self->userData->buf);
|
||||
self->userData->nextpos = 0;
|
||||
|
|
@ -1659,6 +1661,7 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
|
|||
self->userData->tmp_attrval = self->userData->tmp_attrs =
|
||||
self->userData->lexbuf = NULL;
|
||||
self->userData->resolve_entities = resolve_entities;
|
||||
self->userData->sorted_dict = sorted_dict;
|
||||
self->userData->exc_type = NULL;
|
||||
self->userData->exc_val = NULL;
|
||||
self->userData->exc_tb = NULL;
|
||||
|
|
@ -1675,10 +1678,15 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
|
|||
|
||||
/* initialize parser object */
|
||||
static int parser_init (parser_object* self, PyObject* args, PyObject* kwds) {
|
||||
PyObject* handler;
|
||||
if (!PyArg_ParseTuple(args, "O", &handler)) {
|
||||
return -1;
|
||||
PyObject* handler = NULL;
|
||||
static char *kwlist[] = {"handler", NULL};
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwlist, &handler)) {
|
||||
return -1;
|
||||
}
|
||||
if (handler==NULL) {
|
||||
return 0;
|
||||
}
|
||||
Py_DECREF(self->handler);
|
||||
Py_INCREF(handler);
|
||||
self->handler = handler;
|
||||
self->userData->handler = self->handler;
|
||||
|
|
@ -1688,7 +1696,7 @@ static int parser_init (parser_object* self, PyObject* args, PyObject* kwds) {
|
|||
|
||||
/* traverse all used subobjects participating in reference cycles */
|
||||
static int parser_traverse (parser_object* self, visitproc visit, void* arg) {
|
||||
if (self->handler && visit(self->handler, arg) < 0) {
|
||||
if (visit(self->handler, arg) < 0) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
|
|
@ -1764,7 +1772,7 @@ static PyObject* parser_flush (parser_object* self, PyObject* args) {
|
|||
self->userData->tmp_attrval = self->userData->tmp_attrname = NULL;
|
||||
self->userData->bufpos = 0;
|
||||
if (strlen(self->userData->buf)) {
|
||||
// XXX set line, col
|
||||
/* XXX set line, col */
|
||||
int error = 0;
|
||||
PyObject* s = PyString_FromString(self->userData->buf);
|
||||
PyObject* callback = NULL;
|
||||
|
|
@ -1892,11 +1900,32 @@ static PyObject* parser_debug (parser_object* self, PyObject* args) {
|
|||
}
|
||||
|
||||
|
||||
/* Getter for the "handler" attribute: return a new reference to the
   handler object currently stored in the parser. closure is unused. */
static PyObject* parser_gethandler (parser_object* self, void* closure) {
    PyObject* current = self->handler;
    Py_INCREF(current);
    return current;
}
|
||||
|
||||
/* Setter for the "handler" attribute: make value the parser's handler.
 *
 * value is NULL when the attribute is being deleted; that is rejected
 * with a TypeError and -1 is returned. Otherwise a new reference to
 * value is stored in self->handler, the same pointer is mirrored into
 * self->userData->handler, and 0 is returned. closure is unused.
 */
static int parser_sethandler (parser_object* self, PyObject* value, void* closure) {
    PyObject* old_handler;
    if (value == NULL) {
        PyErr_SetString(PyExc_TypeError, "Cannot delete parser handler");
        return -1;
    }
    /* Take the new reference and update both pointers before the old
       reference is dropped: releasing the old handler may run arbitrary
       Python code (e.g. a destructor), which must never see a handler
       pointer whose reference has already been given up. */
    Py_INCREF(value);
    old_handler = self->handler;
    self->handler = value;
    self->userData->handler = value;
    /* XDECREF so a handler slot that was never filled is tolerated */
    Py_XDECREF(old_handler);
    return 0;
}
|
||||
|
||||
/* type interface */
|
||||
|
||||
static PyMemberDef parser_members[] = {
|
||||
{"handler", T_OBJECT_EX, offsetof(parser_object, handler), 0,
|
||||
"handler class"},
|
||||
{NULL} /* Sentinel */
|
||||
};
|
||||
|
||||
static PyGetSetDef parser_getset[] = {
|
||||
{"handler", (getter)parser_gethandler, (setter)parser_sethandler,
|
||||
"handler object", NULL},
|
||||
{NULL} /* Sentinel */
|
||||
};
|
||||
|
||||
|
|
@ -1947,7 +1976,7 @@ static PyTypeObject parser_type = {
|
|||
0, /* tp_iternext */
|
||||
parser_methods, /* tp_methods */
|
||||
parser_members, /* tp_members */
|
||||
0, /* tp_getset */
|
||||
parser_getset, /* tp_getset */
|
||||
0, /* tp_base */
|
||||
0, /* tp_dict */
|
||||
0, /* tp_descr_get */
|
||||
|
|
@ -1960,7 +1989,7 @@ static PyTypeObject parser_type = {
|
|||
|
||||
|
||||
/* python module interface
|
||||
"Create a new HTML parser object with given handler.\n"
|
||||
"Create a new HTML parser object with handler (which may be None).\n"
|
||||
"\n"
|
||||
"Used callbacks (they don't have to be defined) of a handler are:\n"
|
||||
"comment(data): <!--data-->\n"
|
||||
|
|
@ -2000,11 +2029,14 @@ PyMODINIT_FUNC inithtmlsax (void) {
|
|||
/* init error */
|
||||
PyErr_Print();
|
||||
}
|
||||
if ((m = PyImport_ImportModule("linkcheck.parser"))==NULL) {
|
||||
if ((m = PyImport_ImportModule("wc.parser"))==NULL) {
|
||||
return;
|
||||
}
|
||||
if ((resolve_entities = PyObject_GetAttrString(m, "resolve_entities"))==NULL) {
|
||||
return;
|
||||
}
|
||||
if ((sorted_dict = PyObject_GetAttrString(m, "SortedDict"))==NULL) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -66,10 +66,12 @@ typedef struct {
|
|||
PyObject* tmp_attrname;
|
||||
/* temporary HTML start tag attribute value */
|
||||
PyObject* tmp_attrval;
|
||||
/* temporary HTML start tag attribute list */
|
||||
/* temporary HTML start tag attribute list (a SortedDict) */
|
||||
PyObject* tmp_attrs;
|
||||
/* parser.resolve_entities */
|
||||
PyObject* resolve_entities;
|
||||
/* parser.SortedDict */
|
||||
PyObject* sorted_dict;
|
||||
/* stored Python exception (if error occurred in scanner) */
|
||||
PyObject* exc_type;
|
||||
PyObject* exc_val;
|
||||
|
|
|
|||
Loading…
Reference in a new issue