XHTML support

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2108 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2004-12-08 09:09:06 +00:00
parent 6cec856dba
commit 4fbdbe3a51
7 changed files with 186 additions and 53 deletions

View file

@ -1,3 +1,9 @@
2.1 "" (released xx.xx.xxxx)
* Added XHTML support to the HTML parser.
Type: feature
Changed: linkcheck/HtmlParser/*
2.0 "I Kina spiser de hunde" (released 7.12.2004)
* Regenerate the HTML parser with new Bison version 1.875d.

View file

@ -7,7 +7,7 @@ PYTHON = python$(PYVER)
all: htmllex.c htmlparse.c
%.o: %.c
gcc -g -O3 -Wall -pedantic -Wstrict-prototypes -fPIC -I. -I/usr/include/$(PYTHON) -c $< -o $@
gcc -g -std=c99 -O3 -Wall -pedantic -Wstrict-prototypes -fPIC -I. -I/usr/include/$(PYTHON) -c $< -o $@
htmlparse.h htmlparse.c: htmlparse.y htmlsax.h
bison htmlparse.y

View file

@ -66,6 +66,7 @@
"""
import re
import codecs
import htmlentitydefs
@ -126,3 +127,28 @@ def strip_quotes (s):
return s[1:-1]
return s
_encoding_ro = re.compile(r"charset=(?P<encoding>[-0-9a-zA-Z]+)")
def set_encoding (self, tag, attrs):
    """Set document encoding for given parser. Tag must be a meta tag.

    Only a meta element whose http-equiv attribute equals "content-type"
    (compared case-insensitively) is considered. If its content attribute
    carries a "charset=NAME" part and NAME is a codec known to the codecs
    module, the ASCII-encoded NAME is stored in self.encoding. Every other
    input leaves self.encoding untouched.
    """
    if tag != u'meta':
        return
    # An attribute may be present with a None value instead of a string
    # (NOTE(review): presumably attributes written without a value; confirm
    # against the scanner). Treat that like a missing attribute instead of
    # failing on None.lower() or on a regex search over None.
    http_equiv = attrs.get('http-equiv') or u''
    if http_equiv.lower() != u"content-type":
        return
    content = attrs.get('content') or u''
    mo = _encoding_ro.search(content)
    if mo is None:
        return
    # The pattern only admits ASCII letters, digits and '-', so a single
    # ASCII conversion is sufficient and cannot fail.
    encoding = mo.group("encoding").encode("ascii")
    try:
        codecs.lookup(encoding)
    except LookupError:
        # ignore unknown encodings
        return
    self.encoding = encoding
def set_doctype (self, doctype):
    """Mark the given parser as XHTML if the doctype declaration says so.

    self.doctype is set to "XHTML" when the text u"XHTML" occurs in
    doctype; otherwise the parser's current doctype is left unchanged.
    """
    if u"XHTML" not in doctype:
        return
    self.doctype = "XHTML"

View file

@ -38,8 +38,15 @@
/* make python unicode string from tmp_buf and assign it to a */
#define PYSTRING_TMP(a) { \
const char* enc = PyString_AsString(yyextra->encoding); \
(a) = PyUnicode_Decode(yyextra->tmp_buf, strlen(yyextra->tmp_buf), enc, "ignore"); \
PyObject* pencoding; \
char* encoding; \
pencoding = PyObject_GetAttrString(yyextra->parser, "encoding"); \
if (pencoding==NULL) return T_ERROR; \
encoding = PyString_AsString(pencoding); \
if (encoding==NULL) { Py_DECREF(pencoding); return T_ERROR; } \
(a) = PyUnicode_Decode(yyextra->tmp_buf, strlen(yyextra->tmp_buf), \
encoding, "ignore"); \
Py_DECREF(pencoding); \
if ((a)==NULL) return T_ERROR; \
}
@ -655,6 +662,14 @@ RX_DATA [-a-zA-Z0-9_:]+
RETURN(T_ELEMENT_START_END);
}
<S_ATTR1,S_ATTR2,S_ATTR3>\/[^>] {
UPDATE_LINE;
}
<S_ATTR1,S_ATTR2,S_ATTR3>\/ {
return T_WAIT;
}
<S_ATTR1,S_ATTR2,S_ATTR3>> {
UPDATE_COLUMN;
FLUSH_ATTRS;
@ -1001,13 +1016,19 @@ int htmllexStart (void* scanner, UserData* data, const char* s, int slen) {
we must replace NUL with ' '. */
int len = strlen(data->buf);
int i;
RESIZE_BUF(data->buf, len+slen + 1);
for (i=0; i<slen; i++) {
data->buf[len+i] = (s[i]==0 ? ' ' : s[i]);
RESIZE_BUF(data->buf, len + slen + 1);
for (i=0; i < slen; i++) {
data->buf[len+i] = (s[i]=='\0' ? ' ' : s[i]);
}
data->buf[len+slen] = '\0';
if (yyget_debug(scanner)) {
fprintf(stderr, "SCANBUF %d `%s'\n", data->bufpos, data->buf);
}
data->buf[len+slen] = 0;
if (len > data->bufpos) {
int rewind = len - data->bufpos;
if (yyget_debug(scanner)) {
fprintf(stderr, "REWIND %d\n", rewind);
}
slen += rewind;
len -= rewind;
}

View file

@ -67,6 +67,13 @@ class HtmlPrettyPrinter (object):
def start_element (self, tag, attrs):
"""Print a start element; the tag is closed with ">"."""
self._start_element(tag, attrs, ">")
def start_end_element (self, tag, attrs):
"""Print a combined start-end element; the tag is closed with "/>"."""
self._start_element(tag, attrs, "/>")
def _start_element (self, tag, attrs, end):
tag = tag.encode(self.encoding, "ignore")
self.fd.write("<%s" % tag.replace("/", ""))
for key, val in attrs.iteritems():
@ -76,7 +83,7 @@ class HtmlPrettyPrinter (object):
else:
val = val.encode(self.encoding, "ignore")
self.fd.write(" %s=\"%s\"" % (key, quote_attrval(val)))
self.fd.write(">")
self.fd.write(end)
def end_element (self, tag):
"""print end element"""

View file

@ -44,25 +44,15 @@ static int yyerror (char* msg) {
/* parser.resolve_entities */
static PyObject* resolve_entities;
/* ListDict class, sorted dictionary */
static PyObject* list_dict;
/* set_encoding helper function */
static PyObject* set_encoding;
/* set_doctype helper function */
static PyObject* set_doctype;
/* macros for easier scanner state manipulation */
/* test whether tag does not need an HTML end tag */
#define NO_HTML_END_TAG(tag) !(strcmp(tag, "area")==0 || \
strcmp(tag, "base")==0 || \
strcmp(tag, "basefont")==0 || \
strcmp(tag, "br")==0 || \
strcmp(tag, "col")==0 || \
strcmp(tag, "frame")==0 || \
strcmp(tag, "hr")==0 || \
strcmp(tag, "img")==0 || \
strcmp(tag, "input")==0 || \
strcmp(tag, "isindex")==0 || \
strcmp(tag, "link")==0 || \
strcmp(tag, "meta")==0 || \
strcmp(tag, "param")==0)
/* clear buffer b, returning NULL on error */
#define CLEAR_BUF(b) \
b = PyMem_Resize(b, char, 1); \
@ -103,8 +93,12 @@ static PyObject* list_dict;
/* parser type definition: the instance layout of the Python parser object */
typedef struct {
PyObject_HEAD
/* the handler object */
PyObject* handler;
/* the charset encoding (PyStringObject) */
PyObject* encoding;
/* the document type (PyStringObject); initialized to "HTML" in parser_new */
PyObject* doctype;
/* per-parser UserData record; holds buf, tmp_buf and a pointer back to
   this parser object, and is freed in parser_dealloc */
UserData* userData;
/* scanner handle; handed to htmllexDestroy() in parser_dealloc */
void* scanner;
} parser_object;
@ -115,6 +109,37 @@ staticforward PyTypeObject parser_type;
#define YYMALLOC PyMem_Malloc
#define YYFREE PyMem_Free
/* Check a tag name against the HTML elements that take no end tag.
   ptag is converted with PyString_AsString(); parser must provide a
   string-valued "doctype" attribute.
   Return 0 if the parser's doctype is "HTML" and the tag is one of the
   elements listed in no_end_tags, 1 for every other tag and for every
   other doctype (presumably XHTML), and -1 if the doctype attribute
   cannot be fetched or one of the string conversions fails. */
static int html_end_tag (PyObject* ptag, PyObject* parser) {
    static const char* const no_end_tags[] = {
        "area", "base", "basefont", "br", "col", "frame", "hr",
        "img", "input", "isindex", "link", "meta", "param", NULL
    };
    PyObject* pdoctype = PyObject_GetAttrString(parser, "doctype");
    const char* doctype;
    const char* tag;
    int ret = 1;
    int i;

    if (pdoctype == NULL) {
        return -1;
    }
    doctype = PyString_AsString(pdoctype);
    if (doctype == NULL) {
        ret = -1;
        goto done;
    }
    if (strcmp(doctype, "HTML") != 0) {
        /* it is not HTML (presumably XHTML): the result stays 1 */
        goto done;
    }
    tag = PyString_AsString(ptag);
    if (tag == NULL) {
        ret = -1;
        goto done;
    }
    for (i = 0; no_end_tags[i] != NULL; i++) {
        if (strcmp(tag, no_end_tags[i]) == 0) {
            ret = 0;
            break;
        }
    }
done:
    /* every path that obtained the doctype reference releases it here */
    Py_DECREF(pdoctype);
    return ret;
}
%}
/* parser options */
@ -163,7 +188,11 @@ element: T_WAIT { YYACCEPT; /* wait for more lexer input */ }
PyObject* tag = PyTuple_GET_ITEM($1, 0);
PyObject* attrs = PyTuple_GET_ITEM($1, 1);
int error = 0;
if (!tag || !attrs) { error = 1; goto finish_start; }
if (tag==NULL || attrs==NULL) { error = 1; goto finish_start; }
/* set encoding */
result = PyObject_CallFunction(set_encoding, "OOO", ud->parser, tag, attrs);
if (result==NULL) { error=1; goto finish_start; }
Py_DECREF(result); result = NULL;
if (PyObject_HasAttrString(ud->handler, "start_element")==1) {
callback = PyObject_GetAttrString(ud->handler, "start_element");
if (!callback) { error=1; goto finish_start; }
@ -171,7 +200,7 @@ element: T_WAIT { YYACCEPT; /* wait for more lexer input */ }
if (!result) { error=1; goto finish_start; }
Py_DECREF(callback);
Py_DECREF(result);
callback=result=NULL;
callback = result = NULL;
}
CHECK_ERROR(ud, finish_start);
finish_start:
@ -198,29 +227,27 @@ finish_start:
PyObject* tag = PyTuple_GET_ITEM($1, 0);
PyObject* attrs = PyTuple_GET_ITEM($1, 1);
int error = 0;
PyObject* tagname = NULL;
if (!tag || !attrs) { error = 1; goto finish_start_end; }
if (PyObject_HasAttrString(ud->handler, "start_element")==1) {
callback = PyObject_GetAttrString(ud->handler, "start_element");
char* fname;
PyObject* tagname;
if (tag==NULL || attrs==NULL) { error = 1; goto finish_start_end; }
tagname = PyUnicode_AsEncodedString(tag, "ascii", "ignore");
if (tagname==NULL) { error=1; goto finish_start_end; }
/* set encoding */
result = PyObject_CallFunction(set_encoding, "OOO", ud->parser, tag, attrs);
if (result==NULL) { error=1; goto finish_start_end; }
Py_DECREF(result); result = NULL;
if (html_end_tag(tagname, ud->parser))
fname = "start_end_element";
else
fname = "start_element";
if (PyObject_HasAttrString(ud->handler, fname)==1) {
callback = PyObject_GetAttrString(ud->handler, fname);
if (!callback) { error=1; goto finish_start_end; }
result = PyObject_CallFunction(callback, "OO", tag, attrs);
if (!result) { error=1; goto finish_start_end; }
Py_DECREF(callback);
Py_DECREF(result);
callback=result=NULL;
}
/* encode tagname in ASCII, ignoring any unknown chars */
tagname = PyUnicode_AsEncodedString(tag, "ascii", "ignore");
if (tagname==NULL) { error=1; goto finish_start_end; }
if (PyObject_HasAttrString(ud->handler, "end_element")==1 &&
NO_HTML_END_TAG(PyString_AsString(tagname))) {
callback = PyObject_GetAttrString(ud->handler, "end_element");
if (callback==NULL) { error=1; goto finish_start_end; }
result = PyObject_CallFunction(callback, "O", tag);
if (result==NULL) { error=1; goto finish_start_end; }
Py_DECREF(callback);
Py_DECREF(result);
callback=result=NULL;
callback = result = NULL;
}
CHECK_ERROR(ud, finish_start_end);
finish_start_end:
@ -229,7 +256,6 @@ finish_start_end:
Py_XDECREF(callback);
Py_XDECREF(result);
Py_XDECREF(tag);
Py_XDECREF(tagname);
Py_XDECREF(attrs);
Py_DECREF($1);
if (error) {
@ -249,14 +275,14 @@ finish_start_end:
PyObject* tagname = PyUnicode_AsEncodedString($1, "ascii", "ignore");
if (tagname==NULL) { error=1; goto finish_end; }
if (PyObject_HasAttrString(ud->handler, "end_element")==1 &&
NO_HTML_END_TAG(PyString_AsString(tagname))) {
html_end_tag(tagname, ud->parser)) {
callback = PyObject_GetAttrString(ud->handler, "end_element");
if (callback==NULL) { error=1; goto finish_end; }
result = PyObject_CallFunction(callback, "O", $1);
if (result==NULL) { error=1; goto finish_end; }
Py_DECREF(callback);
Py_DECREF(result);
callback=result=NULL;
callback = result = NULL;
}
CHECK_ERROR(ud, finish_end);
finish_end:
@ -342,6 +368,10 @@ finish_cdata:
PyObject* callback = NULL;
PyObject* result = NULL;
int error = 0;
/* set doctype */
result = PyObject_CallFunction(set_doctype, "OO", ud->parser, $1);
if (result==NULL) { error=1; goto finish_doctype; }
Py_DECREF(result); result = NULL;
CALLBACK(ud, "doctype", "O", $1, finish_doctype);
CHECK_ERROR(ud, finish_doctype);
finish_doctype:
@ -480,7 +510,14 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
Py_DECREF(self);
return NULL;
}
self->userData->encoding = self->encoding;
self->doctype = PyString_FromString("HTML");
if (self->doctype == NULL) {
Py_DECREF(self->encoding);
Py_DECREF(self->handler);
Py_DECREF(self);
return NULL;
}
self->userData->parser = (PyObject*)self;
return (PyObject*) self;
}
@ -525,9 +562,11 @@ static int parser_clear (parser_object* self) {
static void parser_dealloc (parser_object* self) {
htmllexDestroy(self->scanner);
parser_clear(self);
self->userData->encoding = NULL;
self->userData->parser = NULL;
Py_XDECREF(self->encoding);
self->encoding = NULL;
Py_XDECREF(self->doctype);
self->doctype = NULL;
PyMem_Del(self->userData->buf);
PyMem_Del(self->userData->tmp_buf);
PyMem_Del(self->userData);
@ -752,7 +791,28 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
Py_DECREF(self->encoding);
Py_INCREF(value);
self->encoding = value;
self->userData->encoding = value;
return 0;
}
/* getter for the "doctype" attribute: returns the stored doctype object
   with its reference count incremented for the caller */
static PyObject* parser_getdoctype (parser_object* self, void* closure) {
    PyObject* doctype = self->doctype;

    Py_INCREF(doctype);
    return doctype;
}
/* setter for the "doctype" attribute.
   value must be a Python string object; deleting the attribute
   (value == NULL) is refused. Returns 0 on success, or -1 with a
   TypeError set when the value is rejected. */
static int parser_setdoctype (parser_object* self, PyObject* value, void* closure) {
    PyObject* old;

    if (value == NULL) {
        PyErr_SetString(PyExc_TypeError, "Cannot delete doctype");
        return -1;
    }
    if (!PyString_Check(value)) {
        PyErr_SetString(PyExc_TypeError, "doctype must be string");
        return -1;
    }
    /* Store the new reference before releasing the old one, so that
       self->doctype never points at an object whose reference was
       already given up. Py_XDECREF tolerates a member that is NULL. */
    old = self->doctype;
    Py_INCREF(value);
    self->doctype = value;
    Py_XDECREF(old);
    return 0;
}
@ -768,6 +828,8 @@ static PyGetSetDef parser_getset[] = {
"handler object", NULL},
{"encoding", (getter)parser_getencoding, (setter)parser_setencoding,
"encoding", NULL},
{"doctype", (getter)parser_getdoctype, (setter)parser_setdoctype,
"doctype", NULL},
{NULL} /* Sentinel */
};
@ -886,6 +948,17 @@ PyMODINIT_FUNC inithtmlsax (void) {
Py_DECREF(m);
return;
}
if ((set_encoding = PyObject_GetAttrString(m, "set_encoding"))==NULL) {
Py_DECREF(resolve_entities);
Py_DECREF(m);
return;
}
if ((set_doctype = PyObject_GetAttrString(m, "set_doctype"))==NULL) {
Py_DECREF(resolve_entities);
Py_DECREF(set_encoding);
Py_DECREF(m);
return;
}
Py_DECREF(m);
if ((m = PyImport_ImportModule("linkcheck.containers"))==NULL) {
return;

View file

@ -68,9 +68,9 @@ typedef struct {
PyObject* tmp_attrval;
/* temporary HTML start tag attribute list (a SortedDict) */
PyObject* tmp_attrs;
/* parser.resolve_entities */
/* HtmlParser.resolve_entities */
PyObject* resolve_entities;
/* parser.SortedDict */
/* HtmlParser.SortedDict */
PyObject* list_dict;
/* stored Python exception (if error occurred in scanner) */
PyObject* exc_type;
@ -78,8 +78,8 @@ typedef struct {
PyObject* exc_tb;
/* error string */
PyObject* error;
/* encoding string (default iso8859-1) */
PyObject* encoding;
/* the parser object itself */
PyObject* parser;
} UserData;
#endif