mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-17 12:51:01 +00:00
XHTML support
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2108 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
6cec856dba
commit
4fbdbe3a51
7 changed files with 186 additions and 53 deletions
|
|
@ -1,3 +1,9 @@
|
|||
2.1 "" (released xx.xx.xxxx)
|
||||
|
||||
* Added XHTML support to the HTML parser.
|
||||
Type: feature
|
||||
Changed: linkcheck/HtmlParser/*
|
||||
|
||||
2.0 "I Kina spiser de hunde" (released 7.12.2004)
|
||||
|
||||
* Regenerate the HTML parser with new Bison version 1.875d.
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ PYTHON = python$(PYVER)
|
|||
all: htmllex.c htmlparse.c
|
||||
|
||||
%.o: %.c
|
||||
gcc -g -O3 -Wall -pedantic -Wstrict-prototypes -fPIC -I. -I/usr/include/$(PYTHON) -c $< -o $@
|
||||
gcc -g -std=c99 -O3 -Wall -pedantic -Wstrict-prototypes -fPIC -I. -I/usr/include/$(PYTHON) -c $< -o $@
|
||||
|
||||
htmlparse.h htmlparse.c: htmlparse.y htmlsax.h
|
||||
bison htmlparse.y
|
||||
|
|
|
|||
|
|
@ -66,6 +66,7 @@
|
|||
"""
|
||||
|
||||
import re
|
||||
import codecs
|
||||
import htmlentitydefs
|
||||
|
||||
|
||||
|
|
@ -126,3 +127,28 @@ def strip_quotes (s):
|
|||
return s[1:-1]
|
||||
return s
|
||||
|
||||
|
||||
_encoding_ro = re.compile(r"charset=(?P<encoding>[-0-9a-zA-Z]+)")


def set_encoding (self, tag, attrs):
    """Set document encoding for given parser. Tag must be a meta tag.

    Looks for a meta tag whose http-equiv attribute is "content-type"
    (case-insensitive) and whose content attribute carries a
    "charset=NAME" part. NAME is stored in self.encoding only if
    codecs.lookup() knows it; in every other case self.encoding is
    left untouched.
    """
    if tag != u'meta':
        return
    # NOTE(review): attributes written without a value presumably arrive
    # as None - confirm against the scanner. "or" maps them to u''.
    http_equiv = attrs.get('http-equiv') or u''
    if http_equiv.lower() != u"content-type":
        return
    content = attrs.get('content') or u''
    mo = _encoding_ro.search(content)
    if mo is None:
        return
    # The pattern only admits ASCII letters, digits and '-', so the
    # conversion to the native string type cannot fail.
    encoding = str(mo.group("encoding"))
    try:
        codecs.lookup(encoding)
    except LookupError:
        # ignore unknown encodings
        return
    self.encoding = encoding
|
||||
|
||||
|
||||
def set_doctype (self, doctype):
    """Mark the parser as XHTML when the doctype text mentions "XHTML".

    Any other doctype leaves self.doctype as it was.
    """
    if doctype.find(u"XHTML") >= 0:
        self.doctype = "XHTML"
|
||||
|
||||
|
|
|
|||
|
|
@ -38,8 +38,15 @@
|
|||
|
||||
/* Make a Python unicode string from yyextra->tmp_buf and assign it to a.
   The codec name is read from the "encoding" attribute of yyextra->parser
   on every expansion; bytes that cannot be decoded are dropped ("ignore").
   On any failure the enclosing function returns T_ERROR. */
#define PYSTRING_TMP(a) { \
    PyObject* pencoding; \
    char* encoding; \
    pencoding = PyObject_GetAttrString(yyextra->parser, "encoding"); \
    if (pencoding==NULL) return T_ERROR; \
    encoding = PyString_AsString(pencoding); \
    if (encoding==NULL) { Py_DECREF(pencoding); return T_ERROR; } \
    (a) = PyUnicode_Decode(yyextra->tmp_buf, strlen(yyextra->tmp_buf), \
                           encoding, "ignore"); \
    Py_DECREF(pencoding); \
    if ((a)==NULL) return T_ERROR; \
}
|
||||
|
||||
|
|
@ -655,6 +662,14 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
RETURN(T_ELEMENT_START_END);
|
||||
}
|
||||
|
||||
<S_ATTR1,S_ATTR2,S_ATTR3>\/[^>] {
|
||||
UPDATE_LINE;
|
||||
}
|
||||
|
||||
<S_ATTR1,S_ATTR2,S_ATTR3>\/ {
|
||||
return T_WAIT;
|
||||
}
|
||||
|
||||
<S_ATTR1,S_ATTR2,S_ATTR3>> {
|
||||
UPDATE_COLUMN;
|
||||
FLUSH_ATTRS;
|
||||
|
|
@ -1001,13 +1016,19 @@ int htmllexStart (void* scanner, UserData* data, const char* s, int slen) {
|
|||
we must replace NUL with ' '. */
|
||||
int len = strlen(data->buf);
|
||||
int i;
|
||||
RESIZE_BUF(data->buf, len+slen + 1);
|
||||
for (i=0; i<slen; i++) {
|
||||
data->buf[len+i] = (s[i]==0 ? ' ' : s[i]);
|
||||
RESIZE_BUF(data->buf, len + slen + 1);
|
||||
for (i=0; i < slen; i++) {
|
||||
data->buf[len+i] = (s[i]=='\0' ? ' ' : s[i]);
|
||||
}
|
||||
data->buf[len+slen] = '\0';
|
||||
if (yyget_debug(scanner)) {
|
||||
fprintf(stderr, "SCANBUF %d `%s'\n", data->bufpos, data->buf);
|
||||
}
|
||||
data->buf[len+slen] = 0;
|
||||
if (len > data->bufpos) {
|
||||
int rewind = len - data->bufpos;
|
||||
if (yyget_debug(scanner)) {
|
||||
fprintf(stderr, "REWIND %d\n", rewind);
|
||||
}
|
||||
slen += rewind;
|
||||
len -= rewind;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -67,6 +67,13 @@ class HtmlPrettyPrinter (object):
|
|||
|
||||
def start_element (self, tag, attrs):
    """Write an opening tag that is closed with a plain ">"."""
    terminator = ">"
    self._start_element(tag, attrs, terminator)
|
||||
|
||||
def start_end_element (self, tag, attrs):
    """Write a combined start-end tag, i.e. one closed with "/>"."""
    terminator = "/>"
    self._start_element(tag, attrs, terminator)
|
||||
|
||||
def _start_element (self, tag, attrs, end):
|
||||
tag = tag.encode(self.encoding, "ignore")
|
||||
self.fd.write("<%s" % tag.replace("/", ""))
|
||||
for key, val in attrs.iteritems():
|
||||
|
|
@ -76,7 +83,7 @@ class HtmlPrettyPrinter (object):
|
|||
else:
|
||||
val = val.encode(self.encoding, "ignore")
|
||||
self.fd.write(" %s=\"%s\"" % (key, quote_attrval(val)))
|
||||
self.fd.write(">")
|
||||
self.fd.write(end)
|
||||
|
||||
def end_element (self, tag):
|
||||
"""print end element"""
|
||||
|
|
|
|||
|
|
@ -44,25 +44,15 @@ static int yyerror (char* msg) {
|
|||
|
||||
/* parser.resolve_entities */
|
||||
static PyObject* resolve_entities;
|
||||
/* ListDict class, sorted dictionary */
|
||||
static PyObject* list_dict;
|
||||
/* set_encoding helper function */
|
||||
static PyObject* set_encoding;
|
||||
/* set_doctype helper function */
|
||||
static PyObject* set_doctype;
|
||||
|
||||
/* macros for easier scanner state manipulation */
|
||||
|
||||
/* test whether tag does not need an HTML end tag */
|
||||
#define NO_HTML_END_TAG(tag) !(strcmp(tag, "area")==0 || \
|
||||
strcmp(tag, "base")==0 || \
|
||||
strcmp(tag, "basefont")==0 || \
|
||||
strcmp(tag, "br")==0 || \
|
||||
strcmp(tag, "col")==0 || \
|
||||
strcmp(tag, "frame")==0 || \
|
||||
strcmp(tag, "hr")==0 || \
|
||||
strcmp(tag, "img")==0 || \
|
||||
strcmp(tag, "input")==0 || \
|
||||
strcmp(tag, "isindex")==0 || \
|
||||
strcmp(tag, "link")==0 || \
|
||||
strcmp(tag, "meta")==0 || \
|
||||
strcmp(tag, "param")==0)
|
||||
|
||||
/* clear buffer b, returning NULL on error */
|
||||
#define CLEAR_BUF(b) \
|
||||
b = PyMem_Resize(b, char, 1); \
|
||||
|
|
@ -103,8 +93,12 @@ static PyObject* list_dict;
|
|||
/* parser type definition: instance layout of the Python-visible parser */
|
||||
typedef struct {
|
||||
PyObject_HEAD
|
||||
/* the handler object */
|
||||
PyObject* handler;
|
||||
/* the charset encoding (PyStringObject) */
|
||||
PyObject* encoding;
|
||||
/* the document type (PyStringObject); parser_new starts it as "HTML" */
|
||||
PyObject* doctype;
|
||||
/* per-parser UserData record; its buf and tmp_buf are freed in
   parser_dealloc */
UserData* userData;
|
||||
/* scanner handle; released with htmllexDestroy() in parser_dealloc */
void* scanner;
|
||||
} parser_object;
|
||||
|
|
@ -115,6 +109,37 @@ staticforward PyTypeObject parser_type;
|
|||
#define YYMALLOC PyMem_Malloc
|
||||
#define YYFREE PyMem_Free
|
||||
|
||||
/* Test whether tag does not need an HTML end tag. Return -1 on error. */
|
||||
static int html_end_tag (PyObject* ptag, PyObject* parser) {
|
||||
PyObject* pdoctype;
|
||||
char* doctype;
|
||||
int ret = 1;
|
||||
pdoctype = PyObject_GetAttrString(parser, "doctype");
|
||||
if (pdoctype==NULL) return -1;
|
||||
doctype = PyString_AsString(pdoctype);
|
||||
if (doctype == NULL) { Py_DECREF(pdoctype); return -1; }
|
||||
if (strcmp(doctype, "HTML")==0) {
|
||||
char* tag = PyString_AsString(ptag);
|
||||
if (tag == NULL) { Py_DECREF(pdoctype); return -1; }
|
||||
ret = strcmp(tag, "area")!=0 &&
|
||||
strcmp(tag, "base")!=0 &&
|
||||
strcmp(tag, "basefont")!=0 &&
|
||||
strcmp(tag, "br")!=0 &&
|
||||
strcmp(tag, "col")!=0 &&
|
||||
strcmp(tag, "frame")!=0 &&
|
||||
strcmp(tag, "hr")!=0 &&
|
||||
strcmp(tag, "img")!=0 &&
|
||||
strcmp(tag, "input")!=0 &&
|
||||
strcmp(tag, "isindex")!=0 &&
|
||||
strcmp(tag, "link")!=0 &&
|
||||
strcmp(tag, "meta")!=0 &&
|
||||
strcmp(tag, "param")!=0;
|
||||
}
|
||||
/* it is not HTML (presumably XHTML) */
|
||||
Py_DECREF(pdoctype);
|
||||
return ret;
|
||||
}
|
||||
|
||||
%}
|
||||
|
||||
/* parser options */
|
||||
|
|
@ -163,7 +188,11 @@ element: T_WAIT { YYACCEPT; /* wait for more lexer input */ }
|
|||
PyObject* tag = PyTuple_GET_ITEM($1, 0);
|
||||
PyObject* attrs = PyTuple_GET_ITEM($1, 1);
|
||||
int error = 0;
|
||||
if (!tag || !attrs) { error = 1; goto finish_start; }
|
||||
if (tag==NULL || attrs==NULL) { error = 1; goto finish_start; }
|
||||
/* set encoding */
|
||||
result = PyObject_CallFunction(set_encoding, "OOO", ud->parser, tag, attrs);
|
||||
if (result==NULL) { error=1; goto finish_start; }
|
||||
Py_DECREF(result); result = NULL;
|
||||
if (PyObject_HasAttrString(ud->handler, "start_element")==1) {
|
||||
callback = PyObject_GetAttrString(ud->handler, "start_element");
|
||||
if (!callback) { error=1; goto finish_start; }
|
||||
|
|
@ -171,7 +200,7 @@ element: T_WAIT { YYACCEPT; /* wait for more lexer input */ }
|
|||
if (!result) { error=1; goto finish_start; }
|
||||
Py_DECREF(callback);
|
||||
Py_DECREF(result);
|
||||
callback=result=NULL;
|
||||
callback = result = NULL;
|
||||
}
|
||||
CHECK_ERROR(ud, finish_start);
|
||||
finish_start:
|
||||
|
|
@ -198,29 +227,27 @@ finish_start:
|
|||
PyObject* tag = PyTuple_GET_ITEM($1, 0);
|
||||
PyObject* attrs = PyTuple_GET_ITEM($1, 1);
|
||||
int error = 0;
|
||||
PyObject* tagname = NULL;
|
||||
if (!tag || !attrs) { error = 1; goto finish_start_end; }
|
||||
if (PyObject_HasAttrString(ud->handler, "start_element")==1) {
|
||||
callback = PyObject_GetAttrString(ud->handler, "start_element");
|
||||
char* fname;
|
||||
PyObject* tagname;
|
||||
if (tag==NULL || attrs==NULL) { error = 1; goto finish_start_end; }
|
||||
tagname = PyUnicode_AsEncodedString(tag, "ascii", "ignore");
|
||||
if (tagname==NULL) { error=1; goto finish_start_end; }
|
||||
/* set encoding */
|
||||
result = PyObject_CallFunction(set_encoding, "OOO", ud->parser, tag, attrs);
|
||||
if (result==NULL) { error=1; goto finish_start_end; }
|
||||
Py_DECREF(result); result = NULL;
|
||||
if (html_end_tag(tagname, ud->parser))
|
||||
fname = "start_end_element";
|
||||
else
|
||||
fname = "start_element";
|
||||
if (PyObject_HasAttrString(ud->handler, fname)==1) {
|
||||
callback = PyObject_GetAttrString(ud->handler, fname);
|
||||
if (!callback) { error=1; goto finish_start_end; }
|
||||
result = PyObject_CallFunction(callback, "OO", tag, attrs);
|
||||
if (!result) { error=1; goto finish_start_end; }
|
||||
Py_DECREF(callback);
|
||||
Py_DECREF(result);
|
||||
callback=result=NULL;
|
||||
}
|
||||
/* encode tagname in ASCII, ignoring any unknown chars */
|
||||
tagname = PyUnicode_AsEncodedString(tag, "ascii", "ignore");
|
||||
if (tagname==NULL) { error=1; goto finish_start_end; }
|
||||
if (PyObject_HasAttrString(ud->handler, "end_element")==1 &&
|
||||
NO_HTML_END_TAG(PyString_AsString(tagname))) {
|
||||
callback = PyObject_GetAttrString(ud->handler, "end_element");
|
||||
if (callback==NULL) { error=1; goto finish_start_end; }
|
||||
result = PyObject_CallFunction(callback, "O", tag);
|
||||
if (result==NULL) { error=1; goto finish_start_end; }
|
||||
Py_DECREF(callback);
|
||||
Py_DECREF(result);
|
||||
callback=result=NULL;
|
||||
callback = result = NULL;
|
||||
}
|
||||
CHECK_ERROR(ud, finish_start_end);
|
||||
finish_start_end:
|
||||
|
|
@ -229,7 +256,6 @@ finish_start_end:
|
|||
Py_XDECREF(callback);
|
||||
Py_XDECREF(result);
|
||||
Py_XDECREF(tag);
|
||||
Py_XDECREF(tagname);
|
||||
Py_XDECREF(attrs);
|
||||
Py_DECREF($1);
|
||||
if (error) {
|
||||
|
|
@ -249,14 +275,14 @@ finish_start_end:
|
|||
PyObject* tagname = PyUnicode_AsEncodedString($1, "ascii", "ignore");
|
||||
if (tagname==NULL) { error=1; goto finish_end; }
|
||||
if (PyObject_HasAttrString(ud->handler, "end_element")==1 &&
|
||||
NO_HTML_END_TAG(PyString_AsString(tagname))) {
|
||||
html_end_tag(tagname, ud->parser)) {
|
||||
callback = PyObject_GetAttrString(ud->handler, "end_element");
|
||||
if (callback==NULL) { error=1; goto finish_end; }
|
||||
result = PyObject_CallFunction(callback, "O", $1);
|
||||
if (result==NULL) { error=1; goto finish_end; }
|
||||
Py_DECREF(callback);
|
||||
Py_DECREF(result);
|
||||
callback=result=NULL;
|
||||
callback = result = NULL;
|
||||
}
|
||||
CHECK_ERROR(ud, finish_end);
|
||||
finish_end:
|
||||
|
|
@ -342,6 +368,10 @@ finish_cdata:
|
|||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
int error = 0;
|
||||
/* set doctype */
|
||||
result = PyObject_CallFunction(set_doctype, "OO", ud->parser, $1);
|
||||
if (result==NULL) { error=1; goto finish_doctype; }
|
||||
Py_DECREF(result); result = NULL;
|
||||
CALLBACK(ud, "doctype", "O", $1, finish_doctype);
|
||||
CHECK_ERROR(ud, finish_doctype);
|
||||
finish_doctype:
|
||||
|
|
@ -480,7 +510,14 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
|
|||
Py_DECREF(self);
|
||||
return NULL;
|
||||
}
|
||||
self->userData->encoding = self->encoding;
|
||||
self->doctype = PyString_FromString("HTML");
|
||||
if (self->doctype == NULL) {
|
||||
Py_DECREF(self->encoding);
|
||||
Py_DECREF(self->handler);
|
||||
Py_DECREF(self);
|
||||
return NULL;
|
||||
}
|
||||
self->userData->parser = (PyObject*)self;
|
||||
return (PyObject*) self;
|
||||
}
|
||||
|
||||
|
|
@ -525,9 +562,11 @@ static int parser_clear (parser_object* self) {
|
|||
static void parser_dealloc (parser_object* self) {
|
||||
htmllexDestroy(self->scanner);
|
||||
parser_clear(self);
|
||||
self->userData->encoding = NULL;
|
||||
self->userData->parser = NULL;
|
||||
Py_XDECREF(self->encoding);
|
||||
self->encoding = NULL;
|
||||
Py_XDECREF(self->doctype);
|
||||
self->doctype = NULL;
|
||||
PyMem_Del(self->userData->buf);
|
||||
PyMem_Del(self->userData->tmp_buf);
|
||||
PyMem_Del(self->userData);
|
||||
|
|
@ -752,7 +791,28 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
|
|||
Py_DECREF(self->encoding);
|
||||
Py_INCREF(value);
|
||||
self->encoding = value;
|
||||
self->userData->encoding = value;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* getter for the "doctype" attribute: returns a new reference to the
   stored doctype object */
static PyObject* parser_getdoctype (parser_object* self, void* closure) {
    PyObject* doctype = self->doctype;
    Py_INCREF(doctype);
    return doctype;
}
|
||||
|
||||
|
||||
/* setter for the "doctype" attribute.
   Deleting the attribute (value == NULL) or assigning a non-string sets a
   TypeError and returns -1. Otherwise a new reference to value is stored
   and 0 is returned. */
static int parser_setdoctype (parser_object* self, PyObject* value, void* closure) {
    PyObject* old;
    if (value == NULL) {
        PyErr_SetString(PyExc_TypeError, "Cannot delete doctype");
        return -1;
    }
    if (!PyString_Check(value)) {
        PyErr_SetString(PyExc_TypeError, "doctype must be string");
        return -1;
    }
    /* store the new value before dropping the old reference, so that
       self->doctype never points at an object that was already released */
    old = self->doctype;
    Py_INCREF(value);
    self->doctype = value;
    Py_XDECREF(old);
    return 0;
}
|
||||
|
||||
|
|
@ -768,6 +828,8 @@ static PyGetSetDef parser_getset[] = {
|
|||
"handler object", NULL},
|
||||
{"encoding", (getter)parser_getencoding, (setter)parser_setencoding,
|
||||
"encoding", NULL},
|
||||
{"doctype", (getter)parser_getdoctype, (setter)parser_setdoctype,
|
||||
"doctype", NULL},
|
||||
{NULL} /* Sentinel */
|
||||
};
|
||||
|
||||
|
|
@ -886,6 +948,17 @@ PyMODINIT_FUNC inithtmlsax (void) {
|
|||
Py_DECREF(m);
|
||||
return;
|
||||
}
|
||||
if ((set_encoding = PyObject_GetAttrString(m, "set_encoding"))==NULL) {
|
||||
Py_DECREF(resolve_entities);
|
||||
Py_DECREF(m);
|
||||
return;
|
||||
}
|
||||
if ((set_doctype = PyObject_GetAttrString(m, "set_doctype"))==NULL) {
|
||||
Py_DECREF(resolve_entities);
|
||||
Py_DECREF(set_encoding);
|
||||
Py_DECREF(m);
|
||||
return;
|
||||
}
|
||||
Py_DECREF(m);
|
||||
if ((m = PyImport_ImportModule("linkcheck.containers"))==NULL) {
|
||||
return;
|
||||
|
|
|
|||
|
|
@ -68,9 +68,9 @@ typedef struct {
|
|||
PyObject* tmp_attrval;
|
||||
/* temporary HTML start tag attribute list (a SortedDict) */
|
||||
PyObject* tmp_attrs;
|
||||
/* parser.resolve_entities */
|
||||
/* HtmlParser.resolve_entities */
|
||||
PyObject* resolve_entities;
|
||||
/* parser.SortedDict */
|
||||
/* HtmlParser.SortedDict */
|
||||
PyObject* list_dict;
|
||||
/* stored Python exception (if error occurred in scanner) */
|
||||
PyObject* exc_type;
|
||||
|
|
@ -78,8 +78,8 @@ typedef struct {
|
|||
PyObject* exc_tb;
|
||||
/* error string */
|
||||
PyObject* error;
|
||||
/* encoding string (default iso8859-1) */
|
||||
PyObject* encoding;
|
||||
/* the parser object itself */
|
||||
PyObject* parser;
|
||||
} UserData;
|
||||
|
||||
#endif
|
||||
|
|
|
|||
Loading…
Reference in a new issue