mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-04 21:04:41 +00:00
emit unicode data, store encoding
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1853 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
f2e2288f4b
commit
10209ae499
5 changed files with 156 additions and 73 deletions
|
|
@ -59,6 +59,10 @@
|
|||
The parser quotes all attribute values.
|
||||
Python memory management interface is used.
|
||||
|
||||
4. Character encoding aware
|
||||
|
||||
The parser itself is not encoding aware, but all the output are
|
||||
always Python Unicode strings.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
|
@ -79,30 +83,39 @@ def _resolve_ascii_entity (mo):
|
|||
radix = 10
|
||||
num = int(num, radix)
|
||||
# check 7-bit ASCII char range
|
||||
if 0<=num<=127:
|
||||
return chr(num)
|
||||
if 0 <= num <= 127:
|
||||
return unicode(chr(num))
|
||||
# not in range
|
||||
return ent
|
||||
|
||||
|
||||
_num_re = re.compile(ur'(?i)&#x?(?P<num>\d+);')
|
||||
def resolve_ascii_entities (s):
|
||||
"""resolve entities in 7-bit ASCII range to eliminate obfuscation"""
|
||||
return re.sub(r'(?i)&#x?(?P<num>\d+);', _resolve_ascii_entity, s)
|
||||
return _num_re.sub(_resolve_ascii_entity, s)
|
||||
|
||||
|
||||
def _resolve_html_entity (mo):
|
||||
"""resolve html entity, helper function for resolve_html_entities"""
|
||||
return htmlentitydefs.entitydefs.get(mo.group("entity"), mo.group())
|
||||
ent = mo.group("entity")
|
||||
s = mo.group()
|
||||
entdef = htmlentitydefs.entitydefs.get(ent)
|
||||
if entdef is None:
|
||||
return s
|
||||
# note: entdef is latin-1 encoded
|
||||
return entdef.decode("iso8859-1")
|
||||
|
||||
|
||||
_entity_re = re.compile(ur'(?i)&(?P<entity>[a-z]+);')
|
||||
def resolve_html_entities (s):
|
||||
"""resolve html entites in s and return result"""
|
||||
return re.sub(r'(?i)&(?P<entity>[a-z]+);', _resolve_html_entity, s)
|
||||
return _entity_re.sub(_resolve_html_entity, s)
|
||||
|
||||
|
||||
def resolve_entities (s):
|
||||
"""resolve both html and 7-bit ASCII entites in s and return result"""
|
||||
return resolve_html_entities(resolve_ascii_entities(s))
|
||||
s = resolve_ascii_entities(s)
|
||||
return resolve_html_entities(s)
|
||||
|
||||
|
||||
def strip_quotes (s):
|
||||
|
|
@ -112,3 +125,4 @@ def strip_quotes (s):
|
|||
(s.startswith('"') and s.endswith('"'))):
|
||||
return s[1:-1]
|
||||
return s
|
||||
|
||||
|
|
|
|||
|
|
@ -36,13 +36,15 @@
|
|||
if ((b)==NULL) return T_ERROR; \
|
||||
(b)[(n)-1] = '\0'
|
||||
|
||||
/* make python string from tmp_buf and assign it to a */
|
||||
#define PYSTRING_TMP(a) \
|
||||
(a) = PyString_FromString(yyextra->tmp_buf); \
|
||||
if ((a)==NULL) return T_ERROR
|
||||
/* make python unicode string from tmp_buf and assign it to a */
|
||||
#define PYSTRING_TMP(a) { \
|
||||
const char* enc = PyString_AsString(yyextra->encoding); \
|
||||
(a) = PyUnicode_Decode(yyextra->tmp_buf, strlen(yyextra->tmp_buf), enc, "ignore"); \
|
||||
if ((a)==NULL) return T_ERROR; \
|
||||
}
|
||||
|
||||
/* set return value from tmp_buf */
|
||||
#define SETLVAL {\
|
||||
#define SETLVAL { \
|
||||
PyObject* s; \
|
||||
PYSTRING_TMP(s); \
|
||||
RESIZE_BUF(yyextra->tmp_buf, 1); \
|
||||
|
|
@ -50,26 +52,30 @@
|
|||
}
|
||||
|
||||
/* append yytext to tmp_buf */
|
||||
#define APPEND_TO_TMP(n) {\
|
||||
#define APPEND_TO_TMP(n) { \
|
||||
size_t len = strlen(yyextra->tmp_buf) + (n) + 1; \
|
||||
RESIZE_BUF(yyextra->tmp_buf, len); \
|
||||
strlcat(yyextra->tmp_buf, yytext, len); \
|
||||
}
|
||||
|
||||
/* lowercase the tmp_buf */
|
||||
#define LOWER_TMP {\
|
||||
#define LOWER_TMP { \
|
||||
char* p = yyextra->tmp_buf; \
|
||||
while (*p) { *p = tolower(*p); p++; } \
|
||||
}
|
||||
|
||||
/* check for JavaScript or CSS tags; must be before SET_ATTR_LVAL */
|
||||
#define SCRIPT_CHECK \
|
||||
if (strcmp("script", PyString_AS_STRING(yyextra->tmp_tag))==0) \
|
||||
#define SCRIPT_CHECK { \
|
||||
PyObject* tagname = PyUnicode_AsEncodedString(yyextra->tmp_tag, "ascii", "ignore"); \
|
||||
if (tagname==NULL) return T_ERROR; \
|
||||
if (strcmp("script", PyString_AsString(tagname))==0) \
|
||||
BEGIN(S_SCRIPT); \
|
||||
else if (strcmp("style", PyString_AS_STRING(yyextra->tmp_tag))==0) \
|
||||
else if (strcmp("style", PyString_AsString(tagname))==0) \
|
||||
BEGIN(S_STYLE); \
|
||||
else \
|
||||
BEGIN(INITIAL)
|
||||
BEGIN(INITIAL); \
|
||||
Py_DECREF(tagname); \
|
||||
}
|
||||
|
||||
/* set return value from tag with attributes */
|
||||
#define SET_ATTR_LVAL \
|
||||
|
|
@ -88,7 +94,7 @@
|
|||
if (strlen(yyextra->tmp_buf) > 0) { \
|
||||
PYSTRING_TMP(yyextra->tmp_attrname); \
|
||||
RESIZE_BUF(yyextra->tmp_buf, 1); \
|
||||
if (PyMapping_SetItemString(yyextra->tmp_attrs, PyString_AsString(yyextra->tmp_attrname), Py_None)==-1) return T_ERROR; \
|
||||
if (PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname, Py_None)==-1) return T_ERROR; \
|
||||
Py_DECREF(yyextra->tmp_attrname); \
|
||||
yyextra->tmp_attrname = NULL; \
|
||||
}
|
||||
|
|
@ -683,8 +689,7 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
if (yyextra->tmp_attrval!=NULL) return T_ERROR;
|
||||
Py_INCREF(Py_None);
|
||||
yyextra->tmp_attrval = Py_None;
|
||||
if (PyMapping_SetItemString(yyextra->tmp_attrs,
|
||||
PyString_AsString(yyextra->tmp_attrname),
|
||||
if (PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
/*Py_DECREF(yyextra->tmp_attrname);*/
|
||||
/*Py_DECREF(yyextra->tmp_attrval);*/
|
||||
|
|
@ -726,10 +731,10 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
UPDATE_COLUMN;
|
||||
PYSTRING_TMP(yyextra->tmp_attrval);
|
||||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities, "O", yyextra->tmp_attrval);
|
||||
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
|
||||
"O", yyextra->tmp_attrval);
|
||||
if (yyextra->tmp_attrval==NULL) return T_ERROR;
|
||||
if (PyMapping_SetItemString(yyextra->tmp_attrs,
|
||||
PyString_AsString(yyextra->tmp_attrname),
|
||||
if (PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
Py_DECREF(yyextra->tmp_attrname);
|
||||
Py_DECREF(yyextra->tmp_attrval);
|
||||
|
|
@ -752,10 +757,10 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
UPDATE_COLUMN;
|
||||
PYSTRING_TMP(yyextra->tmp_attrval);
|
||||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities, "O", yyextra->tmp_attrval);
|
||||
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
|
||||
"O", yyextra->tmp_attrval);
|
||||
if (yyextra->tmp_attrval==NULL) return T_ERROR;
|
||||
if (PyMapping_SetItemString(yyextra->tmp_attrs,
|
||||
PyString_AsString(yyextra->tmp_attrname),
|
||||
if (PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
Py_DECREF(yyextra->tmp_attrname);
|
||||
Py_DECREF(yyextra->tmp_attrval);
|
||||
|
|
@ -769,10 +774,10 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
UPDATE_COLUMN;
|
||||
PYSTRING_TMP(yyextra->tmp_attrval);
|
||||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities, "O", yyextra->tmp_attrval);
|
||||
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
|
||||
"O", yyextra->tmp_attrval);
|
||||
if (yyextra->tmp_attrval==NULL) return T_ERROR;
|
||||
if (PyMapping_SetItemString(yyextra->tmp_attrs,
|
||||
PyString_AsString(yyextra->tmp_attrname),
|
||||
if (PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
Py_DECREF(yyextra->tmp_attrname);
|
||||
Py_DECREF(yyextra->tmp_attrval);
|
||||
|
|
@ -786,10 +791,10 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
UPDATE_LINE;
|
||||
PYSTRING_TMP(yyextra->tmp_attrval);
|
||||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities, "O", yyextra->tmp_attrval);
|
||||
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
|
||||
"O", yyextra->tmp_attrval);
|
||||
if (yyextra->tmp_attrval==NULL) return T_ERROR;
|
||||
if (PyMapping_SetItemString(yyextra->tmp_attrs,
|
||||
PyString_AsString(yyextra->tmp_attrname),
|
||||
if (PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
Py_DECREF(yyextra->tmp_attrname);
|
||||
Py_DECREF(yyextra->tmp_attrval);
|
||||
|
|
@ -807,10 +812,10 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
UPDATE_COLUMN;
|
||||
PYSTRING_TMP(yyextra->tmp_attrval);
|
||||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities, "O", yyextra->tmp_attrval);
|
||||
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
|
||||
"O", yyextra->tmp_attrval);
|
||||
if (yyextra->tmp_attrval==NULL) return T_ERROR;
|
||||
if (PyMapping_SetItemString(yyextra->tmp_attrs,
|
||||
PyString_AsString(yyextra->tmp_attrname),
|
||||
if (PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
Py_DECREF(yyextra->tmp_attrname);
|
||||
Py_DECREF(yyextra->tmp_attrval);
|
||||
|
|
@ -840,10 +845,10 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
UPDATE_COLUMN;
|
||||
PYSTRING_TMP(yyextra->tmp_attrval);
|
||||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities, "O", yyextra->tmp_attrval);
|
||||
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities,
|
||||
"O", yyextra->tmp_attrval);
|
||||
if (yyextra->tmp_attrval==NULL) return T_ERROR;
|
||||
if (PyMapping_SetItemString(yyextra->tmp_attrs,
|
||||
PyString_AsString(yyextra->tmp_attrname),
|
||||
if (PyObject_SetItem(yyextra->tmp_attrs, yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
Py_DECREF(yyextra->tmp_attrname);
|
||||
Py_DECREF(yyextra->tmp_attrval);
|
||||
|
|
|
|||
|
|
@ -53,44 +53,54 @@ class HtmlPrinter (object):
|
|||
|
||||
|
||||
class HtmlPrettyPrinter (object):
|
||||
"""Print out all parsed HTML data"""
|
||||
"""Print out all parsed HTML data in encoded form."""
|
||||
|
||||
def __init__ (self, fd=sys.stdout):
|
||||
def __init__ (self, fd=sys.stdout, encoding="iso8859-1"):
|
||||
"""write to given file descriptor"""
|
||||
self.fd = fd
|
||||
self.encoding = encoding
|
||||
|
||||
def comment (self, data):
|
||||
"""print comment"""
|
||||
data = data.encode(self.encoding, "ignore")
|
||||
self.fd.write("<!--%s-->" % data)
|
||||
|
||||
def start_element (self, tag, attrs):
|
||||
"""print start element"""
|
||||
self.fd.write("<%s"%tag.replace("/", ""))
|
||||
tag = tag.encode(self.encoding, "ignore")
|
||||
self.fd.write("<%s" % tag.replace("/", ""))
|
||||
for key, val in attrs.iteritems():
|
||||
key = key.encode(self.encoding, "ignore")
|
||||
if val is None:
|
||||
self.fd.write(" %s"%key)
|
||||
self.fd.write(" %s" % key)
|
||||
else:
|
||||
val = val.encode(self.encoding, "ignore")
|
||||
self.fd.write(" %s=\"%s\"" % (key, quote_attrval(val)))
|
||||
self.fd.write(">")
|
||||
|
||||
def end_element (self, tag):
|
||||
"""print end element"""
|
||||
tag = tag.encode(self.encoding, "ignore")
|
||||
self.fd.write("</%s>" % tag)
|
||||
|
||||
def doctype (self, data):
|
||||
"""print document type"""
|
||||
data = data.encode(self.encoding, "ignore")
|
||||
self.fd.write("<!DOCTYPE%s>" % data)
|
||||
|
||||
def pi (self, data):
|
||||
"""print pi"""
|
||||
data = data.encode(self.encoding, "ignore")
|
||||
self.fd.write("<?%s?>" % data)
|
||||
|
||||
def cdata (self, data):
|
||||
"""print cdata"""
|
||||
self.fd.write("<![CDATA[%s]]>"%data)
|
||||
data = data.encode(self.encoding, "ignore")
|
||||
self.fd.write("<![CDATA[%s]]>" % data)
|
||||
|
||||
def characters (self, data):
|
||||
"""print characters"""
|
||||
data = data.encode(self.encoding, "ignore")
|
||||
self.fd.write(data)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -104,6 +104,7 @@ static PyObject* list_dict;
|
|||
typedef struct {
|
||||
PyObject_HEAD
|
||||
PyObject* handler;
|
||||
PyObject* encoding;
|
||||
UserData* userData;
|
||||
void* scanner;
|
||||
} parser_object;
|
||||
|
|
@ -156,7 +157,7 @@ element: T_WAIT { YYACCEPT; /* wait for more lexer input */ }
|
|||
| T_ELEMENT_START
|
||||
{
|
||||
/* $1 is a PyTuple (<tag>, <attrs>)
|
||||
<tag> is a PyString, <attrs> is a PyDict */
|
||||
<tag> is a PyObject, <attrs> is a PyDict */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -191,14 +192,14 @@ finish_start:
|
|||
| T_ELEMENT_START_END
|
||||
{
|
||||
/* $1 is a PyTuple (<tag>, <attrs>)
|
||||
<tag> is a PyString, <attrs> is a PyDict */
|
||||
<tag> is a PyObject, <attrs> is a PyDict */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
PyObject* tag = PyTuple_GET_ITEM($1, 0);
|
||||
PyObject* attrs = PyTuple_GET_ITEM($1, 1);
|
||||
int error = 0;
|
||||
char* tagname;
|
||||
PyObject* tagname = NULL;
|
||||
if (!tag || !attrs) { error = 1; goto finish_start_end; }
|
||||
if (PyObject_HasAttrString(ud->handler, "start_element")==1) {
|
||||
callback = PyObject_GetAttrString(ud->handler, "start_element");
|
||||
|
|
@ -209,9 +210,11 @@ finish_start:
|
|||
Py_DECREF(result);
|
||||
callback=result=NULL;
|
||||
}
|
||||
tagname = PyString_AS_STRING(tag);
|
||||
/* encode tagname in ASCII, ignoring any unknown chars */
|
||||
tagname = PyUnicode_AsEncodedString(tag, "ascii", "ignore");
|
||||
if (tagname==NULL) { error=1; goto finish_start_end; }
|
||||
if (PyObject_HasAttrString(ud->handler, "end_element")==1 &&
|
||||
NO_HTML_END_TAG(tagname)) {
|
||||
NO_HTML_END_TAG(PyString_AsString(tagname))) {
|
||||
callback = PyObject_GetAttrString(ud->handler, "end_element");
|
||||
if (callback==NULL) { error=1; goto finish_start_end; }
|
||||
result = PyObject_CallFunction(callback, "O", tag);
|
||||
|
|
@ -227,6 +230,7 @@ finish_start_end:
|
|||
Py_XDECREF(callback);
|
||||
Py_XDECREF(result);
|
||||
Py_XDECREF(tag);
|
||||
Py_XDECREF(tagname);
|
||||
Py_XDECREF(attrs);
|
||||
Py_DECREF($1);
|
||||
if (error) {
|
||||
|
|
@ -237,14 +241,16 @@ finish_start_end:
|
|||
}
|
||||
| T_ELEMENT_END
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
/* $1 is a PyUnicode */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
int error = 0;
|
||||
char* tagname = PyString_AS_STRING($1);
|
||||
/* encode tagname in ASCII, ignoring any unknown chars */
|
||||
PyObject* tagname = PyUnicode_AsEncodedString($1, "ascii", "ignore");
|
||||
if (tagname==NULL) { error=1; goto finish_end; }
|
||||
if (PyObject_HasAttrString(ud->handler, "end_element")==1 &&
|
||||
NO_HTML_END_TAG(tagname)) {
|
||||
NO_HTML_END_TAG(PyString_AsString(tagname))) {
|
||||
callback = PyObject_GetAttrString(ud->handler, "end_element");
|
||||
if (callback==NULL) { error=1; goto finish_end; }
|
||||
result = PyObject_CallFunction(callback, "O", $1);
|
||||
|
|
@ -257,6 +263,7 @@ finish_start_end:
|
|||
finish_end:
|
||||
Py_XDECREF(ud->error);
|
||||
ud->error = NULL;
|
||||
Py_XDECREF(tagname);
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(result);
|
||||
Py_DECREF($1);
|
||||
|
|
@ -268,7 +275,7 @@ finish_end:
|
|||
}
|
||||
| T_COMMENT
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
/* $1 is a PyUnicode */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -289,7 +296,7 @@ finish_comment:
|
|||
}
|
||||
| T_PI
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
/* $1 is a PyUnicode */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -310,7 +317,7 @@ finish_pi:
|
|||
}
|
||||
| T_CDATA
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
/* $1 is a PyUnicode */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -331,7 +338,7 @@ finish_cdata:
|
|||
}
|
||||
| T_DOCTYPE
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
/* $1 is a PyUnicode */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
|
|
@ -352,18 +359,21 @@ finish_doctype:
|
|||
}
|
||||
| T_SCRIPT
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
/* $1 is a PyUnicode */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
int error = 0;
|
||||
PyObject* script = PyUnicode_DecodeASCII("script", 6, "ignore");
|
||||
if (script==NULL) { error=1; goto finish_script; }
|
||||
CALLBACK(ud, "characters", "O", $1, finish_script);
|
||||
CALLBACK(ud, "end_element", "s", "script", finish_script);
|
||||
CALLBACK(ud, "end_element", "O", script, finish_script);
|
||||
CHECK_ERROR(ud, finish_script);
|
||||
finish_script:
|
||||
Py_XDECREF(ud->error);
|
||||
ud->error = NULL;
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(script);
|
||||
Py_XDECREF(result);
|
||||
Py_DECREF($1);
|
||||
if (error) {
|
||||
|
|
@ -374,18 +384,21 @@ finish_script:
|
|||
}
|
||||
| T_STYLE
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
/* $1 is a PyUnicode */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
int error = 0;
|
||||
PyObject* style = PyUnicode_DecodeASCII("style", 5, "ignore");
|
||||
if (style==NULL) { error=1; goto finish_style; }
|
||||
CALLBACK(ud, "characters", "O", $1, finish_style);
|
||||
CALLBACK(ud, "end_element", "s", "style", finish_style);
|
||||
CALLBACK(ud, "end_element", "O", style, finish_style);
|
||||
CHECK_ERROR(ud, finish_style);
|
||||
finish_style:
|
||||
Py_XDECREF(ud->error);
|
||||
ud->error = NULL;
|
||||
Py_XDECREF(callback);
|
||||
Py_XDECREF(style);
|
||||
Py_XDECREF(result);
|
||||
Py_DECREF($1);
|
||||
if (error) {
|
||||
|
|
@ -396,7 +409,7 @@ finish_style:
|
|||
}
|
||||
| T_TEXT
|
||||
{
|
||||
/* $1 is a PyString */
|
||||
/* $1 is a PyUnicode */
|
||||
/* Remember this is also called as a lexer error fallback */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
|
|
@ -428,16 +441,15 @@ finish_characters:
|
|||
/* create parser object */
|
||||
static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds) {
|
||||
parser_object* self;
|
||||
if ((self = (parser_object*) type->tp_alloc(type, 0)) == NULL)
|
||||
{
|
||||
if ((self = (parser_object*) type->tp_alloc(type, 0)) == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
Py_INCREF(Py_None);
|
||||
self->handler = Py_None;
|
||||
/* reset userData */
|
||||
self->userData = PyMem_New(UserData, sizeof(UserData));
|
||||
if (self->userData == NULL)
|
||||
{
|
||||
if (self->userData == NULL) {
|
||||
Py_DECREF(self->handler);
|
||||
Py_DECREF(self);
|
||||
return NULL;
|
||||
}
|
||||
|
|
@ -463,11 +475,18 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
|
|||
self->userData->exc_tb = NULL;
|
||||
self->userData->error = NULL;
|
||||
self->scanner = NULL;
|
||||
if (htmllexInit(&(self->scanner), self->userData)!=0)
|
||||
{
|
||||
if (htmllexInit(&(self->scanner), self->userData)!=0) {
|
||||
Py_DECREF(self->handler);
|
||||
Py_DECREF(self);
|
||||
return NULL;
|
||||
}
|
||||
self->encoding = PyString_FromString("iso8859-1");
|
||||
if (self->encoding == NULL) {
|
||||
Py_DECREF(self->handler);
|
||||
Py_DECREF(self);
|
||||
return NULL;
|
||||
}
|
||||
self->userData->encoding = self->encoding;
|
||||
return (PyObject*) self;
|
||||
}
|
||||
|
||||
|
|
@ -501,9 +520,9 @@ static int parser_traverse (parser_object* self, visitproc visit, void* arg) {
|
|||
|
||||
/* clear all used subobjects participating in reference cycles */
|
||||
static int parser_clear (parser_object* self) {
|
||||
self->userData->handler = NULL;
|
||||
Py_XDECREF(self->handler);
|
||||
self->handler = NULL;
|
||||
self->userData->handler = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
@ -512,6 +531,9 @@ static int parser_clear (parser_object* self) {
|
|||
static void parser_dealloc (parser_object* self) {
|
||||
htmllexDestroy(self->scanner);
|
||||
parser_clear(self);
|
||||
self->userData->encoding = NULL;
|
||||
Py_XDECREF(self->encoding);
|
||||
self->encoding = NULL;
|
||||
PyMem_Del(self->userData->buf);
|
||||
PyMem_Del(self->userData->tmp_buf);
|
||||
PyMem_Del(self->userData);
|
||||
|
|
@ -570,7 +592,10 @@ static PyObject* parser_flush (parser_object* self, PyObject* args) {
|
|||
if (strlen(self->userData->buf)) {
|
||||
/* XXX set line, col */
|
||||
int error = 0;
|
||||
PyObject* s = PyString_FromString(self->userData->buf);
|
||||
const char* enc = PyString_AsString(self->encoding);
|
||||
PyObject* s = PyUnicode_Decode(self->userData->buf,
|
||||
strlen(self->userData->buf),
|
||||
enc, "ignore");
|
||||
PyObject* callback = NULL;
|
||||
PyObject* result = NULL;
|
||||
/* reset buffer */
|
||||
|
|
@ -701,6 +726,7 @@ static PyObject* parser_gethandler (parser_object* self, void* closure) {
|
|||
return self->handler;
|
||||
}
|
||||
|
||||
|
||||
static int parser_sethandler (parser_object* self, PyObject* value, void* closure) {
|
||||
if (value == NULL) {
|
||||
PyErr_SetString(PyExc_TypeError, "Cannot delete parser handler");
|
||||
|
|
@ -709,10 +735,34 @@ static int parser_sethandler (parser_object* self, PyObject* value, void* closur
|
|||
Py_DECREF(self->handler);
|
||||
Py_INCREF(value);
|
||||
self->handler = value;
|
||||
self->userData->handler = self->handler;
|
||||
self->userData->handler = value;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static PyObject* parser_getencoding (parser_object* self, void* closure) {
|
||||
Py_INCREF(self->encoding);
|
||||
return self->encoding;
|
||||
}
|
||||
|
||||
|
||||
static int parser_setencoding (parser_object* self, PyObject* value, void* closure) {
|
||||
if (value == NULL) {
|
||||
PyErr_SetString(PyExc_TypeError, "Cannot delete encoding");
|
||||
return -1;
|
||||
}
|
||||
if (!PyString_Check(value)) {
|
||||
PyErr_SetString(PyExc_TypeError, "encoding must be string");
|
||||
return -1;
|
||||
}
|
||||
Py_DECREF(self->encoding);
|
||||
Py_INCREF(value);
|
||||
self->encoding = value;
|
||||
self->userData->encoding = value;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* type interface */
|
||||
|
||||
static PyMemberDef parser_members[] = {
|
||||
|
|
@ -722,19 +772,21 @@ static PyMemberDef parser_members[] = {
|
|||
static PyGetSetDef parser_getset[] = {
|
||||
{"handler", (getter)parser_gethandler, (setter)parser_sethandler,
|
||||
"handler object", NULL},
|
||||
{"encoding", (getter)parser_getencoding, (setter)parser_setencoding,
|
||||
"encoding", NULL},
|
||||
{NULL} /* Sentinel */
|
||||
};
|
||||
|
||||
static PyMethodDef parser_methods[] = {
|
||||
{"feed", (PyCFunction)parser_feed, METH_VARARGS, "feed data to parse incremental"},
|
||||
{"feed", (PyCFunction)parser_feed, METH_VARARGS, "feed data to parse incremental"},
|
||||
{"reset", (PyCFunction)parser_reset, METH_VARARGS, "reset the parser (no flushing)"},
|
||||
{"flush", (PyCFunction)parser_flush, METH_VARARGS, "flush parser buffers"},
|
||||
{"debug", (PyCFunction)parser_debug, METH_VARARGS, "set debug level"},
|
||||
{"lineno", (PyCFunction)parser_lineno, METH_VARARGS, "get the current line number"},
|
||||
{"lineno", (PyCFunction)parser_lineno, METH_VARARGS, "get the current line number"},
|
||||
{"last_lineno", (PyCFunction)parser_last_lineno, METH_VARARGS, "get the last line number"},
|
||||
{"column", (PyCFunction)parser_column, METH_VARARGS, "get the current column"},
|
||||
{"column", (PyCFunction)parser_column, METH_VARARGS, "get the current column"},
|
||||
{"last_column", (PyCFunction)parser_last_column, METH_VARARGS, "get the last column"},
|
||||
{"pos", (PyCFunction)parser_pos, METH_VARARGS, "get the current scanner position"},
|
||||
{"pos", (PyCFunction)parser_pos, METH_VARARGS, "get the current scanner position"},
|
||||
{NULL} /* Sentinel */
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -78,6 +78,8 @@ typedef struct {
|
|||
PyObject* exc_tb;
|
||||
/* error string */
|
||||
PyObject* error;
|
||||
/* encoding string (default iso8859-1) */
|
||||
PyObject* encoding;
|
||||
} UserData;
|
||||
|
||||
#endif
|
||||
|
|
|
|||
Loading…
Reference in a new issue