Merge pull request #213 from cjmayo/python3_03

{python3_03} Python3: fix HtmlParser
This commit is contained in:
anarcat 2019-04-24 10:55:24 -04:00 committed by GitHub
commit 73e57de3fd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 62 additions and 60 deletions

View file

@ -2658,7 +2658,7 @@ static yyconst flex_int32_t yy_rule_linenum[131] =
PyObject* pencoding; \
char* encoding; \
CHECK_NULL(pencoding = PyObject_GetAttrString(yyextra->parser, "encoding")); \
encoding = PyString_AsString(pencoding); \
encoding = PyBytes_AsString(pencoding); \
if (encoding==NULL) { Py_DECREF(pencoding); return T_ERROR; } \
(a) = PyUnicode_Decode(yyextra->tmp_buf, \
(Py_ssize_t)strlen(yyextra->tmp_buf), \
@ -2704,9 +2704,9 @@ static yyconst flex_int32_t yy_rule_linenum[131] =
#define SCRIPT_CHECK { \
PyObject* tagname; \
CHECK_NULL(tagname = PyUnicode_AsEncodedString(yyextra->tmp_tag, "ascii", "ignore")); \
if (strcmp("script", PyString_AsString(tagname))==0) \
if (strcmp("script", PyBytes_AsString(tagname))==0) \
BEGIN(S_SCRIPT); \
else if (strcmp("style", PyString_AsString(tagname))==0) \
else if (strcmp("style", PyBytes_AsString(tagname))==0) \
BEGIN(S_STYLE); \
else \
BEGIN(INITIAL); \

View file

@ -55,7 +55,7 @@
PyObject* pencoding; \
char* encoding; \
CHECK_NULL(pencoding = PyObject_GetAttrString(yyextra->parser, "encoding")); \
encoding = PyString_AsString(pencoding); \
encoding = PyBytes_AsString(pencoding); \
if (encoding==NULL) { Py_DECREF(pencoding); return T_ERROR; } \
(a) = PyUnicode_Decode(yyextra->tmp_buf, \
(Py_ssize_t)strlen(yyextra->tmp_buf), \
@ -101,9 +101,9 @@
#define SCRIPT_CHECK { \
PyObject* tagname; \
CHECK_NULL(tagname = PyUnicode_AsEncodedString(yyextra->tmp_tag, "ascii", "ignore")); \
if (strcmp("script", PyString_AsString(tagname))==0) \
if (strcmp("script", PyBytes_AsString(tagname))==0) \
BEGIN(S_SCRIPT); \
else if (strcmp("style", PyString_AsString(tagname))==0) \
else if (strcmp("style", PyBytes_AsString(tagname))==0) \
BEGIN(S_STYLE); \
else \
BEGIN(INITIAL); \

View file

@ -180,9 +180,9 @@ typedef struct {
PyObject_HEAD
/* the handler object */
PyObject* handler;
/* the charset encoding (PyStringObject) */
/* the charset encoding (PyBytesObject) */
PyObject* encoding;
/* the document type (PyStringObject) */
/* the document type (PyBytesObject) */
PyObject* doctype;
UserData* userData;
void* scanner;
@ -204,11 +204,11 @@ static int html_end_tag (PyObject* ptag, PyObject* parser) {
int ret = 1;
pdoctype = PyObject_GetAttrString(parser, "doctype");
CHECK_ERROR((pdoctype == NULL), finish_html_end_tag);
doctype = PyString_AsString(pdoctype);
doctype = PyBytes_AsString(pdoctype);
CHECK_ERROR((doctype == NULL), finish_html_end_tag);
/* check for HTML (else it's presumably XHTML) */
if (strcmp(doctype, "HTML") == 0) {
char* tag = PyString_AsString(ptag);
char* tag = PyBytes_AsString(ptag);
CHECK_ERROR((tag == NULL), finish_html_end_tag);
ret = strcmp(tag, "area")!=0 &&
strcmp(tag, "base")!=0 &&
@ -2002,13 +2002,13 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
Py_DECREF(self);
return NULL;
}
self->encoding = PyString_FromString("iso8859-1");
self->encoding = PyBytes_FromString("iso8859-1");
if (self->encoding == NULL) {
Py_DECREF(self->handler);
Py_DECREF(self);
return NULL;
}
self->doctype = PyString_FromString("HTML");
self->doctype = PyBytes_FromString("HTML");
if (self->doctype == NULL) {
Py_DECREF(self->encoding);
Py_DECREF(self->handler);
@ -2128,7 +2128,7 @@ static PyObject* parser_flush (parser_object* self, PyObject* args) {
}
else ++(self->userData->column);
}
enc = PyString_AsString(self->encoding);
enc = PyBytes_AsString(self->encoding);
s = PyUnicode_Decode(self->userData->buf,
(Py_ssize_t)strlen(self->userData->buf), enc, "ignore");
/* reset buffer */
@ -2223,12 +2223,12 @@ static PyObject* parser_peek (parser_object* self, PyObject* args) {
}
buflen = strlen(self->userData->buf);
if (!buflen || self->userData->bufpos >= buflen) {
return PyString_FromString("");
return PyBytes_FromString("");
}
if (self->userData->bufpos + len >= buflen) {
len = buflen - self->userData->bufpos - 1;
}
return PyString_FromStringAndSize(self->userData->buf + self->userData->bufpos, len);
return PyBytes_FromStringAndSize(self->userData->buf + self->userData->bufpos, len);
}
@ -2309,7 +2309,7 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
PyErr_SetString(PyExc_TypeError, "Cannot delete encoding");
return -1;
}
if (!PyString_CheckExact(value)) {
if (!PyBytes_CheckExact(value)) {
PyErr_SetString(PyExc_TypeError, "encoding must be string");
return -1;
}
@ -2322,7 +2322,7 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
if (repr == NULL) {
return -1;
}
fprintf(stderr, "htmlsax: set encoding to %s\n", PyString_AsString(repr));
fprintf(stderr, "htmlsax: set encoding to %s\n", PyBytes_AsString(repr));
Py_DECREF(repr);
}
return 0;
@ -2342,9 +2342,9 @@ static int parser_setdoctype (parser_object* self, PyObject* value, void* closur
PyErr_SetString(PyExc_TypeError, "Cannot delete doctype");
return -1;
}
if (!PyString_CheckExact(value)) {
if (!PyBytes_CheckExact(value)) {
PyObject* repr = PyObject_Repr(value);
char* cp = PyString_AsString(repr);
char* cp = PyBytes_AsString(repr);
if (NULL == cp)
return -1;
PyErr_Format(PyExc_TypeError, "doctype %s must be string", cp);
@ -2460,35 +2460,36 @@ MOD_INIT(htmlsax) {
/* init error */
PyErr_Print();
}
if ((m = PyImport_ImportModule("linkcheck.HtmlParser")) == NULL) {
PyObject* h = NULL;
if ((h = PyImport_ImportModule("linkcheck.HtmlParser")) == NULL) {
return MOD_ERROR_VAL;
}
if ((resolve_entities = PyObject_GetAttrString(m, "resolve_entities")) == NULL) {
Py_DECREF(m);
if ((resolve_entities = PyObject_GetAttrString(h, "resolve_entities")) == NULL) {
Py_DECREF(h);
return MOD_ERROR_VAL;
}
if ((set_encoding = PyObject_GetAttrString(m, "set_encoding")) == NULL) {
if ((set_encoding = PyObject_GetAttrString(h, "set_encoding")) == NULL) {
Py_DECREF(resolve_entities);
Py_DECREF(m);
Py_DECREF(h);
return MOD_ERROR_VAL;
}
if ((set_doctype = PyObject_GetAttrString(m, "set_doctype")) == NULL) {
if ((set_doctype = PyObject_GetAttrString(h, "set_doctype")) == NULL) {
Py_DECREF(resolve_entities);
Py_DECREF(set_encoding);
Py_DECREF(m);
Py_DECREF(h);
return MOD_ERROR_VAL;
}
Py_DECREF(m);
if ((u_meta = PyString_Decode("meta", 4, "ascii", "ignore")) == NULL) {
Py_DECREF(h);
if ((u_meta = PyUnicode_FromStringAndSize("meta", 4)) == NULL) {
return MOD_ERROR_VAL;
}
if ((m = PyImport_ImportModule("linkcheck.containers")) == NULL) {
if ((h = PyImport_ImportModule("linkcheck.containers")) == NULL) {
return MOD_ERROR_VAL;
}
if ((list_dict = PyObject_GetAttrString(m, "ListDict")) == NULL) {
Py_DECREF(m);
if ((list_dict = PyObject_GetAttrString(h, "ListDict")) == NULL) {
Py_DECREF(h);
return MOD_ERROR_VAL;
}
Py_DECREF(m);
Py_DECREF(h);
return MOD_SUCCESS_VAL(m);
}

View file

@ -115,9 +115,9 @@ typedef struct {
PyObject_HEAD
/* the handler object */
PyObject* handler;
/* the charset encoding (PyStringObject) */
/* the charset encoding (PyBytesObject) */
PyObject* encoding;
/* the document type (PyStringObject) */
/* the document type (PyBytesObject) */
PyObject* doctype;
UserData* userData;
void* scanner;
@ -139,11 +139,11 @@ static int html_end_tag (PyObject* ptag, PyObject* parser) {
int ret = 1;
pdoctype = PyObject_GetAttrString(parser, "doctype");
CHECK_ERROR((pdoctype == NULL), finish_html_end_tag);
doctype = PyString_AsString(pdoctype);
doctype = PyBytes_AsString(pdoctype);
CHECK_ERROR((doctype == NULL), finish_html_end_tag);
/* check for HTML (else it's presumably XHTML) */
if (strcmp(doctype, "HTML") == 0) {
char* tag = PyString_AsString(ptag);
char* tag = PyBytes_AsString(ptag);
CHECK_ERROR((tag == NULL), finish_html_end_tag);
ret = strcmp(tag, "area")!=0 &&
strcmp(tag, "base")!=0 &&
@ -530,13 +530,13 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
Py_DECREF(self);
return NULL;
}
self->encoding = PyString_FromString("iso8859-1");
self->encoding = PyBytes_FromString("iso8859-1");
if (self->encoding == NULL) {
Py_DECREF(self->handler);
Py_DECREF(self);
return NULL;
}
self->doctype = PyString_FromString("HTML");
self->doctype = PyBytes_FromString("HTML");
if (self->doctype == NULL) {
Py_DECREF(self->encoding);
Py_DECREF(self->handler);
@ -656,7 +656,7 @@ static PyObject* parser_flush (parser_object* self, PyObject* args) {
}
else ++(self->userData->column);
}
enc = PyString_AsString(self->encoding);
enc = PyBytes_AsString(self->encoding);
s = PyUnicode_Decode(self->userData->buf,
(Py_ssize_t)strlen(self->userData->buf), enc, "ignore");
/* reset buffer */
@ -751,12 +751,12 @@ static PyObject* parser_peek (parser_object* self, PyObject* args) {
}
buflen = strlen(self->userData->buf);
if (!buflen || self->userData->bufpos >= buflen) {
return PyString_FromString("");
return PyBytes_FromString("");
}
if (self->userData->bufpos + len >= buflen) {
len = buflen - self->userData->bufpos - 1;
}
return PyString_FromStringAndSize(self->userData->buf + self->userData->bufpos, len);
return PyBytes_FromStringAndSize(self->userData->buf + self->userData->bufpos, len);
}
@ -837,7 +837,7 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
PyErr_SetString(PyExc_TypeError, "Cannot delete encoding");
return -1;
}
if (!PyString_CheckExact(value)) {
if (!PyBytes_CheckExact(value)) {
PyErr_SetString(PyExc_TypeError, "encoding must be string");
return -1;
}
@ -850,7 +850,7 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
if (repr == NULL) {
return -1;
}
fprintf(stderr, "htmlsax: set encoding to %s\n", PyString_AsString(repr));
fprintf(stderr, "htmlsax: set encoding to %s\n", PyBytes_AsString(repr));
Py_DECREF(repr);
}
return 0;
@ -870,9 +870,9 @@ static int parser_setdoctype (parser_object* self, PyObject* value, void* closur
PyErr_SetString(PyExc_TypeError, "Cannot delete doctype");
return -1;
}
if (!PyString_CheckExact(value)) {
if (!PyBytes_CheckExact(value)) {
PyObject* repr = PyObject_Repr(value);
char* cp = PyString_AsString(repr);
char* cp = PyBytes_AsString(repr);
if (NULL == cp)
return -1;
PyErr_Format(PyExc_TypeError, "doctype %s must be string", cp);
@ -988,35 +988,36 @@ MOD_INIT(htmlsax) {
/* init error */
PyErr_Print();
}
if ((m = PyImport_ImportModule("linkcheck.HtmlParser")) == NULL) {
PyObject* h = NULL;
if ((h = PyImport_ImportModule("linkcheck.HtmlParser")) == NULL) {
return MOD_ERROR_VAL;
}
if ((resolve_entities = PyObject_GetAttrString(m, "resolve_entities")) == NULL) {
Py_DECREF(m);
if ((resolve_entities = PyObject_GetAttrString(h, "resolve_entities")) == NULL) {
Py_DECREF(h);
return MOD_ERROR_VAL;
}
if ((set_encoding = PyObject_GetAttrString(m, "set_encoding")) == NULL) {
if ((set_encoding = PyObject_GetAttrString(h, "set_encoding")) == NULL) {
Py_DECREF(resolve_entities);
Py_DECREF(m);
Py_DECREF(h);
return MOD_ERROR_VAL;
}
if ((set_doctype = PyObject_GetAttrString(m, "set_doctype")) == NULL) {
if ((set_doctype = PyObject_GetAttrString(h, "set_doctype")) == NULL) {
Py_DECREF(resolve_entities);
Py_DECREF(set_encoding);
Py_DECREF(m);
Py_DECREF(h);
return MOD_ERROR_VAL;
}
Py_DECREF(m);
if ((u_meta = PyString_Decode("meta", 4, "ascii", "ignore")) == NULL) {
Py_DECREF(h);
if ((u_meta = PyUnicode_FromStringAndSize("meta", 4)) == NULL) {
return MOD_ERROR_VAL;
}
if ((m = PyImport_ImportModule("linkcheck.containers")) == NULL) {
if ((h = PyImport_ImportModule("linkcheck.containers")) == NULL) {
return MOD_ERROR_VAL;
}
if ((list_dict = PyObject_GetAttrString(m, "ListDict")) == NULL) {
Py_DECREF(m);
if ((list_dict = PyObject_GetAttrString(h, "ListDict")) == NULL) {
Py_DECREF(h);
return MOD_ERROR_VAL;
}
Py_DECREF(m);
Py_DECREF(h);
return MOD_SUCCESS_VAL(m);
}

View file

@ -57,8 +57,8 @@ def is_literal (domain):
return domain.startswith(u'[') and domain.endswith(u']')
_remove_quoted = re.compile(ur'\\.').sub
_quotes = re.compile(ur'["\\]')
_remove_quoted = re.compile(r'\\.').sub
_quotes = re.compile(r'["\\]')
def is_missing_quote (addr):
"""Return True iff mail address is not correctly quoted."""
return _quotes.match(_remove_quoted(u"", addr[1:-1]))