From bc99dc51de556d5812d7ed2a72d6e98228b44b92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Dlouh=C3=BD?= Date: Thu, 18 Apr 2019 19:35:16 +0100 Subject: [PATCH] Python3: fix HtmlParser --- linkcheck/HtmlParser/htmllex.c | 6 ++-- linkcheck/HtmlParser/htmllex.l | 6 ++-- linkcheck/HtmlParser/htmlparse.c | 53 ++++++++++++++++---------------- linkcheck/HtmlParser/htmlparse.y | 53 ++++++++++++++++---------------- linkcheck/checker/mailtourl.py | 4 +-- 5 files changed, 62 insertions(+), 60 deletions(-) diff --git a/linkcheck/HtmlParser/htmllex.c b/linkcheck/HtmlParser/htmllex.c index 4807c4db..a9caf4ea 100644 --- a/linkcheck/HtmlParser/htmllex.c +++ b/linkcheck/HtmlParser/htmllex.c @@ -2658,7 +2658,7 @@ static yyconst flex_int32_t yy_rule_linenum[131] = PyObject* pencoding; \ char* encoding; \ CHECK_NULL(pencoding = PyObject_GetAttrString(yyextra->parser, "encoding")); \ - encoding = PyString_AsString(pencoding); \ + encoding = PyBytes_AsString(pencoding); \ if (encoding==NULL) { Py_DECREF(pencoding); return T_ERROR; } \ (a) = PyUnicode_Decode(yyextra->tmp_buf, \ (Py_ssize_t)strlen(yyextra->tmp_buf), \ @@ -2704,9 +2704,9 @@ static yyconst flex_int32_t yy_rule_linenum[131] = #define SCRIPT_CHECK { \ PyObject* tagname; \ CHECK_NULL(tagname = PyUnicode_AsEncodedString(yyextra->tmp_tag, "ascii", "ignore")); \ - if (strcmp("script", PyString_AsString(tagname))==0) \ + if (strcmp("script", PyBytes_AsString(tagname))==0) \ BEGIN(S_SCRIPT); \ - else if (strcmp("style", PyString_AsString(tagname))==0) \ + else if (strcmp("style", PyBytes_AsString(tagname))==0) \ BEGIN(S_STYLE); \ else \ BEGIN(INITIAL); \ diff --git a/linkcheck/HtmlParser/htmllex.l b/linkcheck/HtmlParser/htmllex.l index 5698b6a9..1676a60d 100644 --- a/linkcheck/HtmlParser/htmllex.l +++ b/linkcheck/HtmlParser/htmllex.l @@ -55,7 +55,7 @@ PyObject* pencoding; \ char* encoding; \ CHECK_NULL(pencoding = PyObject_GetAttrString(yyextra->parser, "encoding")); \ - encoding = PyString_AsString(pencoding); \ + encoding = PyBytes_AsString(pencoding); \ if (encoding==NULL) { Py_DECREF(pencoding); return T_ERROR; } \ (a) = PyUnicode_Decode(yyextra->tmp_buf, \ (Py_ssize_t)strlen(yyextra->tmp_buf), \ @@ -101,9 +101,9 @@ #define SCRIPT_CHECK { \ PyObject* tagname; \ CHECK_NULL(tagname = PyUnicode_AsEncodedString(yyextra->tmp_tag, "ascii", "ignore")); \ - if (strcmp("script", PyString_AsString(tagname))==0) \ + if (strcmp("script", PyBytes_AsString(tagname))==0) \ BEGIN(S_SCRIPT); \ - else if (strcmp("style", PyString_AsString(tagname))==0) \ + else if (strcmp("style", PyBytes_AsString(tagname))==0) \ BEGIN(S_STYLE); \ else \ BEGIN(INITIAL); \ diff --git a/linkcheck/HtmlParser/htmlparse.c b/linkcheck/HtmlParser/htmlparse.c index 02b94ab8..c9386988 100644 --- a/linkcheck/HtmlParser/htmlparse.c +++ b/linkcheck/HtmlParser/htmlparse.c @@ -180,9 +180,9 @@ typedef struct { PyObject_HEAD /* the handler object */ PyObject* handler; - /* the charset encoding (PyStringObject) */ + /* the charset encoding (PyBytesObject) */ PyObject* encoding; - /* the document type (PyStringObject) */ + /* the document type (PyBytesObject) */ PyObject* doctype; UserData* userData; void* scanner; @@ -204,11 +204,11 @@ static int html_end_tag (PyObject* ptag, PyObject* parser) { int ret = 1; pdoctype = PyObject_GetAttrString(parser, "doctype"); CHECK_ERROR((pdoctype == NULL), finish_html_end_tag); - doctype = PyString_AsString(pdoctype); + doctype = PyBytes_AsString(pdoctype); CHECK_ERROR((doctype == NULL), finish_html_end_tag); /* check for HTML (else it's presumably XHTML) */ if (strcmp(doctype, "HTML") == 0) { - char* tag = PyString_AsString(ptag); + char* tag = PyBytes_AsString(ptag); CHECK_ERROR((tag == NULL), finish_html_end_tag); ret = strcmp(tag, "area")!=0 && strcmp(tag, "base")!=0 && @@ -2002,13 +2002,13 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds) Py_DECREF(self); return NULL; } - self->encoding = PyString_FromString("iso8859-1"); + self->encoding = PyBytes_FromString("iso8859-1"); if (self->encoding == NULL) { Py_DECREF(self->handler); Py_DECREF(self); return NULL; } - self->doctype = PyString_FromString("HTML"); + self->doctype = PyBytes_FromString("HTML"); if (self->doctype == NULL) { Py_DECREF(self->encoding); Py_DECREF(self->handler); @@ -2128,7 +2128,7 @@ static PyObject* parser_flush (parser_object* self, PyObject* args) { } else ++(self->userData->column); } - enc = PyString_AsString(self->encoding); + enc = PyBytes_AsString(self->encoding); s = PyUnicode_Decode(self->userData->buf, (Py_ssize_t)strlen(self->userData->buf), enc, "ignore"); /* reset buffer */ @@ -2223,12 +2223,12 @@ static PyObject* parser_peek (parser_object* self, PyObject* args) { } buflen = strlen(self->userData->buf); if (!buflen || self->userData->bufpos >= buflen) { - return PyString_FromString(""); + return PyBytes_FromString(""); } if (self->userData->bufpos + len >= buflen) { len = buflen - self->userData->bufpos - 1; } - return PyString_FromStringAndSize(self->userData->buf + self->userData->bufpos, len); + return PyBytes_FromStringAndSize(self->userData->buf + self->userData->bufpos, len); } @@ -2309,7 +2309,7 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu PyErr_SetString(PyExc_TypeError, "Cannot delete encoding"); return -1; } - if (!PyString_CheckExact(value)) { + if (!PyBytes_CheckExact(value)) { PyErr_SetString(PyExc_TypeError, "encoding must be string"); return -1; } @@ -2322,7 +2322,7 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu if (repr == NULL) { return -1; } - fprintf(stderr, "htmlsax: set encoding to %s\n", PyString_AsString(repr)); + fprintf(stderr, "htmlsax: set encoding to %s\n", PyBytes_AsString(repr)); Py_DECREF(repr); } return 0; @@ -2342,9 +2342,9 @@ static int parser_setdoctype (parser_object* self, PyObject* value, void* closur PyErr_SetString(PyExc_TypeError, "Cannot delete doctype"); return -1; } - if (!PyString_CheckExact(value)) { + if (!PyBytes_CheckExact(value)) { PyObject* repr = PyObject_Repr(value); - char* cp = PyString_AsString(repr); + char* cp = PyBytes_AsString(repr); if (NULL == cp) return -1; PyErr_Format(PyExc_TypeError, "doctype %s must be string", cp); @@ -2460,35 +2460,36 @@ MOD_INIT(htmlsax) { /* init error */ PyErr_Print(); } - if ((m = PyImport_ImportModule("linkcheck.HtmlParser")) == NULL) { + PyObject* h = NULL; + if ((h = PyImport_ImportModule("linkcheck.HtmlParser")) == NULL) { return MOD_ERROR_VAL; } - if ((resolve_entities = PyObject_GetAttrString(m, "resolve_entities")) == NULL) { - Py_DECREF(m); + if ((resolve_entities = PyObject_GetAttrString(h, "resolve_entities")) == NULL) { + Py_DECREF(h); return MOD_ERROR_VAL; } - if ((set_encoding = PyObject_GetAttrString(m, "set_encoding")) == NULL) { + if ((set_encoding = PyObject_GetAttrString(h, "set_encoding")) == NULL) { Py_DECREF(resolve_entities); - Py_DECREF(m); + Py_DECREF(h); return MOD_ERROR_VAL; } - if ((set_doctype = PyObject_GetAttrString(m, "set_doctype")) == NULL) { + if ((set_doctype = PyObject_GetAttrString(h, "set_doctype")) == NULL) { Py_DECREF(resolve_entities); Py_DECREF(set_encoding); - Py_DECREF(m); + Py_DECREF(h); return MOD_ERROR_VAL; } - Py_DECREF(m); - if ((u_meta = PyString_Decode("meta", 4, "ascii", "ignore")) == NULL) { + Py_DECREF(h); + if ((u_meta = PyUnicode_FromStringAndSize("meta", 4)) == NULL) { return MOD_ERROR_VAL; } - if ((m = PyImport_ImportModule("linkcheck.containers")) == NULL) { + if ((h = PyImport_ImportModule("linkcheck.containers")) == NULL) { return MOD_ERROR_VAL; } - if ((list_dict = PyObject_GetAttrString(m, "ListDict")) == NULL) { - Py_DECREF(m); + if ((list_dict = PyObject_GetAttrString(h, "ListDict")) == NULL) { + Py_DECREF(h); return MOD_ERROR_VAL; } - Py_DECREF(m); + Py_DECREF(h); return MOD_SUCCESS_VAL(m); } diff --git a/linkcheck/HtmlParser/htmlparse.y b/linkcheck/HtmlParser/htmlparse.y index 0f22e10e..4fec7b16 100644 --- a/linkcheck/HtmlParser/htmlparse.y +++ b/linkcheck/HtmlParser/htmlparse.y @@ -115,9 +115,9 @@ typedef struct { PyObject_HEAD /* the handler object */ PyObject* handler; - /* the charset encoding (PyStringObject) */ + /* the charset encoding (PyBytesObject) */ PyObject* encoding; - /* the document type (PyStringObject) */ + /* the document type (PyBytesObject) */ PyObject* doctype; UserData* userData; void* scanner; @@ -139,11 +139,11 @@ static int html_end_tag (PyObject* ptag, PyObject* parser) { int ret = 1; pdoctype = PyObject_GetAttrString(parser, "doctype"); CHECK_ERROR((pdoctype == NULL), finish_html_end_tag); - doctype = PyString_AsString(pdoctype); + doctype = PyBytes_AsString(pdoctype); CHECK_ERROR((doctype == NULL), finish_html_end_tag); /* check for HTML (else it's presumably XHTML) */ if (strcmp(doctype, "HTML") == 0) { - char* tag = PyString_AsString(ptag); + char* tag = PyBytes_AsString(ptag); CHECK_ERROR((tag == NULL), finish_html_end_tag); ret = strcmp(tag, "area")!=0 && strcmp(tag, "base")!=0 && @@ -530,13 +530,13 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds) Py_DECREF(self); return NULL; } - self->encoding = PyString_FromString("iso8859-1"); + self->encoding = PyBytes_FromString("iso8859-1"); if (self->encoding == NULL) { Py_DECREF(self->handler); Py_DECREF(self); return NULL; } - self->doctype = PyString_FromString("HTML"); + self->doctype = PyBytes_FromString("HTML"); if (self->doctype == NULL) { Py_DECREF(self->encoding); Py_DECREF(self->handler); @@ -656,7 +656,7 @@ static PyObject* parser_flush (parser_object* self, PyObject* args) { } else ++(self->userData->column); } - enc = PyString_AsString(self->encoding); + enc = PyBytes_AsString(self->encoding); s = PyUnicode_Decode(self->userData->buf, (Py_ssize_t)strlen(self->userData->buf), enc, "ignore"); /* reset buffer */ @@ -751,12 +751,12 @@ static PyObject* parser_peek (parser_object* self, PyObject* args) { } buflen = strlen(self->userData->buf); if (!buflen || self->userData->bufpos >= buflen) { - return PyString_FromString(""); + return PyBytes_FromString(""); } if (self->userData->bufpos + len >= buflen) { len = buflen - self->userData->bufpos - 1; } - return PyString_FromStringAndSize(self->userData->buf + self->userData->bufpos, len); + return PyBytes_FromStringAndSize(self->userData->buf + self->userData->bufpos, len); } @@ -837,7 +837,7 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu PyErr_SetString(PyExc_TypeError, "Cannot delete encoding"); return -1; } - if (!PyString_CheckExact(value)) { + if (!PyBytes_CheckExact(value)) { PyErr_SetString(PyExc_TypeError, "encoding must be string"); return -1; } @@ -850,7 +850,7 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu if (repr == NULL) { return -1; } - fprintf(stderr, "htmlsax: set encoding to %s\n", PyString_AsString(repr)); + fprintf(stderr, "htmlsax: set encoding to %s\n", PyBytes_AsString(repr)); Py_DECREF(repr); } return 0; @@ -870,9 +870,9 @@ static int parser_setdoctype (parser_object* self, PyObject* value, void* closur PyErr_SetString(PyExc_TypeError, "Cannot delete doctype"); return -1; } - if (!PyString_CheckExact(value)) { + if (!PyBytes_CheckExact(value)) { PyObject* repr = PyObject_Repr(value); - char* cp = PyString_AsString(repr); + char* cp = PyBytes_AsString(repr); if (NULL == cp) return -1; PyErr_Format(PyExc_TypeError, "doctype %s must be string", cp); @@ -988,35 +988,36 @@ MOD_INIT(htmlsax) { /* init error */ PyErr_Print(); } - if ((m = PyImport_ImportModule("linkcheck.HtmlParser")) == NULL) { + PyObject* h = NULL; + if ((h = PyImport_ImportModule("linkcheck.HtmlParser")) == NULL) { return MOD_ERROR_VAL; } - if ((resolve_entities = PyObject_GetAttrString(m, "resolve_entities")) == NULL) { - Py_DECREF(m); + if ((resolve_entities = PyObject_GetAttrString(h, "resolve_entities")) == NULL) { + Py_DECREF(h); return MOD_ERROR_VAL; } - if ((set_encoding = PyObject_GetAttrString(m, "set_encoding")) == NULL) { + if ((set_encoding = PyObject_GetAttrString(h, "set_encoding")) == NULL) { Py_DECREF(resolve_entities); - Py_DECREF(m); + Py_DECREF(h); return MOD_ERROR_VAL; } - if ((set_doctype = PyObject_GetAttrString(m, "set_doctype")) == NULL) { + if ((set_doctype = PyObject_GetAttrString(h, "set_doctype")) == NULL) { Py_DECREF(resolve_entities); Py_DECREF(set_encoding); - Py_DECREF(m); + Py_DECREF(h); return MOD_ERROR_VAL; } - Py_DECREF(m); - if ((u_meta = PyString_Decode("meta", 4, "ascii", "ignore")) == NULL) { + Py_DECREF(h); + if ((u_meta = PyUnicode_FromStringAndSize("meta", 4)) == NULL) { return MOD_ERROR_VAL; } - if ((m = PyImport_ImportModule("linkcheck.containers")) == NULL) { + if ((h = PyImport_ImportModule("linkcheck.containers")) == NULL) { return MOD_ERROR_VAL; } - if ((list_dict = PyObject_GetAttrString(m, "ListDict")) == NULL) { - Py_DECREF(m); + if ((list_dict = PyObject_GetAttrString(h, "ListDict")) == NULL) { + Py_DECREF(h); return MOD_ERROR_VAL; } - Py_DECREF(m); + Py_DECREF(h); return MOD_SUCCESS_VAL(m); } diff --git a/linkcheck/checker/mailtourl.py b/linkcheck/checker/mailtourl.py index 36f56ca4..9d36f32d 100644 --- a/linkcheck/checker/mailtourl.py +++ b/linkcheck/checker/mailtourl.py @@ -57,8 +57,8 @@ def is_literal (domain): return domain.startswith(u'[') and domain.endswith(u']') -_remove_quoted = re.compile(ur'\\.').sub -_quotes = re.compile(ur'["\\]') +_remove_quoted = re.compile(r'\\.').sub +_quotes = re.compile(r'["\\]') def is_missing_quote (addr): """Return True iff mail address is not correctly quoted.""" return _quotes.match(_remove_quoted(u"", addr[1:-1]))