Merge pull request #213 from cjmayo/python3_03

{python3_03} Python3: fix HtmlParser
2026-05-26 23:04:00 +00:00 · 2019-04-24 10:55:24 -04:00 · 2019-04-24 10:55:24 -04:00 · 73e57de3fd
commit 73e57de3fd
parent 5d26d2d93e bc99dc51de
5 changed files with 62 additions and 60 deletions
--- a/linkcheck/HtmlParser/htmllex.c
+++ b/linkcheck/HtmlParser/htmllex.c
@ -2658,7 +2658,7 @@ static yyconst flex_int32_t yy_rule_linenum[131] =
    PyObject* pencoding; \
    char* encoding; \
    CHECK_NULL(pencoding = PyObject_GetAttrString(yyextra->parser, "encoding")); \
-    encoding = PyString_AsString(pencoding); \
+    encoding = PyBytes_AsString(pencoding); \
    if (encoding==NULL) { Py_DECREF(pencoding); return T_ERROR; } \
    (a) = PyUnicode_Decode(yyextra->tmp_buf, \
                           (Py_ssize_t)strlen(yyextra->tmp_buf),  \
@ -2704,9 +2704,9 @@ static yyconst flex_int32_t yy_rule_linenum[131] =
 #define SCRIPT_CHECK { \
    PyObject* tagname; \
    CHECK_NULL(tagname = PyUnicode_AsEncodedString(yyextra->tmp_tag, "ascii", "ignore")); \
-    if (strcmp("script", PyString_AsString(tagname))==0) \
+    if (strcmp("script", PyBytes_AsString(tagname))==0) \
 	BEGIN(S_SCRIPT); \
-    else if (strcmp("style", PyString_AsString(tagname))==0) \
+    else if (strcmp("style", PyBytes_AsString(tagname))==0) \
        BEGIN(S_STYLE); \
    else \
 	BEGIN(INITIAL); \
--- a/linkcheck/HtmlParser/htmllex.l
+++ b/linkcheck/HtmlParser/htmllex.l
@ -55,7 +55,7 @@
    PyObject* pencoding; \
    char* encoding; \
    CHECK_NULL(pencoding = PyObject_GetAttrString(yyextra->parser, "encoding")); \
-    encoding = PyString_AsString(pencoding); \
+    encoding = PyBytes_AsString(pencoding); \
    if (encoding==NULL) { Py_DECREF(pencoding); return T_ERROR; } \
    (a) = PyUnicode_Decode(yyextra->tmp_buf, \
                           (Py_ssize_t)strlen(yyextra->tmp_buf),  \
@ -101,9 +101,9 @@
 #define SCRIPT_CHECK { \
    PyObject* tagname; \
    CHECK_NULL(tagname = PyUnicode_AsEncodedString(yyextra->tmp_tag, "ascii", "ignore")); \
-    if (strcmp("script", PyString_AsString(tagname))==0) \
+    if (strcmp("script", PyBytes_AsString(tagname))==0) \
 	BEGIN(S_SCRIPT); \
-    else if (strcmp("style", PyString_AsString(tagname))==0) \
+    else if (strcmp("style", PyBytes_AsString(tagname))==0) \
        BEGIN(S_STYLE); \
    else \
 	BEGIN(INITIAL); \
--- a/linkcheck/HtmlParser/htmlparse.c
+++ b/linkcheck/HtmlParser/htmlparse.c
@ -180,9 +180,9 @@ typedef struct {
    PyObject_HEAD
    /* the handler object */
    PyObject* handler;
-    /* the charset encoding (PyStringObject) */
+    /* the charset encoding (PyBytesObject) */
    PyObject* encoding;
-    /* the document type (PyStringObject) */
+    /* the document type (PyBytesObject) */
    PyObject* doctype;
    UserData* userData;
    void* scanner;
@ -204,11 +204,11 @@ static int html_end_tag (PyObject* ptag, PyObject* parser) {
    int ret = 1;
    pdoctype = PyObject_GetAttrString(parser, "doctype");
    CHECK_ERROR((pdoctype == NULL), finish_html_end_tag);
-    doctype = PyString_AsString(pdoctype);
+    doctype = PyBytes_AsString(pdoctype);
    CHECK_ERROR((doctype == NULL), finish_html_end_tag);
    /* check for HTML (else it's presumably XHTML) */
    if (strcmp(doctype, "HTML") == 0) {
-        char* tag = PyString_AsString(ptag);
+        char* tag = PyBytes_AsString(ptag);
        CHECK_ERROR((tag == NULL), finish_html_end_tag);
        ret = strcmp(tag, "area")!=0 &&
            strcmp(tag, "base")!=0 &&
@ -2002,13 +2002,13 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
        Py_DECREF(self);
        return NULL;
    }
-    self->encoding = PyString_FromString("iso8859-1");
+    self->encoding = PyBytes_FromString("iso8859-1");
    if (self->encoding == NULL) {
        Py_DECREF(self->handler);
        Py_DECREF(self);
        return NULL;
    }
-    self->doctype = PyString_FromString("HTML");
+    self->doctype = PyBytes_FromString("HTML");
    if (self->doctype == NULL) {
        Py_DECREF(self->encoding);
        Py_DECREF(self->handler);
@ -2128,7 +2128,7 @@ static PyObject* parser_flush (parser_object* self, PyObject* args) {
            }
            else ++(self->userData->column);
        }
-        enc = PyString_AsString(self->encoding);
+        enc = PyBytes_AsString(self->encoding);
        s = PyUnicode_Decode(self->userData->buf,
               (Py_ssize_t)strlen(self->userData->buf), enc, "ignore");
        /* reset buffer */
@ -2223,12 +2223,12 @@ static PyObject* parser_peek (parser_object* self, PyObject* args) {
    }
    buflen = strlen(self->userData->buf);
    if (!buflen || self->userData->bufpos >= buflen) {
-        return PyString_FromString("");
+        return PyBytes_FromString("");
    }
    if (self->userData->bufpos + len >= buflen) {
        len = buflen - self->userData->bufpos - 1;
    }
-    return PyString_FromStringAndSize(self->userData->buf + self->userData->bufpos, len);
+    return PyBytes_FromStringAndSize(self->userData->buf + self->userData->bufpos, len);
 }


@ -2309,7 +2309,7 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
        PyErr_SetString(PyExc_TypeError, "Cannot delete encoding");
        return -1;
    }
-    if (!PyString_CheckExact(value)) {
+    if (!PyBytes_CheckExact(value)) {
        PyErr_SetString(PyExc_TypeError, "encoding must be string");
        return -1;
    }
@ -2322,7 +2322,7 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
        if (repr == NULL) {
            return -1;
        }
-        fprintf(stderr, "htmlsax: set encoding to %s\n", PyString_AsString(repr));
+        fprintf(stderr, "htmlsax: set encoding to %s\n", PyBytes_AsString(repr));
        Py_DECREF(repr);
    }
    return 0;
@ -2342,9 +2342,9 @@ static int parser_setdoctype (parser_object* self, PyObject* value, void* closur
        PyErr_SetString(PyExc_TypeError, "Cannot delete doctype");
        return -1;
    }
-    if (!PyString_CheckExact(value)) {
+    if (!PyBytes_CheckExact(value)) {
        PyObject* repr = PyObject_Repr(value);
-        char* cp = PyString_AsString(repr);
+        char* cp = PyBytes_AsString(repr);
        if (NULL == cp)
            return -1;
        PyErr_Format(PyExc_TypeError, "doctype %s must be string", cp);
@ -2460,35 +2460,36 @@ MOD_INIT(htmlsax) {
        /* init error */
        PyErr_Print();
    }
-    if ((m = PyImport_ImportModule("linkcheck.HtmlParser")) == NULL) {
+    PyObject* h = NULL;
+    if ((h = PyImport_ImportModule("linkcheck.HtmlParser")) == NULL) {
        return MOD_ERROR_VAL;
    }
-    if ((resolve_entities = PyObject_GetAttrString(m, "resolve_entities")) == NULL) {
-        Py_DECREF(m);
+    if ((resolve_entities = PyObject_GetAttrString(h, "resolve_entities")) == NULL) {
+        Py_DECREF(h);
        return MOD_ERROR_VAL;
    }
-    if ((set_encoding = PyObject_GetAttrString(m, "set_encoding")) == NULL) {
+    if ((set_encoding = PyObject_GetAttrString(h, "set_encoding")) == NULL) {
        Py_DECREF(resolve_entities);
-        Py_DECREF(m);
+        Py_DECREF(h);
        return MOD_ERROR_VAL;
    }
-    if ((set_doctype = PyObject_GetAttrString(m, "set_doctype")) == NULL) {
+    if ((set_doctype = PyObject_GetAttrString(h, "set_doctype")) == NULL) {
        Py_DECREF(resolve_entities);
        Py_DECREF(set_encoding);
-        Py_DECREF(m);
+        Py_DECREF(h);
        return MOD_ERROR_VAL;
    }
-    Py_DECREF(m);
-    if ((u_meta = PyString_Decode("meta", 4, "ascii", "ignore")) == NULL) {
+    Py_DECREF(h);
+    if ((u_meta = PyUnicode_FromStringAndSize("meta", 4)) == NULL) {
        return MOD_ERROR_VAL;
    }
-    if ((m = PyImport_ImportModule("linkcheck.containers")) == NULL) {
+    if ((h = PyImport_ImportModule("linkcheck.containers")) == NULL) {
        return MOD_ERROR_VAL;
    }
-    if ((list_dict = PyObject_GetAttrString(m, "ListDict")) == NULL) {
-        Py_DECREF(m);
+    if ((list_dict = PyObject_GetAttrString(h, "ListDict")) == NULL) {
+        Py_DECREF(h);
        return MOD_ERROR_VAL;
    }
-    Py_DECREF(m);
+    Py_DECREF(h);
    return MOD_SUCCESS_VAL(m);
 }
--- a/linkcheck/HtmlParser/htmlparse.y
+++ b/linkcheck/HtmlParser/htmlparse.y
@ -115,9 +115,9 @@ typedef struct {
    PyObject_HEAD
    /* the handler object */
    PyObject* handler;
-    /* the charset encoding (PyStringObject) */
+    /* the charset encoding (PyBytesObject) */
    PyObject* encoding;
-    /* the document type (PyStringObject) */
+    /* the document type (PyBytesObject) */
    PyObject* doctype;
    UserData* userData;
    void* scanner;
@ -139,11 +139,11 @@ static int html_end_tag (PyObject* ptag, PyObject* parser) {
    int ret = 1;
    pdoctype = PyObject_GetAttrString(parser, "doctype");
    CHECK_ERROR((pdoctype == NULL), finish_html_end_tag);
-    doctype = PyString_AsString(pdoctype);
+    doctype = PyBytes_AsString(pdoctype);
    CHECK_ERROR((doctype == NULL), finish_html_end_tag);
    /* check for HTML (else it's presumably XHTML) */
    if (strcmp(doctype, "HTML") == 0) {
-        char* tag = PyString_AsString(ptag);
+        char* tag = PyBytes_AsString(ptag);
        CHECK_ERROR((tag == NULL), finish_html_end_tag);
        ret = strcmp(tag, "area")!=0 &&
            strcmp(tag, "base")!=0 &&
@ -530,13 +530,13 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
        Py_DECREF(self);
        return NULL;
    }
-    self->encoding = PyString_FromString("iso8859-1");
+    self->encoding = PyBytes_FromString("iso8859-1");
    if (self->encoding == NULL) {
        Py_DECREF(self->handler);
        Py_DECREF(self);
        return NULL;
    }
-    self->doctype = PyString_FromString("HTML");
+    self->doctype = PyBytes_FromString("HTML");
    if (self->doctype == NULL) {
        Py_DECREF(self->encoding);
        Py_DECREF(self->handler);
@ -656,7 +656,7 @@ static PyObject* parser_flush (parser_object* self, PyObject* args) {
            }
            else ++(self->userData->column);
        }
-        enc = PyString_AsString(self->encoding);
+        enc = PyBytes_AsString(self->encoding);
        s = PyUnicode_Decode(self->userData->buf,
               (Py_ssize_t)strlen(self->userData->buf), enc, "ignore");
        /* reset buffer */
@ -751,12 +751,12 @@ static PyObject* parser_peek (parser_object* self, PyObject* args) {
    }
    buflen = strlen(self->userData->buf);
    if (!buflen || self->userData->bufpos >= buflen) {
-        return PyString_FromString("");
+        return PyBytes_FromString("");
    }
    if (self->userData->bufpos + len >= buflen) {
        len = buflen - self->userData->bufpos - 1;
    }
-    return PyString_FromStringAndSize(self->userData->buf + self->userData->bufpos, len);
+    return PyBytes_FromStringAndSize(self->userData->buf + self->userData->bufpos, len);
 }


@ -837,7 +837,7 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
        PyErr_SetString(PyExc_TypeError, "Cannot delete encoding");
        return -1;
    }
-    if (!PyString_CheckExact(value)) {
+    if (!PyBytes_CheckExact(value)) {
        PyErr_SetString(PyExc_TypeError, "encoding must be string");
        return -1;
    }
@ -850,7 +850,7 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
        if (repr == NULL) {
            return -1;
        }
-        fprintf(stderr, "htmlsax: set encoding to %s\n", PyString_AsString(repr));
+        fprintf(stderr, "htmlsax: set encoding to %s\n", PyBytes_AsString(repr));
        Py_DECREF(repr);
    }
    return 0;
@ -870,9 +870,9 @@ static int parser_setdoctype (parser_object* self, PyObject* value, void* closur
        PyErr_SetString(PyExc_TypeError, "Cannot delete doctype");
        return -1;
    }
-    if (!PyString_CheckExact(value)) {
+    if (!PyBytes_CheckExact(value)) {
        PyObject* repr = PyObject_Repr(value);
-        char* cp = PyString_AsString(repr);
+        char* cp = PyBytes_AsString(repr);
        if (NULL == cp)
            return -1;
        PyErr_Format(PyExc_TypeError, "doctype %s must be string", cp);
@ -988,35 +988,36 @@ MOD_INIT(htmlsax) {
        /* init error */
        PyErr_Print();
    }
-    if ((m = PyImport_ImportModule("linkcheck.HtmlParser")) == NULL) {
+    PyObject* h = NULL;
+    if ((h = PyImport_ImportModule("linkcheck.HtmlParser")) == NULL) {
        return MOD_ERROR_VAL;
    }
-    if ((resolve_entities = PyObject_GetAttrString(m, "resolve_entities")) == NULL) {
-        Py_DECREF(m);
+    if ((resolve_entities = PyObject_GetAttrString(h, "resolve_entities")) == NULL) {
+        Py_DECREF(h);
        return MOD_ERROR_VAL;
    }
-    if ((set_encoding = PyObject_GetAttrString(m, "set_encoding")) == NULL) {
+    if ((set_encoding = PyObject_GetAttrString(h, "set_encoding")) == NULL) {
        Py_DECREF(resolve_entities);
-        Py_DECREF(m);
+        Py_DECREF(h);
        return MOD_ERROR_VAL;
    }
-    if ((set_doctype = PyObject_GetAttrString(m, "set_doctype")) == NULL) {
+    if ((set_doctype = PyObject_GetAttrString(h, "set_doctype")) == NULL) {
        Py_DECREF(resolve_entities);
        Py_DECREF(set_encoding);
-        Py_DECREF(m);
+        Py_DECREF(h);
        return MOD_ERROR_VAL;
    }
-    Py_DECREF(m);
-    if ((u_meta = PyString_Decode("meta", 4, "ascii", "ignore")) == NULL) {
+    Py_DECREF(h);
+    if ((u_meta = PyUnicode_FromStringAndSize("meta", 4)) == NULL) {
        return MOD_ERROR_VAL;
    }
-    if ((m = PyImport_ImportModule("linkcheck.containers")) == NULL) {
+    if ((h = PyImport_ImportModule("linkcheck.containers")) == NULL) {
        return MOD_ERROR_VAL;
    }
-    if ((list_dict = PyObject_GetAttrString(m, "ListDict")) == NULL) {
-        Py_DECREF(m);
+    if ((list_dict = PyObject_GetAttrString(h, "ListDict")) == NULL) {
+        Py_DECREF(h);
        return MOD_ERROR_VAL;
    }
-    Py_DECREF(m);
+    Py_DECREF(h);
    return MOD_SUCCESS_VAL(m);
 }
--- a/linkcheck/checker/mailtourl.py
+++ b/linkcheck/checker/mailtourl.py
@ -57,8 +57,8 @@ def is_literal (domain):
    return domain.startswith(u'[') and domain.endswith(u']')


-_remove_quoted = re.compile(ur'\\.').sub
-_quotes = re.compile(ur'["\\]')
+_remove_quoted = re.compile(r'\\.').sub
+_quotes = re.compile(r'["\\]')
 def is_missing_quote (addr):
    """Return a true value iff mail address is not correctly quoted.

    The first and last character of ``addr`` are dropped, every
    backslash-escaped character (a backslash plus the character after
    it) is removed from the remainder via ``_remove_quoted``, and
    ``_quotes`` is then matched against what is left.

    The value returned is whatever ``_quotes.match`` yields: a match
    object when the remainder begins with a double quote or a
    backslash, otherwise None. Callers should treat the result as a
    truth value rather than a strict bool.
    """
    # NOTE(review): re.match only tests the start of the string; confirm
    # that an unescaped quote later in the address need not be detected.
    return _quotes.match(_remove_quoted(u"", addr[1:-1]))