From bc99dc51de556d5812d7ed2a72d6e98228b44b92 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Petr=20Dlouh=C3=BD?= <petr.dlouhy@email.cz>
Date: Thu, 18 Apr 2019 19:35:16 +0100
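Subject: [PATCH] Python3: fix HtmlParser

Replace the Python 2-only PyString_* C API calls in the HtmlParser lexer
and parser with PyBytes_* (and PyUnicode_FromStringAndSize for the
"meta" constant). In MOD_INIT, use a separate variable for the imported
helper modules so that `m` still refers to the extension module when it
is passed to MOD_SUCCESS_VAL(m). Also drop the ur'' string prefixes in
linkcheck/checker/mailtourl.py, which are a syntax error in Python 3.
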
---
 linkcheck/HtmlParser/htmllex.c   |  6 ++--
 linkcheck/HtmlParser/htmllex.l   |  6 ++--
 linkcheck/HtmlParser/htmlparse.c | 53 ++++++++++++++++----------------
 linkcheck/HtmlParser/htmlparse.y | 53 ++++++++++++++++----------------
 linkcheck/checker/mailtourl.py   |  4 +--
 5 files changed, 62 insertions(+), 60 deletions(-)

diff --git a/linkcheck/HtmlParser/htmllex.c b/linkcheck/HtmlParser/htmllex.c
index 4807c4db..a9caf4ea 100644
--- a/linkcheck/HtmlParser/htmllex.c
+++ b/linkcheck/HtmlParser/htmllex.c
@@ -2658,7 +2658,7 @@ static yyconst flex_int32_t yy_rule_linenum[131] =
     PyObject* pencoding; \
     char* encoding; \
     CHECK_NULL(pencoding = PyObject_GetAttrString(yyextra->parser, "encoding")); \
-    encoding = PyString_AsString(pencoding); \
+    encoding = PyBytes_AsString(pencoding); \
     if (encoding==NULL) { Py_DECREF(pencoding); return T_ERROR; } \
     (a) = PyUnicode_Decode(yyextra->tmp_buf, \
                            (Py_ssize_t)strlen(yyextra->tmp_buf),  \
@@ -2704,9 +2704,9 @@ static yyconst flex_int32_t yy_rule_linenum[131] =
 #define SCRIPT_CHECK { \
     PyObject* tagname; \
     CHECK_NULL(tagname = PyUnicode_AsEncodedString(yyextra->tmp_tag, "ascii", "ignore")); \
-    if (strcmp("script", PyString_AsString(tagname))==0) \
+    if (strcmp("script", PyBytes_AsString(tagname))==0) \
 	BEGIN(S_SCRIPT); \
-    else if (strcmp("style", PyString_AsString(tagname))==0) \
+    else if (strcmp("style", PyBytes_AsString(tagname))==0) \
         BEGIN(S_STYLE); \
     else \
 	BEGIN(INITIAL); \
diff --git a/linkcheck/HtmlParser/htmllex.l b/linkcheck/HtmlParser/htmllex.l
index 5698b6a9..1676a60d 100644
--- a/linkcheck/HtmlParser/htmllex.l
+++ b/linkcheck/HtmlParser/htmllex.l
@@ -55,7 +55,7 @@
     PyObject* pencoding; \
     char* encoding; \
     CHECK_NULL(pencoding = PyObject_GetAttrString(yyextra->parser, "encoding")); \
-    encoding = PyString_AsString(pencoding); \
+    encoding = PyBytes_AsString(pencoding); \
     if (encoding==NULL) { Py_DECREF(pencoding); return T_ERROR; } \
     (a) = PyUnicode_Decode(yyextra->tmp_buf, \
                            (Py_ssize_t)strlen(yyextra->tmp_buf),  \
@@ -101,9 +101,9 @@
 #define SCRIPT_CHECK { \
     PyObject* tagname; \
     CHECK_NULL(tagname = PyUnicode_AsEncodedString(yyextra->tmp_tag, "ascii", "ignore")); \
-    if (strcmp("script", PyString_AsString(tagname))==0) \
+    if (strcmp("script", PyBytes_AsString(tagname))==0) \
 	BEGIN(S_SCRIPT); \
-    else if (strcmp("style", PyString_AsString(tagname))==0) \
+    else if (strcmp("style", PyBytes_AsString(tagname))==0) \
         BEGIN(S_STYLE); \
     else \
 	BEGIN(INITIAL); \
diff --git a/linkcheck/HtmlParser/htmlparse.c b/linkcheck/HtmlParser/htmlparse.c
index 02b94ab8..c9386988 100644
--- a/linkcheck/HtmlParser/htmlparse.c
+++ b/linkcheck/HtmlParser/htmlparse.c
@@ -180,9 +180,9 @@ typedef struct {
     PyObject_HEAD
     /* the handler object */
     PyObject* handler;
-    /* the charset encoding (PyStringObject) */
+    /* the charset encoding (PyBytesObject) */
     PyObject* encoding;
-    /* the document type (PyStringObject) */
+    /* the document type (PyBytesObject) */
     PyObject* doctype;
     UserData* userData;
     void* scanner;
@@ -204,11 +204,11 @@ static int html_end_tag (PyObject* ptag, PyObject* parser) {
     int ret = 1;
     pdoctype = PyObject_GetAttrString(parser, "doctype");
     CHECK_ERROR((pdoctype == NULL), finish_html_end_tag);
-    doctype = PyString_AsString(pdoctype);
+    doctype = PyBytes_AsString(pdoctype);
     CHECK_ERROR((doctype == NULL), finish_html_end_tag);
     /* check for HTML (else it's presumably XHTML) */
     if (strcmp(doctype, "HTML") == 0) {
-        char* tag = PyString_AsString(ptag);
+        char* tag = PyBytes_AsString(ptag);
         CHECK_ERROR((tag == NULL), finish_html_end_tag);
         ret = strcmp(tag, "area")!=0 &&
             strcmp(tag, "base")!=0 &&
@@ -2002,13 +2002,13 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
         Py_DECREF(self);
         return NULL;
     }
-    self->encoding = PyString_FromString("iso8859-1");
+    self->encoding = PyBytes_FromString("iso8859-1");
     if (self->encoding == NULL) {
         Py_DECREF(self->handler);
         Py_DECREF(self);
         return NULL;
     }
-    self->doctype = PyString_FromString("HTML");
+    self->doctype = PyBytes_FromString("HTML");
     if (self->doctype == NULL) {
         Py_DECREF(self->encoding);
         Py_DECREF(self->handler);
@@ -2128,7 +2128,7 @@ static PyObject* parser_flush (parser_object* self, PyObject* args) {
             }
             else ++(self->userData->column);
         }
-        enc = PyString_AsString(self->encoding);
+        enc = PyBytes_AsString(self->encoding);
         s = PyUnicode_Decode(self->userData->buf,
                (Py_ssize_t)strlen(self->userData->buf), enc, "ignore");
         /* reset buffer */
@@ -2223,12 +2223,12 @@ static PyObject* parser_peek (parser_object* self, PyObject* args) {
     }
     buflen = strlen(self->userData->buf);
     if (!buflen || self->userData->bufpos >= buflen) {
-        return PyString_FromString("");
+        return PyBytes_FromString("");
     }
     if (self->userData->bufpos + len >= buflen) {
         len = buflen - self->userData->bufpos - 1;
     }
-    return PyString_FromStringAndSize(self->userData->buf + self->userData->bufpos, len);
+    return PyBytes_FromStringAndSize(self->userData->buf + self->userData->bufpos, len);
 }
 
 
@@ -2309,7 +2309,7 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
         PyErr_SetString(PyExc_TypeError, "Cannot delete encoding");
         return -1;
     }
-    if (!PyString_CheckExact(value)) {
+    if (!PyBytes_CheckExact(value)) {
         PyErr_SetString(PyExc_TypeError, "encoding must be string");
         return -1;
     }
@@ -2322,7 +2322,7 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
         if (repr == NULL) {
             return -1;
         }
-        fprintf(stderr, "htmlsax: set encoding to %s\n", PyString_AsString(repr));
+        fprintf(stderr, "htmlsax: set encoding to %s\n", PyBytes_AsString(repr));
         Py_DECREF(repr);
     }
     return 0;
@@ -2342,9 +2342,9 @@ static int parser_setdoctype (parser_object* self, PyObject* value, void* closur
         PyErr_SetString(PyExc_TypeError, "Cannot delete doctype");
         return -1;
     }
-    if (!PyString_CheckExact(value)) {
+    if (!PyBytes_CheckExact(value)) {
         PyObject* repr = PyObject_Repr(value);
-        char* cp = PyString_AsString(repr);
+        char* cp = PyBytes_AsString(repr);
         if (NULL == cp)
             return -1;
         PyErr_Format(PyExc_TypeError, "doctype %s must be string", cp);
@@ -2460,35 +2460,36 @@ MOD_INIT(htmlsax) {
         /* init error */
         PyErr_Print();
     }
-    if ((m = PyImport_ImportModule("linkcheck.HtmlParser")) == NULL) {
+    PyObject* h = NULL;
+    if ((h = PyImport_ImportModule("linkcheck.HtmlParser")) == NULL) {
         return MOD_ERROR_VAL;
     }
-    if ((resolve_entities = PyObject_GetAttrString(m, "resolve_entities")) == NULL) {
-        Py_DECREF(m);
+    if ((resolve_entities = PyObject_GetAttrString(h, "resolve_entities")) == NULL) {
+        Py_DECREF(h);
         return MOD_ERROR_VAL;
     }
-    if ((set_encoding = PyObject_GetAttrString(m, "set_encoding")) == NULL) {
+    if ((set_encoding = PyObject_GetAttrString(h, "set_encoding")) == NULL) {
         Py_DECREF(resolve_entities);
-        Py_DECREF(m);
+        Py_DECREF(h);
         return MOD_ERROR_VAL;
     }
-    if ((set_doctype = PyObject_GetAttrString(m, "set_doctype")) == NULL) {
+    if ((set_doctype = PyObject_GetAttrString(h, "set_doctype")) == NULL) {
         Py_DECREF(resolve_entities);
         Py_DECREF(set_encoding);
-        Py_DECREF(m);
+        Py_DECREF(h);
         return MOD_ERROR_VAL;
     }
-    Py_DECREF(m);
-    if ((u_meta = PyString_Decode("meta", 4, "ascii", "ignore")) == NULL) {
+    Py_DECREF(h);
+    if ((u_meta = PyUnicode_FromStringAndSize("meta", 4)) == NULL) {
         return MOD_ERROR_VAL;
     }
-    if ((m = PyImport_ImportModule("linkcheck.containers")) == NULL) {
+    if ((h = PyImport_ImportModule("linkcheck.containers")) == NULL) {
         return MOD_ERROR_VAL;
     }
-    if ((list_dict = PyObject_GetAttrString(m, "ListDict")) == NULL) {
-        Py_DECREF(m);
+    if ((list_dict = PyObject_GetAttrString(h, "ListDict")) == NULL) {
+        Py_DECREF(h);
         return MOD_ERROR_VAL;
     }
-    Py_DECREF(m);
+    Py_DECREF(h);
     return MOD_SUCCESS_VAL(m);
 }
diff --git a/linkcheck/HtmlParser/htmlparse.y b/linkcheck/HtmlParser/htmlparse.y
index 0f22e10e..4fec7b16 100644
--- a/linkcheck/HtmlParser/htmlparse.y
+++ b/linkcheck/HtmlParser/htmlparse.y
@@ -115,9 +115,9 @@ typedef struct {
     PyObject_HEAD
     /* the handler object */
     PyObject* handler;
-    /* the charset encoding (PyStringObject) */
+    /* the charset encoding (PyBytesObject) */
     PyObject* encoding;
-    /* the document type (PyStringObject) */
+    /* the document type (PyBytesObject) */
     PyObject* doctype;
     UserData* userData;
     void* scanner;
@@ -139,11 +139,11 @@ static int html_end_tag (PyObject* ptag, PyObject* parser) {
     int ret = 1;
     pdoctype = PyObject_GetAttrString(parser, "doctype");
     CHECK_ERROR((pdoctype == NULL), finish_html_end_tag);
-    doctype = PyString_AsString(pdoctype);
+    doctype = PyBytes_AsString(pdoctype);
     CHECK_ERROR((doctype == NULL), finish_html_end_tag);
     /* check for HTML (else it's presumably XHTML) */
     if (strcmp(doctype, "HTML") == 0) {
-        char* tag = PyString_AsString(ptag);
+        char* tag = PyBytes_AsString(ptag);
         CHECK_ERROR((tag == NULL), finish_html_end_tag);
         ret = strcmp(tag, "area")!=0 &&
             strcmp(tag, "base")!=0 &&
@@ -530,13 +530,13 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
         Py_DECREF(self);
         return NULL;
     }
-    self->encoding = PyString_FromString("iso8859-1");
+    self->encoding = PyBytes_FromString("iso8859-1");
     if (self->encoding == NULL) {
         Py_DECREF(self->handler);
         Py_DECREF(self);
         return NULL;
     }
-    self->doctype = PyString_FromString("HTML");
+    self->doctype = PyBytes_FromString("HTML");
     if (self->doctype == NULL) {
         Py_DECREF(self->encoding);
         Py_DECREF(self->handler);
@@ -656,7 +656,7 @@ static PyObject* parser_flush (parser_object* self, PyObject* args) {
             }
             else ++(self->userData->column);
         }
-        enc = PyString_AsString(self->encoding);
+        enc = PyBytes_AsString(self->encoding);
         s = PyUnicode_Decode(self->userData->buf,
                (Py_ssize_t)strlen(self->userData->buf), enc, "ignore");
         /* reset buffer */
@@ -751,12 +751,12 @@ static PyObject* parser_peek (parser_object* self, PyObject* args) {
     }
     buflen = strlen(self->userData->buf);
     if (!buflen || self->userData->bufpos >= buflen) {
-        return PyString_FromString("");
+        return PyBytes_FromString("");
     }
     if (self->userData->bufpos + len >= buflen) {
         len = buflen - self->userData->bufpos - 1;
     }
-    return PyString_FromStringAndSize(self->userData->buf + self->userData->bufpos, len);
+    return PyBytes_FromStringAndSize(self->userData->buf + self->userData->bufpos, len);
 }
 
 
@@ -837,7 +837,7 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
         PyErr_SetString(PyExc_TypeError, "Cannot delete encoding");
         return -1;
     }
-    if (!PyString_CheckExact(value)) {
+    if (!PyBytes_CheckExact(value)) {
         PyErr_SetString(PyExc_TypeError, "encoding must be string");
         return -1;
     }
@@ -850,7 +850,7 @@ static int parser_setencoding (parser_object* self, PyObject* value, void* closu
         if (repr == NULL) {
             return -1;
         }
-        fprintf(stderr, "htmlsax: set encoding to %s\n", PyString_AsString(repr));
+        fprintf(stderr, "htmlsax: set encoding to %s\n", PyBytes_AsString(repr));
         Py_DECREF(repr);
     }
     return 0;
@@ -870,9 +870,9 @@ static int parser_setdoctype (parser_object* self, PyObject* value, void* closur
         PyErr_SetString(PyExc_TypeError, "Cannot delete doctype");
         return -1;
     }
-    if (!PyString_CheckExact(value)) {
+    if (!PyBytes_CheckExact(value)) {
         PyObject* repr = PyObject_Repr(value);
-        char* cp = PyString_AsString(repr);
+        char* cp = PyBytes_AsString(repr);
         if (NULL == cp)
             return -1;
         PyErr_Format(PyExc_TypeError, "doctype %s must be string", cp);
@@ -988,35 +988,36 @@ MOD_INIT(htmlsax) {
         /* init error */
         PyErr_Print();
     }
-    if ((m = PyImport_ImportModule("linkcheck.HtmlParser")) == NULL) {
+    PyObject* h = NULL;
+    if ((h = PyImport_ImportModule("linkcheck.HtmlParser")) == NULL) {
         return MOD_ERROR_VAL;
     }
-    if ((resolve_entities = PyObject_GetAttrString(m, "resolve_entities")) == NULL) {
-        Py_DECREF(m);
+    if ((resolve_entities = PyObject_GetAttrString(h, "resolve_entities")) == NULL) {
+        Py_DECREF(h);
         return MOD_ERROR_VAL;
     }
-    if ((set_encoding = PyObject_GetAttrString(m, "set_encoding")) == NULL) {
+    if ((set_encoding = PyObject_GetAttrString(h, "set_encoding")) == NULL) {
         Py_DECREF(resolve_entities);
-        Py_DECREF(m);
+        Py_DECREF(h);
         return MOD_ERROR_VAL;
     }
-    if ((set_doctype = PyObject_GetAttrString(m, "set_doctype")) == NULL) {
+    if ((set_doctype = PyObject_GetAttrString(h, "set_doctype")) == NULL) {
         Py_DECREF(resolve_entities);
         Py_DECREF(set_encoding);
-        Py_DECREF(m);
+        Py_DECREF(h);
         return MOD_ERROR_VAL;
     }
-    Py_DECREF(m);
-    if ((u_meta = PyString_Decode("meta", 4, "ascii", "ignore")) == NULL) {
+    Py_DECREF(h);
+    if ((u_meta = PyUnicode_FromStringAndSize("meta", 4)) == NULL) {
         return MOD_ERROR_VAL;
     }
-    if ((m = PyImport_ImportModule("linkcheck.containers")) == NULL) {
+    if ((h = PyImport_ImportModule("linkcheck.containers")) == NULL) {
         return MOD_ERROR_VAL;
     }
-    if ((list_dict = PyObject_GetAttrString(m, "ListDict")) == NULL) {
-        Py_DECREF(m);
+    if ((list_dict = PyObject_GetAttrString(h, "ListDict")) == NULL) {
+        Py_DECREF(h);
         return MOD_ERROR_VAL;
     }
-    Py_DECREF(m);
+    Py_DECREF(h);
     return MOD_SUCCESS_VAL(m);
 }
diff --git a/linkcheck/checker/mailtourl.py b/linkcheck/checker/mailtourl.py
index 36f56ca4..9d36f32d 100644
--- a/linkcheck/checker/mailtourl.py
+++ b/linkcheck/checker/mailtourl.py
@@ -57,8 +57,8 @@ def is_literal (domain):
     return domain.startswith(u'[') and domain.endswith(u']')
 
 
-_remove_quoted = re.compile(ur'\\.').sub
-_quotes = re.compile(ur'["\\]')
+_remove_quoted = re.compile(r'\\.').sub
+_quotes = re.compile(r'["\\]')
 def is_missing_quote (addr):
     """Return True iff mail address is not correctly quoted."""
     return _quotes.match(_remove_quoted(u"", addr[1:-1]))