mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-16 22:10:26 +00:00
Remove home-cooked htmlparser and use BeautifulSoup
commit 51a06d8a1e (parent d1844a526e)
19 changed files with 124 additions and 11071 deletions
MANIFEST.in
@@ -7,11 +7,6 @@ include cgi-bin/lc.wsgi cgi-bin/README
 include Makefile
 include cgi-bin/lconline/*.html cgi-bin/lconline/*.de cgi-bin/lconline/*.en
 include cgi-bin/lconline/*.js cgi-bin/lconline/*.css cgi-bin/lconline/*.ico
-include linkcheck/HtmlParser/Makefile
-include linkcheck/HtmlParser/htmllex.l
-include linkcheck/HtmlParser/htmlparse.y
-include linkcheck/HtmlParser/*.h
-include linkcheck/HtmlParser/fixincludes.awk
 include po/*.po po/*.mo po/*.pot po/Makefile
 include doc/*.example doc/*.txt
 include doc/html/*.ico
Makefile
@@ -53,7 +53,6 @@ all:
 clean:
 	-$(PYTHON) setup.py clean --all
 	rm -f $(LAPPNAME)-out.* *-stamp*
-	$(MAKE) -C linkcheck/HtmlParser clean
 	find . -name '*.py[co]' -exec rm -f {} \;
 	find . -name '*.bak' -exec rm -f {} \;
 	find . -depth -name '__pycache__' -exec rm -rf {} \;

@@ -75,9 +74,7 @@ locale:
 
 # to build in the current directory
 localbuild: MANIFEST locale
-	$(MAKE) -C linkcheck/HtmlParser
 	$(PYTHON) setup.py build
-	cp -f build/lib.$(PLATFORM)-$(PYVER)*/linkcheck/HtmlParser/htmlsax*.so linkcheck/HtmlParser
 
 release: distclean releasecheck filescheck
 	$(MAKE) dist sign register upload homepage tag changelog deb
linkcheck/HtmlParser/Makefile (deleted)
@@ -1,29 +0,0 @@
-# This HTML parser needs flex >= 2.5.xx from http://lex.sf.net/ for
-# reentrant bison parser support and uses features of bison >= 3.0.x
-LEX = flex
-YACC = bison
-PYINCLUDE=-I/usr/include/python2.7
-
-all: htmllex.c htmlparse.c
-
-htmlsax.so: htmllex.o htmlparse.o s_util.o
-	gcc -pthread -shared $^ -o htmlsax.so
-
-%.o: %.c
-	gcc -g -std=c99 -O3 -Wall -pedantic -Wstrict-prototypes -fPIC -I. $(PYINCLUDE) -c $< -o $@
-
-htmlparse.h htmlparse.c: htmlparse.y htmlsax.h
-	$(YACC) --output=htmlparse.c htmlparse.y
-
-htmllex.l: htmlparse.h
-
-htmllex.c: htmllex.l htmlsax.h
-	$(LEX) htmllex.l
-	awk -f fixincludes.awk htmllex.c > htmllex.c.fixed; mv -f htmllex.c.fixed htmllex.c
-
-clean:
-	rm -f *.o *.so *.pyc *.pyo *.output
-
-distclean: clean
-	rm -f htmlparse.c htmlparse.h htmllex.c
linkcheck/HtmlParser/__init__.py
@@ -15,64 +15,7 @@
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 """
-Fast HTML parser module written in C with the following features:
-
-- Reentrant
-  As soon as any HTML string data is available, we try to feed it
-  to the HTML parser. This means that the parser has to scan possible
-  incomplete data, recognizing as much as it can. Incomplete trailing
-  data is saved for subsequent calls, or it is just flushed into the
-  output buffer with the flush() function.
-  A reset() brings the parser back to its initial state, throwing away all
-  buffered data.
-
-- Coping with HTML syntax errors
-  The parser recognizes as much as it can and passes the rest
-  of the data as TEXT tokens.
-  The scanner only passes complete recognized HTML syntax elements to
-  the parser. Invalid syntax elements are passed as TEXT. This way we do
-  not need the bison error recovery.
-  Incomplete data is rescanned the next time the parser calls yylex() or
-  when it is being flush()ed.
-
-  The following syntax errors will be recognized correctly:
-
-  - Unquoted attribute values.
-  - Missing beginning quote of attribute values.
-  - Invalid "</...>" end tags in script modus.
-  - Missing ">" in tags.
-  - Invalid characters in tag or attribute names.
-
-  The following syntax errors will not be recognized:
-
-  - Missing end quote of attribute values. On the TODO list.
-  - Unknown HTML tag or attribute names.
-  - Invalid nesting of tags.
-
-  Additionally the parser has the following features:
-
-  - NULL bytes are changed into spaces
-  - <!-- ... --> inside a <script> or <style> are not treated as
-    comments but as DATA
-  - Rewrites all tag and attribute names to lowercase for easier
-    matching.
-
-- Speed
-  The FLEX code is configured to generate a large but fast scanner.
-  The parser ignores forbidden or unnecessary HTML end tags.
-  The parser converts tag and attribute names to lower case for easier
-  matching.
-  The parser quotes all attribute values.
-  Python memory management interface is used.
-
-- Character encoding aware
-  The parser itself is not encoding aware, but output strings are
-  always Python Unicode strings.
-
-- Retain HTML attribute order
-  The parser keeps the order in which HTML tag attributes are parsed.
-  The attributes are stored in a custom dictionary class ListDict which
-  iterates over the dictionary keys in insertion order.
+HTML parser module.
 
 USAGE
 
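The deleted docstring above lists the malformed constructs the C parser tolerated (unquoted attribute values, missing `>` in tags, and so on). The replacement, BeautifulSoup's `html.parser` backend, is similarly lenient. A minimal sketch, using only bs4 and a made-up snippet of broken markup:

```python
from bs4 import BeautifulSoup

# Malformed input of the kinds listed above: an unquoted attribute
# value and an unclosed <p> tag.
broken = '<a href=http://example.com/>link<p>no closing tag'
soup = BeautifulSoup(broken, 'html.parser')

# The unquoted attribute value is still recovered, and the unclosed
# paragraph is still reachable in the tree.
print(soup.a['href'])
print(soup.p.text)
```

This is why the commit can drop the custom error-recovery machinery: the lenient parsing behavior comes for free from the new backend.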
linkcheck/HtmlParser/fixincludes.awk (deleted)
@@ -1,7 +0,0 @@
-# Add htmlsax.h include as first line of file. This is needed to let
-# Python.h be included before any system headers.
-# See also http://docs.python.org/api/includes.html
-BEGIN {
-    print "#include \"htmlsax.h\"";
-}
-{ print; }
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -87,7 +87,6 @@ class HtmlPrettyPrinter (object):
         @type data: string
         @return: None
         """
-        data = data.encode(self.encoding, "ignore")
         self.fd.write("<!--%s-->" % data)
 
     def start_element (self, tag, attrs):

@@ -102,7 +101,7 @@ class HtmlPrettyPrinter (object):
         """
         self._start_element(tag, attrs, ">")
 
-    def start_end_element (self, tag, attrs):
+    def start_end_element (self, tag, attrs, element_text=None):
         """
         Print HTML start-end element.
 

@@ -126,14 +125,11 @@ class HtmlPrettyPrinter (object):
         @type end: string
         @return: None
         """
-        tag = tag.encode(self.encoding, "ignore")
         self.fd.write("<%s" % tag.replace("/", ""))
         for key, val in attrs.items():
-            key = key.encode(self.encoding, "ignore")
             if val is None:
                 self.fd.write(" %s" % key)
             else:
-                val = val.encode(self.encoding, "ignore")
                 self.fd.write(' %s="%s"' % (key, quote_attrval(val)))
         self.fd.write(end)
 

@@ -145,7 +141,6 @@ class HtmlPrettyPrinter (object):
         @type tag: string
         @return: None
         """
-        tag = tag.encode(self.encoding, "ignore")
         self.fd.write("</%s>" % tag)
 
     def doctype (self, data):

@@ -156,7 +151,6 @@ class HtmlPrettyPrinter (object):
         @type data: string
         @return: None
         """
-        data = data.encode(self.encoding, "ignore")
         self.fd.write("<!DOCTYPE%s>" % data)
 
     def pi (self, data):

@@ -167,7 +161,6 @@ class HtmlPrettyPrinter (object):
         @type data: string
         @return: None
         """
-        data = data.encode(self.encoding, "ignore")
         self.fd.write("<?%s?>" % data)
 
     def cdata (self, data):

@@ -178,7 +171,6 @@ class HtmlPrettyPrinter (object):
         @type data: string
         @return: None
         """
-        data = data.encode(self.encoding, "ignore")
         self.fd.write("<![CDATA[%s]]>" % data)
 
     def characters (self, data):

@@ -189,7 +181,6 @@ class HtmlPrettyPrinter (object):
         @type data: string
         @return: None
         """
-        data = data.encode(self.encoding, "ignore")
         self.fd.write(data)
 
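The `_start_element` hunk above writes attribute values through `quote_attrval`, which is not shown in this diff. A hypothetical minimal implementation of what such a helper must at least do (escape ampersands and double quotes so the written `key="value"` pair stays well-formed); this is an illustrative sketch, not linkchecker's actual code:

```python
def quote_attrval(val):
    # Escape '&' first so already-escaped quotes are not double-escaped,
    # then escape the double quote used as the attribute delimiter.
    return val.replace('&', '&amp;').replace('"', '&quot;')

print(quote_attrval('say "hi" & bye'))
```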
File diff suppressed because it is too large
linkcheck/HtmlParser/htmlparse.h (deleted)
@@ -1,74 +0,0 @@
-/* A Bison parser, made by GNU Bison 3.0.4. */
-
-/* Bison interface for Yacc-like parsers in C
-
-   Copyright (C) 1984, 1989-1990, 2000-2015 Free Software Foundation, Inc.
-
-   This program is free software: you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation, either version 3 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
-
-/* As a special exception, you may create a larger work that contains
-   part or all of the Bison parser skeleton and distribute that work
-   under terms of your choice, so long as that work isn't itself a
-   parser generator using the skeleton or a modified version thereof
-   as a parser skeleton.  Alternatively, if you modify or redistribute
-   the parser skeleton itself, you may (at your option) remove this
-   special exception, which will cause the skeleton and the resulting
-   Bison output files to be licensed under the GNU General Public
-   License without this special exception.
-
-   This special exception was added by the Free Software Foundation in
-   version 2.2 of Bison. */
-
-#ifndef YY_YY_HTMLPARSE_H_INCLUDED
-# define YY_YY_HTMLPARSE_H_INCLUDED
-/* Debug traces. */
-#ifndef YYDEBUG
-# define YYDEBUG 1
-#endif
-#if YYDEBUG
-extern int yydebug;
-#endif
-
-/* Token type. */
-#ifndef YYTOKENTYPE
-# define YYTOKENTYPE
-  enum yytokentype
-  {
-    T_WAIT = 258,
-    T_ERROR = 259,
-    T_TEXT = 260,
-    T_ELEMENT_START = 261,
-    T_ELEMENT_START_END = 262,
-    T_ELEMENT_END = 263,
-    T_SCRIPT = 264,
-    T_STYLE = 265,
-    T_PI = 266,
-    T_COMMENT = 267,
-    T_CDATA = 268,
-    T_DOCTYPE = 269
-  };
-#endif
-
-/* Value type. */
-#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
-typedef int YYSTYPE;
-# define YYSTYPE_IS_TRIVIAL 1
-# define YYSTYPE_IS_DECLARED 1
-#endif
-
-
-int yyparse (PyObject* scanner);
-
-#endif /* !YY_YY_HTMLPARSE_H_INCLUDED */
File diff suppressed because it is too large
linkcheck/HtmlParser/htmlsax.h (deleted)
@@ -1,81 +0,0 @@
-/* Copyright (C) 2000-2014 Bastian Kleineidam
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 2 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License along
-   with this program; if not, write to the Free Software Foundation, Inc.,
-   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-*/
-/*
-  Includes header definitions for the HTML Sax parser Python module.
-*/
-#ifndef HTMLSAX_H
-#define HTMLSAX_H
-
-#include "Python.h"
-
-/* require Python >= 2.6 */
-#ifndef PY_VERSION_HEX
-#error please install Python >= 2.6
-#endif
-
-#if PY_VERSION_HEX < 0x02060000
-#error please install Python >= 2.6
-#endif
-
-/* user_data type for SAX calls */
-typedef struct {
-    /* the Python SAX object to issue callbacks */
-    PyObject* handler;
-    /* Buffer to store still-to-be-scanned characters. After recognizing
-     * a complete syntax element, all data up to bufpos will be removed.
-     * Before scanning you should append new data to this buffer.
-     */
-    char* buf;
-    /* current position in the buffer counting from zero */
-    unsigned int bufpos;
-    /* current position of next syntax element */
-    unsigned int nextpos;
-    /* position in the stream of data already seen, counting from zero */
-    unsigned int pos;
-    /* line counter, counting from one */
-    unsigned int lineno;
-    /* column counter, counting from zero */
-    unsigned int column;
-    /* value of line counter before the current token */
-    unsigned int last_lineno;
-    /* value of column counter before the current token */
-    unsigned int last_column;
-    /* input buffer of lexer, must be deleted when the parsing stops */
-    void* lexbuf;
-    /* temporary character buffer */
-    char* tmp_buf;
-    /* temporary HTML start or end tag name */
-    PyObject* tmp_tag;
-    /* temporary HTML start tag attribute name */
-    PyObject* tmp_attrname;
-    /* temporary HTML start tag attribute value */
-    PyObject* tmp_attrval;
-    /* temporary HTML start tag attribute list (a SortedDict) */
-    PyObject* tmp_attrs;
-    /* HtmlParser.resolve_entities */
-    PyObject* resolve_entities;
-    /* HtmlParser.SortedDict */
-    PyObject* list_dict;
-    /* stored Python exception (if error occurred in scanner) */
-    PyObject* exc_type;
-    PyObject* exc_val;
-    PyObject* exc_tb;
-    /* the parser object itself */
-    PyObject* parser;
-} UserData;
-
-#endif
linkcheck/HtmlParser/htmlsax.py (new file, 120 lines)
@@ -0,0 +1,120 @@
+# Copyright (C) 2000-2018 Petr Dlouhy
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+"""
+HTML parser implemented using Beautiful Soup and html.parser.
+"""
+
+from io import BytesIO, StringIO
+
+from bs4 import (BeautifulSoup, CData, Comment, Doctype, ProcessingInstruction,
+                 Tag)
+
+from ..containers import ListDict
+
+
+class Parser(object):
+    handler = None
+    encoding = None
+
+    def __init__(self, handler):
+        self.handler = handler
+        self.reset()
+
+    def feed(self, feed_text):
+        if not self.html_doc:
+            if isinstance(feed_text, bytes):
+                self.html_doc = BytesIO()
+            else:
+                self.html_doc = StringIO()
+        self.html_doc.write(feed_text)
+
+    def reset(self):
+        self.html_doc = None
+
+    def parse_contents(self, contents):
+        for content in contents:
+            if isinstance(content, Tag):
+                attrs = ListDict()
+                for k, v_list in sorted(content.attrs.items()):
+                    if not isinstance(v_list, list):
+                        v_list = [v_list]
+                    for v in v_list:
+                        # empty parameters returned by BS4
+                        # are sometimes in bytes:
+                        if v == b'':
+                            v = u''
+                        attrs[k] = v
+                if content.is_empty_element:
+                    self.handler.start_end_element(
+                        content.name, attrs, content.text.strip(),
+                    )
+                else:
+                    self.handler.start_element(
+                        content.name, attrs, content.text.strip(),
+                    )
+                    if hasattr(content, 'contents'):  # recursion
+                        self.parse_contents(content.contents)
+                    if hasattr(self.handler, 'end_element'):
+                        self.handler.end_element(content.name)
+                if content.comments:
+                    for comment in content.comments:
+                        if hasattr(self.handler, 'comment'):
+                            self.handler.comment(comment)
+            elif isinstance(content, Doctype):
+                if hasattr(self.handler, 'doctype'):
+                    self.handler.doctype(content[7:])
+            elif isinstance(content, Comment):
+                if hasattr(self.handler, 'comment'):
+                    self.handler.comment(content.strip())
+            elif isinstance(content, CData):
+                if hasattr(self.handler, 'cdata'):
+                    self.handler.cdata(content)
+            elif isinstance(content, ProcessingInstruction):
+                if hasattr(self.handler, 'pi'):
+                    self.handler.pi(content.strip("? "))
+            else:
+                if hasattr(self.handler, 'characters'):
+                    self.handler.characters(content)
+
+    def flush(self):
+        soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser')
+        if hasattr(soup, 'contents'):
+            self.parse_contents(soup.contents)
+        self.encoding = soup.original_encoding
+
+    def debug(self, text):
+        raise NotImplementedError("debug is not implemented")
+
+    def lineno(self):
+        # It seems that getting the line number of an element is not
+        # implemented in BeautifulSoup, so this is faked
+        return 0
+
+    def last_lineno(self):
+        return 0
+
+    def column(self):
+        return 0
+
+    def last_column(self):
+        return 0
+
+    def pos(self, text):
+        return 0
+
+
+def parser(handler=None):
+    return Parser(handler)
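The heart of the new file is `parse_contents`: a recursive walk over the BeautifulSoup tree that translates nodes into SAX-style handler callbacks. A self-contained sketch of that traversal, using only bs4 — `walk` is a simplified stand-in for `Parser.parse_contents`, and `CollectingHandler` is a hypothetical handler implementing the callback interface:

```python
from bs4 import BeautifulSoup, Tag

class CollectingHandler:
    """Hypothetical handler mirroring the start/end/characters callbacks."""
    def __init__(self):
        self.tags = []
        self.text = []
    def start_element(self, tag, attrs, element_text=None):
        self.tags.append((tag, dict(attrs)))
    def end_element(self, tag):
        pass
    def characters(self, data):
        self.text.append(data)

def walk(contents, handler):
    # Simplified Parser.parse_contents: Tags become start/end callbacks,
    # everything else becomes character data.
    for node in contents:
        if isinstance(node, Tag):
            handler.start_element(node.name, node.attrs, node.text.strip())
            walk(node.contents, handler)   # recurse into children
            handler.end_element(node.name)
        else:
            handler.characters(str(node))

handler = CollectingHandler()
soup = BeautifulSoup('<p class="intro">Hello <b>world</b></p>', 'html.parser')
walk(soup.contents, handler)
# handler.tags now holds ('p', ...) then ('b', ...) in document order
```

Note the design shift: the old C parser was fed incrementally and emitted events as data arrived, while the new `Parser` buffers everything in `feed` and only builds and walks the tree on `flush`.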
linkcheck/HtmlParser/s_util.c (deleted)
@@ -1,52 +0,0 @@
-/*
- *  linux/lib/string.c
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- */
-#include "Python.h"
-
-#if !defined(HAVE_STRLCPY)
-/**
- * strlcpy - Copy a %NUL terminated string into a sized buffer
- * @dst: Where to copy the string to
- * @src: Where to copy the string from
- * @size: size of destination buffer
- *
- * Compatible with *BSD: the result is always a valid
- * NUL-terminated string that fits in the buffer (unless,
- * of course, the buffer size is zero). It does not pad
- * out the result like strncpy() does.
- */
-size_t strlcpy (char *dst, const char *src, size_t size)
-{
-    size_t ret = strlen(src);
-    if (size > 0) {
-        size_t len = (ret >= size) ? size-1 : ret;
-        Py_MEMCPY(dst, src, len);
-        dst[len] = '\0';
-    }
-    return ret;
-}
-#endif /* !HAVE_STRLCPY */
-
-#if !defined(HAVE_STRLCAT)
-/**
- * strlcat - Append a length-limited, %NUL-terminated string to another
- * @dst: The string to be appended to
- * @src: The string to append to it
- * @size: The size of the destination buffer.
- */
-size_t strlcat (char *dst, const char *src, size_t size)
-{
-    size_t dsize = strlen(dst);
-    size_t len = strlen(src);
-    size_t res = dsize + len;
-
-    dst += dsize;
-    size -= dsize;
-    if (len >= size)
-        len = size-1;
-    Py_MEMCPY(dst, src, len);
-    dst[len] = '\0';
-    return res;
-}
-#endif /* !HAVE_STRLCAT */
linkcheck/HtmlParser/s_util.h (deleted)
@@ -1,12 +0,0 @@
-/*
- *  linux/lib/string.c
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- */
-#if !defined(HAVE_STRLCPY)
-size_t strlcpy(char *dst, const char *src, size_t size);
-#endif /* !HAVE_STRLCPY */
-
-#if !defined(HAVE_STRLCAT)
-size_t strlcat(char *dst, const char *src, size_t size);
-#endif /* !HAVE_STRLCAT */
@@ -115,10 +115,10 @@ class TagFinder (object):
         """Does nothing, override in a subclass."""
         pass
 
-    def start_end_element (self, tag, attrs):
+    def start_end_element (self, tag, attrs, element_text=None):
         """Delegate a combined start/end element (eg. <br/>) to
         the start_element method. Ignore the end element part."""
-        self.start_element(tag, attrs)
+        self.start_element(tag, attrs, element_text)
 
 
 class MetaRobotsFinder (TagFinder):
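The `element_text` parameter added here exists because BeautifulSoup hands the whole subtree's text to the callback at start-element time, and empty (self-closing) elements have no separate end event to attach text to. A small illustration using only bs4 (the markup is made up for the example):

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup('<title>My Page</title><img src="a.png"/>', 'html.parser')
title = soup.find('title')
img = soup.find('img')

# A normal element carries its subtree text...
print(title.text.strip())
# ...while a void element like <img> is flagged empty and has none,
# which is why start_end_element gained the optional element_text argument.
print(img.is_empty_element, repr(img.text))
```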
setup.py
@@ -466,20 +466,6 @@ args = dict(
         'linkcheck.parser',
         'linkcheck.plugins',
     ],
-    ext_modules = [
-        Extension('linkcheck.HtmlParser.htmlsax',
-            sources = [
-                'linkcheck/HtmlParser/htmllex.c',
-                'linkcheck/HtmlParser/htmlparse.c',
-                'linkcheck/HtmlParser/s_util.c',
-            ],
-            extra_compile_args = extra_compile_args,
-            library_dirs = library_dirs,
-            libraries = libraries,
-            define_macros = define_macros + [('YY_NO_INPUT', None)],
-            include_dirs = include_dirs + [normpath("linkcheck/HtmlParser")],
-        ),
-    ],
     scripts = scripts,
     data_files = data_files,
     classifiers = [
@@ -38,7 +38,5 @@ if defined MSSdk (
 
 %PYDIR%\python.exe setup.py sdist --manifest-only
 %PYDIR%\python.exe setup.py build %COMPILER%
-:: copy .pyd files to start linkchecker in local directory
-copy build\lib.%PLATFORM%-%PYVER%\linkcheck\HtmlParser\htmlsax.pyd linkcheck\HtmlParser
 
 :finish
@@ -16,6 +16,5 @@
 @echo off
 set PYDIR=C:\Python27
 %PYDIR%\python.exe setup.py clean --all
-del linkcheck\HtmlParser\htmlsax.pyd
 del doc\html\lccollection.qhc
 del doc\html\lcdoc.qch