Remove home-cooked htmlparser and use BeautifulSoup

This commit is contained in:
Petr Dlouhý 2019-07-22 19:59:37 +01:00 committed by Chris Mayo
parent d1844a526e
commit 51a06d8a1e
19 changed files with 124 additions and 11071 deletions

View file

@@ -7,11 +7,6 @@ include cgi-bin/lc.wsgi cgi-bin/README
include Makefile
include cgi-bin/lconline/*.html cgi-bin/lconline/*.de cgi-bin/lconline/*.en
include cgi-bin/lconline/*.js cgi-bin/lconline/*.css cgi-bin/lconline/*.ico
include linkcheck/HtmlParser/Makefile
include linkcheck/HtmlParser/htmllex.l
include linkcheck/HtmlParser/htmlparse.y
include linkcheck/HtmlParser/*.h
include linkcheck/HtmlParser/fixincludes.awk
include po/*.po po/*.mo po/*.pot po/Makefile
include doc/*.example doc/*.txt
include doc/html/*.ico

View file

@@ -53,7 +53,6 @@ all:
clean:
-$(PYTHON) setup.py clean --all
rm -f $(LAPPNAME)-out.* *-stamp*
$(MAKE) -C linkcheck/HtmlParser clean
find . -name '*.py[co]' -exec rm -f {} \;
find . -name '*.bak' -exec rm -f {} \;
find . -depth -name '__pycache__' -exec rm -rf {} \;
@@ -75,9 +74,7 @@ locale:
# to build in the current directory
localbuild: MANIFEST locale
$(MAKE) -C linkcheck/HtmlParser
$(PYTHON) setup.py build
cp -f build/lib.$(PLATFORM)-$(PYVER)*/linkcheck/HtmlParser/htmlsax*.so linkcheck/HtmlParser
release: distclean releasecheck filescheck
$(MAKE) dist sign register upload homepage tag changelog deb

View file

@@ -1,29 +0,0 @@
# This HTML parser needs flex >= 2.5.xx from http://lex.sf.net/ for
# reentrant bison parser support and uses features of bison >= 3.0.x
LEX = flex
YACC = bison
PYINCLUDE=-I/usr/include/python2.7
all: htmllex.c htmlparse.c
htmlsax.so: htmllex.o htmlparse.o s_util.o
gcc -pthread -shared $^ -o htmlsax.so
%.o: %.c
gcc -g -std=c99 -O3 -Wall -pedantic -Wstrict-prototypes -fPIC -I. $(PYINCLUDE) -c $< -o $@
htmlparse.h htmlparse.c: htmlparse.y htmlsax.h
$(YACC) --output=htmlparse.c htmlparse.y
htmllex.l: htmlparse.h
htmllex.c: htmllex.l htmlsax.h
$(LEX) htmllex.l
awk -f fixincludes.awk htmllex.c > htmllex.c.fixed; mv -f htmllex.c.fixed htmllex.c
clean:
rm -f *.o *.so *.pyc *.pyo *.output
distclean: clean
rm -f htmlparse.c htmlparse.h htmllex.c

View file

@@ -15,64 +15,7 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Fast HTML parser module written in C with the following features:
- Reentrant
As soon as any HTML string data is available, we try to feed it
to the HTML parser. This means that the parser has to scan possible
incomplete data, recognizing as much as it can. Incomplete trailing
data is saved for subsequent calls, or it is just flushed into the
output buffer with the flush() function.
A reset() brings the parser back to its initial state, throwing away all
buffered data.
- Coping with HTML syntax errors
The parser recognizes as much as it can and passes the rest
of the data as TEXT tokens.
The scanner only passes complete recognized HTML syntax elements to
the parser. Invalid syntax elements are passed as TEXT. This way we do
not need the bison error recovery.
Incomplete data is rescanned the next time the parser calls yylex() or
when it is being flush()ed.
The following syntax errors will be recognized correctly:
- Unquoted attribute values.
- Missing beginning quote of attribute values.
- Invalid "</...>" end tags in script modus.
- Missing ">" in tags.
- Invalid characters in tag or attribute names.
The following syntax errors will not be recognized:
- Missing end quote of attribute values. On the TODO list.
- Unknown HTML tag or attribute names.
- Invalid nesting of tags.
Additionally the parser has the following features:
- NULL bytes are changed into spaces
- <!-- ... --> inside a <script> or <style> are not treated as
comments but as DATA
- Rewrites all tag and attribute names to lowercase for easier
matching.
- Speed
The FLEX code is configured to generate a large but fast scanner.
The parser ignores forbidden or unnecessary HTML end tags.
The parser converts tag and attribute names to lower case for easier
matching.
The parser quotes all attribute values.
Python memory management interface is used.
- Character encoding aware
The parser itself is not encoding aware, but output strings are
always Python Unicode strings.
- Retain HTML attribute order
The parser keeps the order in which HTML tag attributes are parsed.
The attributes are stored in a custom dictionary class ListDict which
iterates over the dictionary keys in insertion order.
HTML parser module.
USAGE

View file

@@ -1,7 +0,0 @@
# Add htmlsax.h include as first line of file. This is needed to let
# Python.h be included before any system headers.
# See also http://docs.python.org/api/includes.html
BEGIN {
print "#include \"htmlsax.h\"";
}
{ print; }

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@@ -87,7 +87,6 @@ class HtmlPrettyPrinter (object):
@type data: string
@return: None
"""
data = data.encode(self.encoding, "ignore")
self.fd.write("<!--%s-->" % data)
def start_element (self, tag, attrs):
@@ -102,7 +101,7 @@ class HtmlPrettyPrinter (object):
"""
self._start_element(tag, attrs, ">")
def start_end_element (self, tag, attrs):
def start_end_element (self, tag, attrs, element_text=None):
"""
Print HTML start-end element.
@@ -126,14 +125,11 @@
@type end: string
@return: None
"""
tag = tag.encode(self.encoding, "ignore")
self.fd.write("<%s" % tag.replace("/", ""))
for key, val in attrs.items():
key = key.encode(self.encoding, "ignore")
if val is None:
self.fd.write(" %s" % key)
else:
val = val.encode(self.encoding, "ignore")
self.fd.write(' %s="%s"' % (key, quote_attrval(val)))
self.fd.write(end)
@@ -145,7 +141,6 @@ class HtmlPrettyPrinter (object):
@type tag: string
@return: None
"""
tag = tag.encode(self.encoding, "ignore")
self.fd.write("</%s>" % tag)
def doctype (self, data):
@@ -156,7 +151,6 @@ class HtmlPrettyPrinter (object):
@type data: string
@return: None
"""
data = data.encode(self.encoding, "ignore")
self.fd.write("<!DOCTYPE%s>" % data)
def pi (self, data):
@@ -167,7 +161,6 @@ class HtmlPrettyPrinter (object):
@type data: string
@return: None
"""
data = data.encode(self.encoding, "ignore")
self.fd.write("<?%s?>" % data)
def cdata (self, data):
@@ -178,7 +171,6 @@ class HtmlPrettyPrinter (object):
@type data: string
@return: None
"""
data = data.encode(self.encoding, "ignore")
self.fd.write("<![CDATA[%s]]>" % data)
def characters (self, data):
@@ -189,7 +181,6 @@ class HtmlPrettyPrinter (object):
@type data: string
@return: None
"""
data = data.encode(self.encoding, "ignore")
self.fd.write(data)

File diff suppressed because it is too large Load diff

View file

@@ -1,74 +0,0 @@
/* A Bison parser, made by GNU Bison 3.0.4. */
/* Bison interface for Yacc-like parsers in C
Copyright (C) 1984, 1989-1990, 2000-2015 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */
/* As a special exception, you may create a larger work that contains
part or all of the Bison parser skeleton and distribute that work
under terms of your choice, so long as that work isn't itself a
parser generator using the skeleton or a modified version thereof
as a parser skeleton. Alternatively, if you modify or redistribute
the parser skeleton itself, you may (at your option) remove this
special exception, which will cause the skeleton and the resulting
Bison output files to be licensed under the GNU General Public
License without this special exception.
This special exception was added by the Free Software Foundation in
version 2.2 of Bison. */
#ifndef YY_YY_HTMLPARSE_H_INCLUDED
# define YY_YY_HTMLPARSE_H_INCLUDED
/* Debug traces. */
#ifndef YYDEBUG
# define YYDEBUG 1
#endif
#if YYDEBUG
extern int yydebug;
#endif
/* Token type. */
#ifndef YYTOKENTYPE
# define YYTOKENTYPE
enum yytokentype
{
T_WAIT = 258,
T_ERROR = 259,
T_TEXT = 260,
T_ELEMENT_START = 261,
T_ELEMENT_START_END = 262,
T_ELEMENT_END = 263,
T_SCRIPT = 264,
T_STYLE = 265,
T_PI = 266,
T_COMMENT = 267,
T_CDATA = 268,
T_DOCTYPE = 269
};
#endif
/* Value type. */
#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
typedef int YYSTYPE;
# define YYSTYPE_IS_TRIVIAL 1
# define YYSTYPE_IS_DECLARED 1
#endif
int yyparse (PyObject* scanner);
#endif /* !YY_YY_HTMLPARSE_H_INCLUDED */

File diff suppressed because it is too large Load diff

View file

@@ -1,81 +0,0 @@
/* Copyright (C) 2000-2014 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
/*
Includes header definitions for the HTML Sax parser Python module.
*/
#ifndef HTMLSAX_H
#define HTMLSAX_H
#include "Python.h"
/* require Python >= 2.6 */
#ifndef PY_VERSION_HEX
#error please install Python >= 2.6
#endif
#if PY_VERSION_HEX < 0x02060000
#error please install Python >= 2.6
#endif
/* user_data type for SAX calls */
typedef struct {
/* the Python SAX object to issue callbacks */
PyObject* handler;
/* Buffer to store still-to-be-scanned characters. After recognizing
* a complete syntax element, all data up to bufpos will be removed.
* Before scanning you should append new data to this buffer.
*/
char* buf;
/* current position in the buffer counting from zero */
unsigned int bufpos;
/* current position of next syntax element */
unsigned int nextpos;
/* position in the stream of data already seen, counting from zero */
unsigned int pos;
/* line counter, counting from one */
unsigned int lineno;
/* column counter, counting from zero */
unsigned int column;
/* value of line counter before the current token */
unsigned int last_lineno;
/* value of column counter before the current token */
unsigned int last_column;
/* input buffer of lexer, must be deleted when the parsing stops */
void* lexbuf;
/* temporary character buffer */
char* tmp_buf;
/* temporary HTML start or end tag name */
PyObject* tmp_tag;
/* temporary HTML start tag attribute name */
PyObject* tmp_attrname;
/* temporary HTML start tag attribute value */
PyObject* tmp_attrval;
/* temporary HTML start tag attribute list (a SortedDict) */
PyObject* tmp_attrs;
/* HtmlParser.resolve_entities */
PyObject* resolve_entities;
/* HtmlParser.SortedDict */
PyObject* list_dict;
/* stored Python exception (if error occurred in scanner) */
PyObject* exc_type;
PyObject* exc_val;
PyObject* exc_tb;
/* the parser object itself */
PyObject* parser;
} UserData;
#endif

View file

@@ -0,0 +1,120 @@
# Copyright (C) 2000-2018 Petr Dlouhy
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
HTML parser implemented using Beautiful Soup and html.parser.
"""
from io import BytesIO, StringIO
from bs4 import (BeautifulSoup, CData, Comment, Doctype, ProcessingInstruction,
Tag)
from ..containers import ListDict
class Parser(object):
handler = None
encoding = None
def __init__(self, handler):
self.handler = handler
self.reset()
def feed(self, feed_text):
if not self.html_doc:
if isinstance(feed_text, bytes):
self.html_doc = BytesIO()
else:
self.html_doc = StringIO()
self.html_doc.write(feed_text)
def reset(self):
self.html_doc = None
def parse_contents(self, contents):
for content in contents:
if isinstance(content, Tag):
attrs = ListDict()
for k, v_list in sorted(content.attrs.items()):
if not isinstance(v_list, list):
v_list = [v_list]
for v in v_list:
# empty parameters returned by BS4
# are sometimes in bytes:
if v == b'':
v = u''
attrs[k] = v
if content.is_empty_element:
self.handler.start_end_element(
content.name, attrs, content.text.strip(),
)
else:
self.handler.start_element(
content.name, attrs, content.text.strip(),
)
if hasattr(content, 'contents'): # recursion
self.parse_contents(content.contents)
if hasattr(self.handler, 'end_element'):
self.handler.end_element(content.name)
if content.comments:
for comment in content.comments:
if hasattr(self.handler, 'comment'):
self.handler.comment(comment)
elif isinstance(content, Doctype):
if hasattr(self.handler, 'doctype'):
self.handler.doctype(content[7:])
elif isinstance(content, Comment):
if hasattr(self.handler, 'comment'):
self.handler.comment(content.strip())
elif isinstance(content, CData):
if hasattr(self.handler, 'cdata'):
self.handler.cdata(content)
elif isinstance(content, ProcessingInstruction):
if hasattr(self.handler, 'pi'):
self.handler.pi(content.strip("? "))
else:
if hasattr(self.handler, 'characters'):
self.handler.characters(content)
def flush(self):
soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser')
if hasattr(soup, 'contents'):
self.parse_contents(soup.contents)
self.encoding = soup.original_encoding
def debug(self, text):
raise NotImplementedError("debug is not implemented")
def lineno(self):
# It seems, that getting line number of element is not
# implemented in BeautifulSoup, so this is faked
return 0
def last_lineno(self):
return 0
def column(self):
return 0
def last_column(self):
return 0
def pos(self, text):
return 0
def parser(handler=None):
return Parser(handler)

View file

@@ -1,52 +0,0 @@
/*
* linux/lib/string.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
#include "Python.h"
#if !defined(HAVE_STRLCPY)
/**
* strlcpy - Copy a %NUL terminated string into a sized buffer
* @dst: Where to copy the string to
* @src: Where to copy the string from
* @size: size of destination buffer
*
* Compatible with *BSD: the result is always a valid
* NUL-terminated string that fits in the buffer (unless,
* of course, the buffer size is zero). It does not pad
* out the result like strncpy() does.
*/
size_t strlcpy (char *dst, const char *src, size_t size)
{
size_t ret = strlen(src);
if (size > 0) {
size_t len = (ret >= size) ? size-1 : ret;
Py_MEMCPY(dst, src, len);
dst[len] = '\0';
}
return ret;
}
#endif /* !HAVE_STRLCPY */
#if !defined(HAVE_STRLCAT)
/**
* strlcat - Append a length-limited, %NUL-terminated string to another
* @dst: The string to be appended to
* @src: The string to append to it
* @size: The size of the destination buffer.
*/
size_t strlcat (char *dst, const char *src, size_t size)
{
size_t dsize = strlen(dst);
size_t len = strlen(src);
size_t res = dsize + len;
dst += dsize;
size -= dsize;
if (len >= size)
len = size-1;
Py_MEMCPY(dst, src, len);
dst[len] = '\0';
return res;
}
#endif /* !HAVE_STRLCAT */

View file

@@ -1,12 +0,0 @@
/*
* linux/lib/string.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
#if !defined(HAVE_STRLCPY)
size_t strlcpy(char *dst, const char *src, size_t size);
#endif /* !HAVE_STRLCPY */
#if !defined(HAVE_STRLCAT)
size_t strlcat(char *dst, const char *src, size_t size);
#endif /* !HAVE_STRLCAT */

View file

@@ -115,10 +115,10 @@ class TagFinder (object):
"""Does nothing, override in a subclass."""
pass
def start_end_element (self, tag, attrs):
def start_end_element (self, tag, attrs, element_text=None):
"""Delegate a combined start/end element (eg. <br/>) to
the start_element method. Ignore the end element part."""
self.start_element(tag, attrs)
self.start_element(tag, attrs, element_text)
class MetaRobotsFinder (TagFinder):

View file

@@ -466,20 +466,6 @@ args = dict(
'linkcheck.parser',
'linkcheck.plugins',
],
ext_modules = [
Extension('linkcheck.HtmlParser.htmlsax',
sources = [
'linkcheck/HtmlParser/htmllex.c',
'linkcheck/HtmlParser/htmlparse.c',
'linkcheck/HtmlParser/s_util.c',
],
extra_compile_args = extra_compile_args,
library_dirs = library_dirs,
libraries = libraries,
define_macros = define_macros + [('YY_NO_INPUT', None)],
include_dirs = include_dirs + [normpath("linkcheck/HtmlParser")],
),
],
scripts = scripts,
data_files = data_files,
classifiers = [

View file

@@ -38,7 +38,5 @@ if defined MSSdk (
%PYDIR%\python.exe setup.py sdist --manifest-only
%PYDIR%\python.exe setup.py build %COMPILER%
:: copy .pyd files to start linkchecker in local directory
copy build\lib.%PLATFORM%-%PYVER%\linkcheck\HtmlParser\htmlsax.pyd linkcheck\HtmlParser
:finish

View file

@@ -16,6 +16,5 @@
@echo off
set PYDIR=C:\Python27
%PYDIR%\python.exe setup.py clean --all
del linkcheck\HtmlParser\htmlsax.pyd
del doc\html\lccollection.qhc
del doc\html\lcdoc.qch