linkchecker/linkcheck/HtmlParser/htmlsax.py

# Copyright (C) 2000-2018 Petr Dlouhy
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
HTML parser implemented using Beautiful Soup and html.parser.
"""

from io import BytesIO, StringIO

from bs4 import (BeautifulSoup, CData, Comment, Doctype, ProcessingInstruction,
                 Tag)

from ..containers import ListDict


class Parser(object):
    handler = None
    encoding = None

    def __init__(self, handler):
        self.handler = handler
        self.reset()

    def feed(self, feed_text):
        if not self.html_doc:
            if isinstance(feed_text, bytes):
                self.html_doc = BytesIO()
            else:
                self.html_doc = StringIO()
        self.html_doc.write(feed_text)

    def reset(self):
        self.html_doc = None

    def parse_contents(self, contents):
        for content in contents:
            if isinstance(content, Tag):
                attrs = ListDict()
                for k, v_list in sorted(content.attrs.items()):
                    if not isinstance(v_list, list):
                        v_list = [v_list]
                    for v in v_list:
                        # empty parameters returned by BS4
                        # are sometimes in bytes:
                        if v == b'':
                            v = u''
                        attrs[k] = v
                if content.is_empty_element:
                    self.handler.start_end_element(
                        content.name, attrs, content.text.strip(),
                    )
                else:
                    self.handler.start_element(
                        content.name, attrs, content.text.strip(),
                    )
                    if hasattr(content, 'contents'):  # recursion
                        self.parse_contents(content.contents)
                    if hasattr(self.handler, 'end_element'):
                        self.handler.end_element(content.name)
                if content.comments:
                    for comment in content.comments:
                        if hasattr(self.handler, 'comment'):
                            self.handler.comment(comment)
            elif isinstance(content, Doctype):
                if hasattr(self.handler, 'doctype'):
                    self.handler.doctype(content[7:])
            elif isinstance(content, Comment):
                if hasattr(self.handler, 'comment'):
                    self.handler.comment(content.strip())
            elif isinstance(content, CData):
                if hasattr(self.handler, 'cdata'):
                    self.handler.cdata(content)
            elif isinstance(content, ProcessingInstruction):
                if hasattr(self.handler, 'pi'):
                    self.handler.pi(content.strip("? "))
            else:
                if hasattr(self.handler, 'characters'):
                    self.handler.characters(content)

    def flush(self):
        soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser')
        if hasattr(soup, 'contents'):
            self.parse_contents(soup.contents)
        self.encoding = soup.original_encoding

    def debug(self, text):
        raise NotImplementedError("debug is not implemented")

    def lineno(self):
        # It seems, that getting line number of element is not
        # implemented in BeautifulSoup, so this is faked
        return 0

    def last_lineno(self):
        return 0

    def column(self):
        return 0

    def last_column(self):
        return 0

    def pos(self, text):
        return 0


def parser(handler=None):
    return Parser(handler)