wagtail/wagtail/admin/rich_text/converters/html_to_contentstate.py

import re
from html.parser import HTMLParser

from wagtail.admin.rich_text.converters.contentstate_models import (
    Block, ContentState, Entity, EntityRange, InlineStyleRange)
from wagtail.admin.rich_text.converters.html_ruleset import HTMLRuleset
from wagtail.core.models import Page
from wagtail.core.rich_text import features as feature_registry

# constants to keep track of what to do with leading whitespace on the next text node we encounter
STRIP_WHITESPACE = 0
KEEP_WHITESPACE = 1
FORCE_WHITESPACE = 2

WHITESPACE_RE = re.compile(r'\s+')


class HandlerState:
    def __init__(self):
        self.current_block = None
        self.current_inline_styles = []
        self.current_entity_ranges = []

        # what to do with leading whitespace on the next text node we encounter: strip, keep or force
        self.leading_whitespace = STRIP_WHITESPACE
        self.list_depth = 0
        self.list_item_type = None

        # an atomic block which is NOT preceded by a non-atomic block must have a spacer
        # paragraph inserted before it
        # NB This is not included in pushed/popped state, because after a pop() this
        # should still indicate the status of the most recent block, not the one preceding
        # the corresponding push()
        self.has_preceding_nonatomic_block = False

        self.pushed_states = []

    def push(self):
        self.pushed_states.append({
            'current_block': self.current_block,
            'current_inline_styles': self.current_inline_styles,
            'current_entity_ranges': self.current_entity_ranges,
            'leading_whitespace': self.leading_whitespace,
            'list_depth': self.list_depth,
            'list_item_type': self.list_item_type,
        })

    def pop(self):
        last_state = self.pushed_states.pop()
        self.current_block = last_state['current_block']
        self.current_inline_styles = last_state['current_inline_styles']
        self.current_entity_ranges = last_state['current_entity_ranges']
        self.leading_whitespace = last_state['leading_whitespace']
        self.list_depth = last_state['list_depth']
        self.list_item_type = last_state['list_item_type']


def add_paragraph_block(state, contentstate):
    """
    Utility function for adding an unstyled (paragraph) block to contentstate;
    useful for element handlers that aren't paragraph elements themselves, but need
    to insert paragraphs to ensure correctness
    """
    block = Block('unstyled', depth=state.list_depth)
    contentstate.blocks.append(block)
    state.current_block = block
    state.leading_whitespace = STRIP_WHITESPACE
    state.has_preceding_nonatomic_block = True


class ListElementHandler:
    """ Handler for <ul> / <ol> tags """
    def __init__(self, list_item_type):
        self.list_item_type = list_item_type

    def handle_starttag(self, name, attrs, state, contentstate):
        state.push()

        if state.list_item_type is None:
            # this is not nested in another list => depth remains unchanged
            pass
        else:
            # start the next nesting level
            state.list_depth += 1

        state.list_item_type = self.list_item_type

    def handle_endtag(self, name, state, contentstate):
        state.pop()


class BlockElementHandler:
    def __init__(self, block_type):
        self.block_type = block_type

    def create_block(self, name, attrs, state, contentstate):
        return Block(self.block_type, depth=state.list_depth)

    def handle_starttag(self, name, attrs, state, contentstate):
        attr_dict = dict(attrs)  # convert attrs from list of (name, value) tuples to a dict
        block = self.create_block(name, attr_dict, state, contentstate)
        contentstate.blocks.append(block)
        state.current_block = block
        state.leading_whitespace = STRIP_WHITESPACE
        state.has_preceding_nonatomic_block = True

    def handle_endtag(self, name, state, contentState):
        assert not state.current_inline_styles, "End of block reached without closing inline style elements"
        assert not state.current_entity_ranges, "End of block reached without closing entity elements"
        state.current_block = None


class ListItemElementHandler(BlockElementHandler):
    """ Handler for <li> tag """

    def __init__(self):
        pass  # skip setting self.block_type

    def create_block(self, name, attrs, state, contentstate):
        assert state.list_item_type is not None, "%s element found outside of an enclosing list element" % name
        return Block(state.list_item_type, depth=state.list_depth)


class InlineStyleElementHandler:
    def __init__(self, style):
        self.style = style

    def handle_starttag(self, name, attrs, state, contentstate):
        if state.current_block is None:
            # Inline style element encountered at the top level -
            # start a new paragraph block to contain it
            add_paragraph_block(state, contentstate)

        if state.leading_whitespace == FORCE_WHITESPACE:
            # any pending whitespace should be output before handling this tag,
            # and subsequent whitespace should be collapsed into it (= stripped)
            state.current_block.text += ' '
            state.leading_whitespace = STRIP_WHITESPACE

        inline_style_range = InlineStyleRange(self.style)
        inline_style_range.offset = len(state.current_block.text)
        state.current_block.inline_style_ranges.append(inline_style_range)
        state.current_inline_styles.append(inline_style_range)

    def handle_endtag(self, name, state, contentstate):
        inline_style_range = state.current_inline_styles.pop()
        assert inline_style_range.style == self.style
        inline_style_range.length = len(state.current_block.text) - inline_style_range.offset


class InlineEntityElementHandler:
    """
    Abstract superclass for elements that will be represented as inline entities.
    Subclasses should define a `mutability` property
    """
    def __init__(self, entity_type):
        self.entity_type = entity_type

    def handle_starttag(self, name, attrs, state, contentstate):
        if state.current_block is None:
            # Inline entity element encountered at the top level -
            # start a new paragraph block to contain it
            add_paragraph_block(state, contentstate)

        if state.leading_whitespace == FORCE_WHITESPACE:
            # any pending whitespace should be output before handling this tag,
            # and subsequent whitespace should be collapsed into it (= stripped)
            state.current_block.text += ' '
            state.leading_whitespace = STRIP_WHITESPACE

        # convert attrs from a list of (name, value) tuples to a dict
        # for get_attribute_data to work with
        attrs = dict(attrs)

        entity = Entity(self.entity_type, self.mutability, self.get_attribute_data(attrs))
        key = contentstate.add_entity(entity)

        entity_range = EntityRange(key)
        entity_range.offset = len(state.current_block.text)
        state.current_block.entity_ranges.append(entity_range)
        state.current_entity_ranges.append(entity_range)

    def get_attribute_data(self, attrs):
        """
        Given a dict of attributes found on the source element, return the data dict
        to be associated with the resulting entity
        """
        return {}

    def handle_endtag(self, name, state, contentstate):
        entity_range = state.current_entity_ranges.pop()
        entity_range.length = len(state.current_block.text) - entity_range.offset


class LinkElementHandler(InlineEntityElementHandler):
    mutability = 'MUTABLE'


class ExternalLinkElementHandler(LinkElementHandler):
    def get_attribute_data(self, attrs):
        return {'url': attrs['href']}


class PageLinkElementHandler(LinkElementHandler):
    def get_attribute_data(self, attrs):
        try:
            page = Page.objects.get(id=attrs['id']).specific
        except Page.DoesNotExist:
            # retain ID so that it's still identified as a page link (albeit a broken one)
            return {
                'id': int(attrs['id']),
                'url': None,
                'parentId': None
            }

        parent_page = page.get_parent()

        return {
            'id': page.id,
            'url': page.url,
            'parentId': parent_page.id if parent_page else None,
        }


class AtomicBlockEntityElementHandler:
    """
    Handler for elements like <img> that exist as a single immutable item at the block level
    """
    def handle_starttag(self, name, attrs, state, contentstate):
        # forcibly close any block that illegally contains this one
        state.current_block = None

        if not state.has_preceding_nonatomic_block:
            # if this block is NOT preceded by a non-atomic block,
            # need to insert a spacer paragraph
            add_paragraph_block(state, contentstate)

        attr_dict = dict(attrs)  # convert attrs from list of (name, value) tuples to a dict
        entity = self.create_entity(name, attr_dict, state, contentstate)
        key = contentstate.add_entity(entity)

        block = Block('atomic', depth=state.list_depth)
        contentstate.blocks.append(block)
        block.text = ' '
        entity_range = EntityRange(key)
        entity_range.offset = 0
        entity_range.length = 1
        block.entity_ranges.append(entity_range)
        state.has_preceding_nonatomic_block = False

    def handle_endtag(self, name, state, contentstate):
        pass


class HorizontalRuleHandler(AtomicBlockEntityElementHandler):
    def create_entity(self, name, attrs, state, contentstate):
        return Entity('HORIZONTAL_RULE', 'IMMUTABLE', {})


class LineBreakHandler:
    def handle_starttag(self, name, attrs, state, contentstate):
        if state.current_block is None:
            # ignore line breaks that exist at the top level
            return

        state.current_block.text += '\n'

    def handle_endtag(self, name, state, contentstate):
        pass


class HtmlToContentStateHandler(HTMLParser):
    def __init__(self, features=()):
        self.paragraph_handler = BlockElementHandler('unstyled')
        self.element_handlers = HTMLRuleset({
            'p': self.paragraph_handler,
            'br': LineBreakHandler(),
        })
        for feature in features:
            rule = feature_registry.get_converter_rule('contentstate', feature)
            if rule is not None:
                self.element_handlers.add_rules(rule['from_database_format'])

        super().__init__(convert_charrefs=True)

    def reset(self):
        self.state = HandlerState()
        self.contentstate = ContentState()

        # stack of (name, handler) tuples for the elements we're currently inside
        self.open_elements = []

        super().reset()

    def handle_starttag(self, name, attrs):
        attr_dict = dict(attrs)  # convert attrs from list of (name, value) tuples to a dict
        element_handler = self.element_handlers.match(name, attr_dict)

        if element_handler is None and not self.open_elements:
            # treat unrecognised top-level elements as paragraphs
            element_handler = self.paragraph_handler

        self.open_elements.append((name, element_handler))

        if element_handler:
            element_handler.handle_starttag(name, attrs, self.state, self.contentstate)

    def handle_endtag(self, name):
        if not self.open_elements:
            return  # avoid a pop from an empty list if we have an extra end tag
        expected_name, element_handler = self.open_elements.pop()
        assert name == expected_name, "Unmatched tags: expected %s, got %s" % (expected_name, name)
        if element_handler:
            element_handler.handle_endtag(name, self.state, self.contentstate)

    def handle_data(self, content):
        # normalise whitespace sequences to a single space
        content = re.sub(WHITESPACE_RE, ' ', content)

        if self.state.current_block is None:
            if content == ' ':
                # ignore top-level whitespace
                return
            else:
                # create a new paragraph block for this content
                add_paragraph_block(self.state, self.contentstate)

        if content == ' ':
            # if leading_whitespace = strip, this whitespace node is not significant
            #   and should be skipped.
            # For other cases, _don't_ output the whitespace yet, but set leading_whitespace = force
            # so that a space is forced before the next text node or inline element. If no such node
            # appears (= we reach the end of the block), the whitespace can rightfully be dropped.
            if self.state.leading_whitespace != STRIP_WHITESPACE:
                self.state.leading_whitespace = FORCE_WHITESPACE
        else:
            # strip or add leading whitespace according to the leading_whitespace flag
            if self.state.leading_whitespace == STRIP_WHITESPACE:
                content = content.lstrip()
            elif self.state.leading_whitespace == FORCE_WHITESPACE and not content.startswith(' '):
                content = ' ' + content

            if content.endswith(' '):
                # don't output trailing whitespace yet, because we want to discard it if the end
                # of the block follows. Instead, we'll set leading_whitespace = force so that
                # any following text or inline element will be prefixed by a space
                content = content.rstrip()
                self.state.leading_whitespace = FORCE_WHITESPACE
            else:
                # no trailing whitespace here - any leading whitespace at the start of the
                # next text node should be respected
                self.state.leading_whitespace = KEEP_WHITESPACE

            self.state.current_block.text += content

    def close(self):
        # if content ends in an atomic block (or is empty), need to append a spacer paragraph
        if not self.state.has_preceding_nonatomic_block:
            add_paragraph_block(self.state, self.contentstate)
        super().close()