python-markdown-oembed/src/python_markdown_oembed_extension/inlinepatterns.py

import logging
from posixpath import splitext
from urllib.parse import urlparse

import nh3
import oembed
from markdown.inlinepatterns import InlineProcessor
from xml.etree.ElementTree import Element

LOG = logging.getLogger(__name__)

# File extensions treated as plain images; URLs whose path ends in one of
# these are excluded from oEmbed processing.
_IMAGE_EXTENSIONS = frozenset(
    (
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".avif",
        ".webp",
        ".svg",
        ".bmp",
        ".tiff",
        ".ico",
    )
)

# Markdown image syntax whose target starts with http://, https:// or a
# protocol-relative "//".  Group 1 is the alt text, group 2 is the URL.
OEMBED_LINK_RE = r"!\[([^\]]*)\]\(((?:https?:)?//[^\)]+)\)"

# Allow-lists (tags, then per-tag attributes) used when sanitizing the HTML
# returned in oEmbed responses.  The "*" entry applies to every allowed tag.
# NOTE(review): "style" is permitted on all tags - confirm inline CSS coming
# from providers is acceptable.
_SANITIZE_TAGS = {
    "iframe",
    "video",
    "audio",
    "source",
    "img",
    "blockquote",
    "div",
    "p",
    "a",
    "span",
    "figure",
}
_SANITIZE_ATTRS = {
    "*": {"class", "style", "title"},
    "iframe": {
        "src",
        "width",
        "height",
        "frameborder",
        "allowfullscreen",
        "allow",
        "referrerpolicy",
        "sandbox",
    },
    "video": {
        "src",
        "width",
        "height",
        "controls",
        "autoplay",
        "loop",
        "muted",
        "poster",
        "preload",
    },
    "audio": {"src", "controls", "autoplay", "loop", "muted", "preload"},
    "source": {"src", "type"},
    "img": {"src", "alt", "width", "height", "loading"},
    "a": {"href", "target"},
}


def _is_image_url(url: str) -> bool:
    """Return True when the path of *url* ends in a known image extension.

    Only the path component is inspected, so a query string or fragment does
    not affect the result.  The extension comparison is case-insensitive.

    Args:
        url: URL to classify.

    Returns:
        True if the path's extension is in ``_IMAGE_EXTENSIONS``; False
        otherwise, including when *url* cannot be parsed.
    """
    try:
        path = urlparse(url).path
    except ValueError:
        # urlparse rejects malformed netlocs (e.g. unbalanced IPv6 brackets);
        # such a URL is simply not treated as an image.
        return False
    extension = splitext(path)[1]
    return extension.lower() in _IMAGE_EXTENSIONS


def _sanitize_html(html: str) -> str:
    """Return *html* cleaned against the module's allow-lists to prevent XSS.

    Tags outside ``_SANITIZE_TAGS`` and attributes outside ``_SANITIZE_ATTRS``
    are not permitted in the result; the filtering itself is done by
    ``nh3.clean``.
    """
    allow_lists = {"tags": _SANITIZE_TAGS, "attributes": _SANITIZE_ATTRS}
    return nh3.clean(html, **allow_lists)


class OEmbedLinkPattern(InlineProcessor):
    """Inline processor that replaces Markdown image links with oEmbed content.

    For each match whose URL does not look like an image file, the URL is
    looked up through the configured oEmbed consumer.  The resulting markup is
    sanitized, optionally wrapped in a ``<figure>``, and stored in the Markdown
    HTML stash.  Image URLs and failed lookups are declined by returning
    ``(None, None, None)``.
    """

    def __init__(self, pattern, md=None, oembed_consumer=None, wrapper_class="oembed"):
        """Store the oEmbed consumer and the wrapper CSS class.

        Args:
            pattern: Regular expression passed on to ``InlineProcessor``.
            md: Markdown instance passed on to ``InlineProcessor``.
            oembed_consumer: Object whose ``embed(url)`` method performs the
                oEmbed lookup.
            wrapper_class: Value of the ``class`` attribute on the wrapping
                ``<figure>``; a falsy value disables the wrapper.
        """
        super().__init__(pattern, md)
        self.consumer = oembed_consumer
        self.wrapper_class = wrapper_class

    def handleMatch(self, m, data):
        """Turn one ``![alt](url)`` match into stashed oEmbed markup.

        Args:
            m: Regex match; group 1 is the alt text, group 2 the URL.
            data: Text being processed (unused here).

        Returns:
            ``(element, start, end)`` where *element* is a ``<span>`` holding
            the stash placeholder, or ``(None, None, None)`` when the URL is
            an image or no oEmbed markup could be obtained.
        """
        # Function-scope import: only ``escape`` is needed and this keeps the
        # change local to the method.
        from html import escape

        url = m.group(2).strip()
        alt = m.group(1)

        # Skip image URLs - let Markdown's default image handler process them
        if _is_image_url(url):
            return None, None, None

        markup = self._get_oembed_html(url, alt)
        if markup is None:
            return None, None, None

        markup = _sanitize_html(markup)
        if self.wrapper_class:
            # The wrapper is added after sanitization, so the class value has
            # to be escaped here to keep the attribute well-formed.
            css_class = escape(str(self.wrapper_class), quote=True)
            markup = f'<figure class="{css_class}">{markup}</figure>'

        # Stash the raw HTML so it survives Markdown's escaping; the returned
        # inline element carries only the placeholder, which is swapped for
        # the real HTML later in the Markdown pipeline.
        placeholder = self.md.htmlStash.store(markup)
        el = Element("span")
        el.text = placeholder
        return el, m.start(0), m.end(0)

    def _get_oembed_html(self, url: str, alt: str = "") -> str | None:
        """Fetch oEmbed HTML for a URL, handling different response types.

        Args:
            url: URL to look up through ``self.consumer``.
            alt: Alt text from the Markdown source, used only when an
                ``<img>`` tag has to be built for a photo response.

        Returns:
            The response's ``html`` field when present and non-empty;
            otherwise an ``<img>`` tag built from the response's ``url``,
            ``width`` and ``height`` fields; or None when the lookup fails or
            the response has neither field.  Lookup errors are logged and
            never propagate.
        """
        from html import escape

        try:
            response = self.consumer.embed(url)
        except oembed.OEmbedNoEndpoint:
            LOG.warning("No oEmbed endpoint for URL: %s", url)
            return None
        except Exception:
            # Best-effort boundary: any lookup failure falls back to "no embed".
            LOG.exception("Error fetching oEmbed for URL: %s", url)
            return None

        # oEmbed 'video' and 'rich' types include an 'html' field
        embed_markup = response.get("html")
        if embed_markup:
            return embed_markup

        # oEmbed 'photo' type - construct an <img> tag
        photo_url = response.get("url")
        if not photo_url:
            LOG.warning("oEmbed response for %s has no 'html' or 'url' field", url)
            return None

        # These values come from the remote provider.  Escape them so a stray
        # quote cannot close the attribute and inject extra attributes that
        # the sanitizer's allow-list would keep (e.g. style or class).
        src = escape(str(photo_url), quote=True)
        width = escape(str(response.get("width", "")), quote=True)
        height = escape(str(response.get("height", "")), quote=True)
        # The alt text comes from the Markdown source; only the double quote is
        # replaced (as before) so entities written by the author are not
        # double-escaped.
        escaped_alt = alt.replace('"', "&quot;")
        return (
            f'<img src="{src}" alt="{escaped_alt}"'
            f' width="{width}" height="{height}" />'
        )
initial commit Functional beta 2012-11-13 21:28:10 +00:00			`import logging`
Refactor and modernize codebase. 2026-03-02 16:25:43 +00:00			`from posixpath import splitext`
			`from urllib.parse import urlparse`
initial commit Functional beta 2012-11-13 21:28:10 +00:00
Refactor and modernize codebase. 2026-03-02 16:25:43 +00:00			`import nh3`
			`import oembed`
			`from markdown.inlinepatterns import InlineProcessor`
			`from xml.etree.ElementTree import Element`
initial commit Functional beta 2012-11-13 21:28:10 +00:00
			`LOG = logging.getLogger(__name__)`

Refactor and modernize codebase. 2026-03-02 16:25:43 +00:00			`# Image extensions to exclude from oEmbed processing`
			`_IMAGE_EXTENSIONS = frozenset({`
			`".png", ".jpg", ".jpeg", ".gif", ".avif", ".webp",`
			`".svg", ".bmp", ".tiff", ".ico",`
			`})`

			`# Matches Markdown image syntax with an absolute URL: ![alt](https://...)`
			`OEMBED_LINK_RE = r"!\[([^\]]*)\]\(((?:https?:)?//[^\)]+)\)"`

			`# Allowed HTML tags and attributes for sanitizing oEmbed responses`
			`_SANITIZE_TAGS = {"iframe", "video", "audio", "source", "img", "blockquote", "div", "p", "a", "span", "figure"}`
			`_SANITIZE_ATTRS = {`
			`"*": {"class", "style", "title"},`
			`"iframe": {"src", "width", "height", "frameborder", "allowfullscreen", "allow", "referrerpolicy", "sandbox"},`
			`"video": {"src", "width", "height", "controls", "autoplay", "loop", "muted", "poster", "preload"},`
			`"audio": {"src", "controls", "autoplay", "loop", "muted", "preload"},`
			`"source": {"src", "type"},`
			`"img": {"src", "alt", "width", "height", "loading"},`
			`"a": {"href", "target"},`
			`}`
initial commit Functional beta 2012-11-13 21:28:10 +00:00

Refactor and modernize codebase. 2026-03-02 16:25:43 +00:00			`def _is_image_url(url: str) -> bool:`
			`"""Check if a URL points to an image based on its path extension."""`
			`try:`
			`path = urlparse(url).path`
			`_, ext = splitext(path)`
			`return ext.lower() in _IMAGE_EXTENSIONS`
			`except Exception:`
			`return False`
initial commit Functional beta 2012-11-13 21:28:10 +00:00

Refactor and modernize codebase. 2026-03-02 16:25:43 +00:00			`def _sanitize_html(html: str) -> str:`
			`"""Sanitize oEmbed HTML to prevent XSS."""`
			`return nh3.clean(html, tags=_SANITIZE_TAGS, attributes=_SANITIZE_ATTRS)`


			`class OEmbedLinkPattern(InlineProcessor):`
			`"""Inline processor that replaces Markdown image links with oEmbed content."""`

			`def __init__(self, pattern, md=None, oembed_consumer=None, wrapper_class="oembed"):`
			`super().__init__(pattern, md)`
initial commit Functional beta 2012-11-13 21:28:10 +00:00			`self.consumer = oembed_consumer`
Refactor and modernize codebase. 2026-03-02 16:25:43 +00:00			`self.wrapper_class = wrapper_class`

			`def handleMatch(self, m, data):`
			`url = m.group(2).strip()`
			`alt = m.group(1)`
initial commit Functional beta 2012-11-13 21:28:10 +00:00
Refactor and modernize codebase. 2026-03-02 16:25:43 +00:00			`# Skip image URLs — let Markdown's default image handler process them`
			`if _is_image_url(url):`
			`return None, None, None`

			`html = self._get_oembed_html(url, alt)`
improve portability 2012-11-14 18:15:41 +00:00			`if html is None:`
Refactor and modernize codebase. 2026-03-02 16:25:43 +00:00			`return None, None, None`

			`html = _sanitize_html(html)`
			`if self.wrapper_class:`
			`html = f'<figure class="{self.wrapper_class}">{html}</figure>'`

			`# Stash raw HTML so it survives Markdown's escaping; place the`
			`# placeholder inside an inline element that the tree-processor will`
			`# later replace with the real HTML.`
			`placeholder = self.md.htmlStash.store(html)`
			`el = Element("span")`
			`el.text = placeholder`
			`return el, m.start(0), m.end(0)`
improve portability 2012-11-14 18:15:41 +00:00
Refactor and modernize codebase. 2026-03-02 16:25:43 +00:00			`def _get_oembed_html(self, url: str, alt: str = "") -> str \| None:`
			`"""Fetch oEmbed HTML for a URL, handling different response types."""`
allow configuring which endpoints to allow 2012-11-13 23:28:04 +00:00			`try:`
			`response = self.consumer.embed(url)`
			`except oembed.OEmbedNoEndpoint:`
Refactor and modernize codebase. 2026-03-02 16:25:43 +00:00			`LOG.warning("No oEmbed endpoint for URL: %s", url)`
allow configuring which endpoints to allow 2012-11-13 23:28:04 +00:00			`return None`
Refactor and modernize codebase. 2026-03-02 16:25:43 +00:00			`except Exception:`
			`LOG.exception("Error fetching oEmbed for URL: %s", url)`
Update inlinepatterns.py 2020-04-07 12:37:38 +00:00			`return None`
Refactor and modernize codebase. 2026-03-02 16:25:43 +00:00
			`# oEmbed 'video' and 'rich' types include an 'html' field`
			`html = response.get("html")`
			`if html:`
			`return html`

			`# oEmbed 'photo' type — construct an <img> tag`
			`photo_url = response.get("url")`
			`if photo_url:`
			`width = response.get("width", "")`
			`height = response.get("height", "")`
			`escaped_alt = alt.replace('"', "&quot;")`
			`return (`
			`f'<img src="{photo_url}" alt="{escaped_alt}"'`
			`f' width="{width}" height="{height}" />'`
			`)`

			`LOG.warning("oEmbed response for %s has no 'html' or 'url' field", url)`
			`return None`