2026-03-03 13:26:52 +00:00
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
import html as _html
|
2012-11-13 21:28:10 +00:00
|
|
|
import logging
|
2026-03-03 13:26:52 +00:00
|
|
|
from os.path import splitext
|
2026-03-02 16:25:43 +00:00
|
|
|
from urllib.parse import urlparse
|
2026-03-03 13:26:52 +00:00
|
|
|
from xml.etree.ElementTree import Element
|
2012-11-13 21:28:10 +00:00
|
|
|
|
2026-03-03 13:26:52 +00:00
|
|
|
import markdown
|
2026-03-02 16:25:43 +00:00
|
|
|
import nh3
|
|
|
|
|
from markdown.inlinepatterns import InlineProcessor
|
2026-03-03 13:26:52 +00:00
|
|
|
|
|
|
|
|
from mdx_oembed.oembed import OEmbedConsumer, OEmbedNoEndpoint
|
2012-11-13 21:28:10 +00:00
|
|
|
|
|
|
|
|
LOG = logging.getLogger(__name__)


# URLs whose path ends in one of these extensions are treated as plain images
# and are excluded from oEmbed processing.
_IMAGE_EXTENSIONS = frozenset(
    (
        ".avif",
        ".bmp",
        ".gif",
        ".ico",
        ".jpeg",
        ".jpg",
        ".png",
        ".svg",
        ".tiff",
        ".webp",
    )
)


# Matches Markdown image syntax whose target is an absolute (http/https) or
# protocol-relative (//host/...) URL, e.g. ![alt](https://example.com/clip).
# Group 1 captures the alt text, group 2 the URL.
OEMBED_LINK_RE = r"!\[([^\]]*)\]\(((?:https?:)?//[^\)]+)\)"
|
|
|
|
|
|
|
|
|
|
# Allow-lists used when sanitizing HTML returned by oEmbed providers
# (see _sanitize_html): the tags that may appear, and per tag the attributes
# that may appear on it. The "*" entry lists attributes allowed on every tag.
_SANITIZE_TAGS = {
    "a",
    "audio",
    "blockquote",
    "div",
    "figure",
    "iframe",
    "img",
    "p",
    "source",
    "span",
    "video",
}

# NOTE(review): "style" is allowed on every tag, so provider-supplied inline
# CSS passes through unchanged — confirm this is intended.
_SANITIZE_ATTRS = {
    "*": {
        "class",
        "style",
        "title",
    },
    "a": {
        "href",
        "target",
    },
    "audio": {
        "autoplay",
        "controls",
        "loop",
        "muted",
        "preload",
        "src",
    },
    "iframe": {
        "allow",
        "allowfullscreen",
        "frameborder",
        "height",
        "referrerpolicy",
        "sandbox",
        "src",
        "width",
    },
    "img": {
        "alt",
        "height",
        "loading",
        "src",
        "width",
    },
    "source": {
        "src",
        "type",
    },
    "video": {
        "autoplay",
        "controls",
        "height",
        "loop",
        "muted",
        "poster",
        "preload",
        "src",
        "width",
    },
}
|
2012-11-13 21:28:10 +00:00
|
|
|
|
|
|
|
|
|
2026-03-02 16:25:43 +00:00
|
|
|
def _is_image_url(url: str) -> bool:
    """Return ``True`` if the path of *url* ends in a known image extension.

    Only the path component is inspected, so a query string or fragment does
    not affect the result, and the extension is compared case-insensitively
    against ``_IMAGE_EXTENSIONS``.

    Args:
        url: The URL taken from the Markdown image link.

    Returns:
        ``True`` for image URLs; ``False`` otherwise, including for URLs that
        ``urlparse`` refuses to parse.
    """
    try:
        path = urlparse(url).path
    except ValueError:
        # urlparse rejects some malformed netlocs (for example unbalanced
        # IPv6 brackets); such a URL cannot be classified as an image.
        return False

    _, ext = splitext(path)
    return ext.lower() in _IMAGE_EXTENSIONS
|
2012-11-13 21:28:10 +00:00
|
|
|
|
|
|
|
|
|
2026-03-02 16:25:43 +00:00
|
|
|
def _sanitize_html(html: str) -> str:
    """Clean oEmbed-provided markup to guard against XSS.

    The markup is passed through ``nh3.clean`` using the module's
    ``_SANITIZE_TAGS`` and ``_SANITIZE_ATTRS`` allow-lists.

    Args:
        html: Untrusted HTML from an oEmbed response.

    Returns:
        The sanitized HTML string.
    """
    cleaned = nh3.clean(
        html,
        attributes=_SANITIZE_ATTRS,
        tags=_SANITIZE_TAGS,
    )
    return cleaned
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class OEmbedLinkPattern(InlineProcessor):
    """Inline processor that replaces Markdown image links with oEmbed content.

    For each match the URL is resolved through the configured oEmbed
    consumer. When embed HTML is obtained it is sanitized, optionally wrapped
    in a ``<figure>`` element, and stored in the Markdown HTML stash. Image
    URLs and failed lookups are declined so that Markdown's default image
    handling can process the link instead.
    """

    def __init__(
        self,
        pattern: str,
        md: markdown.Markdown | None = None,
        oembed_consumer: OEmbedConsumer | None = None,
        wrapper_class: str = "oembed",
    ) -> None:
        """Create the processor.

        Args:
            pattern: Regular expression handed to ``InlineProcessor``; group 1
                must capture the alt text and group 2 the URL.
            md: Markdown instance handed to ``InlineProcessor``; its
                ``htmlStash`` is used by :meth:`handleMatch`.
            oembed_consumer: Consumer used to resolve URLs. When ``None``,
                every lookup is skipped with a warning.
            wrapper_class: CSS class for the wrapping ``<figure>``. An empty
                value disables the wrapper.
        """
        super().__init__(pattern, md)
        self.consumer = oembed_consumer
        self.wrapper_class = wrapper_class

    def handleMatch(self, m, data):  # noqa: N802
        """Build the replacement element for one matched image link.

        Args:
            m: Regex match; group 1 is the alt text and group 2 the URL.
            data: The text being processed (not used here).

        Returns:
            ``(element, start, end)`` where *element* is a ``<span>`` holding
            the stash placeholder and *start*/*end* delimit the match, or
            ``(None, None, None)`` when the link is not replaced.
        """
        url = m.group(2).strip()
        alt = m.group(1)

        # Skip image URLs — let Markdown's default image handler process them
        if _is_image_url(url):
            return None, None, None

        html = self._get_oembed_html(url, alt)
        if html is None:
            return None, None, None

        html = _sanitize_html(html)
        if self.wrapper_class:
            # The wrapper is added after sanitization, so the class name has
            # to be escaped here to keep the attribute well-formed.
            wrapper = _html.escape(self.wrapper_class, quote=True)
            html = f'<figure class="{wrapper}">{html}</figure>'

        # Stash the raw HTML so it survives Markdown's escaping, and return an
        # inline element carrying the placeholder that stands in for it until
        # the stashed HTML is substituted back.
        placeholder = self.md.htmlStash.store(html)
        el = Element("span")
        el.text = placeholder
        return el, m.start(0), m.end(0)

    def _get_oembed_html(self, url: str, alt: str = "") -> str | None:
        """Fetch oEmbed HTML for a URL, handling different response types.

        Args:
            url: The URL to resolve through the oEmbed consumer.
            alt: Alt text used when an ``<img>`` tag has to be built for a
                photo response.

        Returns:
            The response's ``html`` field when present; otherwise an
            ``<img>`` tag built from the response's ``url`` field; ``None``
            when no consumer is configured, the lookup fails, or the response
            carries neither field. The returned markup is not sanitized here.
        """
        if self.consumer is None:
            LOG.warning("No oEmbed consumer configured")
            return None

        try:
            response = self.consumer.embed(url)
        except OEmbedNoEndpoint:
            LOG.warning("No oEmbed endpoint for URL: %s", url)
            return None
        except Exception:
            # Best-effort boundary: a failed lookup must not abort rendering,
            # so log the traceback and decline the match.
            LOG.exception("Error fetching oEmbed for URL: %s", url)
            return None

        # oEmbed 'video' and 'rich' types include an 'html' field
        html = response.get("html")
        if html:
            return html

        # oEmbed 'photo' type — construct an <img> tag
        photo_url = response.get("url")
        if photo_url:
            attrs = [("src", photo_url), ("alt", alt)]
            for name in ("width", "height"):
                value = response.get(name)
                # Leave out dimensions the provider did not supply rather
                # than emitting width="" or width="None".
                if value is not None and value != "":
                    attrs.append((name, value))
            rendered = "".join(
                f' {name}="{_html.escape(str(value), quote=True)}"'
                for name, value in attrs
            )
            return f"<img{rendered} />"

        LOG.warning("oEmbed response for %s has no 'html' or 'url' field", url)
        return None
|