2012-11-13 21:28:10 +00:00
|
|
|
import logging
|
2026-03-02 16:25:43 +00:00
|
|
|
from posixpath import splitext
|
|
|
|
|
from urllib.parse import urlparse
|
2012-11-13 21:28:10 +00:00
|
|
|
|
2026-03-02 16:25:43 +00:00
|
|
|
import nh3
|
|
|
|
|
import oembed
|
|
|
|
|
from markdown.inlinepatterns import InlineProcessor
|
|
|
|
|
from xml.etree.ElementTree import Element
|
2012-11-13 21:28:10 +00:00
|
|
|
|
|
|
|
|
# Module-level logger, named after this module via ``__name__``.
LOG = logging.getLogger(__name__)
|
|
|
|
|
|
2026-03-02 16:25:43 +00:00
|
|
|
# Lower-case file extensions (with the leading dot) that mark a URL as a
# plain image; such URLs are excluded from oEmbed processing.
_IMAGE_EXTENSIONS = frozenset(
    (
        ".avif",
        ".bmp",
        ".gif",
        ".ico",
        ".jpeg",
        ".jpg",
        ".png",
        ".svg",
        ".tiff",
        ".webp",
    )
)
|
|
|
|
|
|
|
|
|
|
# Matches Markdown image syntax whose target is an http(s) or
# protocol-relative URL, e.g. ``![alt text](https://example.com/clip)``.
# Group 1 captures the alt text and group 2 captures the URL.
OEMBED_LINK_RE = (
    r"!\[([^\]]*)\]"  # "![alt]" part; group 1 may be empty
    r"\(((?:https?:)?//[^\)]+)\)"  # "(url)" part; the scheme is optional
)
|
|
|
|
|
|
|
|
|
|
# HTML tags that are kept when an oEmbed response is sanitized.
_SANITIZE_TAGS = {
    "a",
    "audio",
    "blockquote",
    "div",
    "figure",
    "iframe",
    "img",
    "p",
    "source",
    "span",
    "video",
}

# Attributes kept per tag when sanitizing; the "*" entry lists attributes
# allowed on every tag.
_SANITIZE_ATTRS = {
    "*": {"class", "style", "title"},
    "iframe": {
        "src",
        "width",
        "height",
        "frameborder",
        "allowfullscreen",
        "allow",
        "referrerpolicy",
        "sandbox",
    },
    "video": {
        "src",
        "width",
        "height",
        "controls",
        "autoplay",
        "loop",
        "muted",
        "poster",
        "preload",
    },
    "audio": {"src", "controls", "autoplay", "loop", "muted", "preload"},
    "source": {"src", "type"},
    "img": {"src", "alt", "width", "height", "loading"},
    "a": {"href", "target"},
}
|
2012-11-13 21:28:10 +00:00
|
|
|
|
|
|
|
|
|
2026-03-02 16:25:43 +00:00
|
|
|
def _is_image_url(url: str) -> bool:
    """Return True when the path component of *url* ends in an image extension.

    Only the URL's path is inspected (query string and fragment are ignored),
    and the extension is compared case-insensitively against
    ``_IMAGE_EXTENSIONS``. Any error raised while inspecting the URL yields
    ``False``.
    """
    try:
        extension = splitext(urlparse(url).path)[1].lower()
        return extension in _IMAGE_EXTENSIONS
    except Exception:
        # Best effort: a URL that cannot be inspected is treated as non-image.
        return False
|
2012-11-13 21:28:10 +00:00
|
|
|
|
|
|
|
|
|
2026-03-02 16:25:43 +00:00
|
|
|
def _sanitize_html(html: str) -> str:
    """Return *html* cleaned with ``nh3`` to prevent XSS from oEmbed markup.

    Only the tags in ``_SANITIZE_TAGS`` and the attributes in
    ``_SANITIZE_ATTRS`` are passed to ``nh3.clean`` as allowed.
    """
    cleaned = nh3.clean(
        html,
        tags=_SANITIZE_TAGS,
        attributes=_SANITIZE_ATTRS,
    )
    return cleaned
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class OEmbedLinkPattern(InlineProcessor):
    """Inline processor that replaces Markdown image links with oEmbed content.

    A match is rejected (``(None, None, None)``) when the URL looks like a
    plain image or when no oEmbed HTML can be obtained for it.
    """

    def __init__(self, pattern, md=None, oembed_consumer=None, wrapper_class="oembed"):
        """Store the consumer and wrapper class used when rendering embeds.

        Args:
            pattern: Regular expression handed to ``InlineProcessor``.
            md: Markdown instance handed to ``InlineProcessor``; its
                ``htmlStash`` is used by :meth:`handleMatch`.
            oembed_consumer: Object whose ``embed(url)`` method returns the
                oEmbed response for a URL.
            wrapper_class: CSS class of the ``<figure>`` wrapped around the
                embed; a falsy value disables the wrapper.
        """
        super().__init__(pattern, md)
        self.consumer = oembed_consumer
        self.wrapper_class = wrapper_class

    def handleMatch(self, m, data):
        """Turn a matched image link into stashed, sanitized oEmbed HTML.

        Args:
            m: Regex match; group 1 is read as the alt text and group 2 as
                the URL.
            data: Text being processed (not used by this method).

        Returns:
            ``(element, start, end)`` where *element* is a ``<span>`` holding
            the HTML-stash placeholder and *start*/*end* delimit the match, or
            ``(None, None, None)`` when the match is not handled here.
        """
        # Imported locally under a distinct name; this module uses ``html``
        # as a parameter/variable name elsewhere.
        from html import escape

        alt = m.group(1)
        url = m.group(2).strip()

        # Skip image URLs — let Markdown's default image handler process them
        if _is_image_url(url):
            return None, None, None

        embed_html = self._get_oembed_html(url, alt)
        if embed_html is None:
            return None, None, None

        embed_html = _sanitize_html(embed_html)
        if self.wrapper_class:
            # The wrapper is added after sanitizing, so the class name must be
            # escaped here to keep the attribute well-formed.
            wrapper = escape(str(self.wrapper_class), quote=True)
            embed_html = f'<figure class="{wrapper}">{embed_html}</figure>'

        # Stash raw HTML so it survives Markdown's escaping; place the
        # placeholder inside an inline element that the tree-processor will
        # later replace with the real HTML.
        placeholder = self.md.htmlStash.store(embed_html)
        el = Element("span")
        el.text = placeholder
        return el, m.start(0), m.end(0)

    def _get_oembed_html(self, url: str, alt: str = "") -> str | None:
        """Fetch oEmbed HTML for a URL, handling different response types.

        Args:
            url: URL passed to the consumer's ``embed`` method.
            alt: Alt text used when an ``<img>`` tag has to be constructed.

        Returns:
            The response's ``html`` field when it is non-empty; otherwise an
            ``<img>`` tag built from the response's ``url`` field; ``None``
            when the lookup fails or the response has neither field.
        """
        try:
            response = self.consumer.embed(url)
        except oembed.OEmbedNoEndpoint:
            LOG.warning("No oEmbed endpoint for URL: %s", url)
            return None
        except Exception:
            # Any other failure is logged with its traceback and treated as
            # "no embed available" rather than aborting the conversion.
            LOG.exception("Error fetching oEmbed for URL: %s", url)
            return None

        # oEmbed 'video' and 'rich' types include an 'html' field
        embed_html = response.get("html")
        if embed_html:
            return embed_html

        # oEmbed 'photo' type — construct an <img> tag
        photo_url = response.get("url")
        if photo_url:
            return self._photo_img_tag(
                photo_url,
                alt,
                response.get("width", ""),
                response.get("height", ""),
            )

        LOG.warning("oEmbed response for %s has no 'html' or 'url' field", url)
        return None

    @staticmethod
    def _photo_img_tag(photo_url, alt, width="", height="") -> str:
        """Build an ``<img>`` tag with every attribute value HTML-escaped.

        ``width`` and ``height`` are left out when they are ``None`` or an
        empty string, so no empty dimension attributes are emitted.
        """
        from html import escape

        attributes = [("src", photo_url), ("alt", alt)]
        for name, value in (("width", width), ("height", height)):
            if value is not None and value != "":
                attributes.append((name, value))

        # Values come from a remote response or from user-written Markdown, so
        # escape quotes and markup characters before interpolating them.
        rendered = " ".join(
            f'{name}="{escape(str(value), quote=True)}"' for name, value in attributes
        )
        return f"<img {rendered} />"
|