python-markdown-oembed/mdx_oembed/inlinepatterns.py
Benedikt Willi 9e48edcfdf Refactor and Update python-markdown-oembed-extension
- **Removed legacy build files**:
  - Deleted obsolete `.travis.yml`, `flake.lock`, and `flake.nix` files that were no longer needed for the current build and dependency management setup.

- **Updated versioning**:
  - Incremented the package version from `0.4.0` to `0.5.0` in `version.py` and made adjustments in `pyproject.toml` to reflect the new versioning mechanism.

- **Refined package structure**:
  - Moved source files from `src/python_markdown_oembed_extension` to `mdx_oembed` and renamed references accordingly for better clarity and organization of the codebase.

- **Enhanced OEmbed functionality**:
  - Added dedicated endpoint handling in the new `endpoints.py`.
  - Refactored the `oembed.py` file to implement a minimal oEmbed consumer, replacing the earlier dependency on `python-oembed`.

- **Improved test coverage**:
  - Transitioned tests from `unittest` to `pytest` framework for better maintainability.
  - Expanded unit tests, including better error handling and validation for various media types.

- **Updated dependency requirements**:
  - Raised minimum Python version from `3.9` to `3.12` in `pyproject.toml`.
  - Removed non-essential dependencies and restructured the dependency declarations to streamline package management.

These changes focus on modernizing the codebase, improving adherence to current Python standards, and enhancing overall functionality and maintainability.
2026-03-03 14:26:52 +01:00

135 lines
4.4 KiB
Python

from __future__ import annotations
import html as _html
import logging
from os.path import splitext
from urllib.parse import urlparse
from xml.etree.ElementTree import Element
import markdown
import nh3
from markdown.inlinepatterns import InlineProcessor
from mdx_oembed.oembed import OEmbedConsumer, OEmbedNoEndpoint
LOG = logging.getLogger(__name__)

# File extensions that mark a URL as a plain image. Such URLs are excluded
# from oEmbed processing.
_IMAGE_EXTENSIONS = frozenset(
    (
        ".avif",
        ".bmp",
        ".gif",
        ".ico",
        ".jpeg",
        ".jpg",
        ".png",
        ".svg",
        ".tiff",
        ".webp",
    )
)

# Markdown image syntax whose target is an http(s) or protocol-relative
# ("//host/...") URL, e.g. ![alt](https://...).
# Group 1 captures the alt text, group 2 the URL.
OEMBED_LINK_RE = r"!\[([^\]]*)\]\(((?:https?:)?//[^\)]+)\)"

# Allow-list of HTML tags kept when sanitizing oEmbed responses.
_SANITIZE_TAGS = {
    "a",
    "audio",
    "blockquote",
    "div",
    "figure",
    "iframe",
    "img",
    "p",
    "source",
    "span",
    "video",
}

# Allow-list of attributes per tag; the "*" entry applies to every tag.
_SANITIZE_ATTRS = {
    "*": {"class", "style", "title"},
    "a": {"href", "target"},
    "audio": {
        "autoplay",
        "controls",
        "loop",
        "muted",
        "preload",
        "src",
    },
    "iframe": {
        "allow",
        "allowfullscreen",
        "frameborder",
        "height",
        "referrerpolicy",
        "sandbox",
        "src",
        "width",
    },
    "img": {"alt", "height", "loading", "src", "width"},
    "source": {"src", "type"},
    "video": {
        "autoplay",
        "controls",
        "height",
        "loop",
        "muted",
        "poster",
        "preload",
        "src",
        "width",
    },
}
def _is_image_url(url: str) -> bool:
    """Return True when the path of *url* ends in a known image extension.

    Only the path component is inspected, so a query string or fragment does
    not affect the result, and the extension comparison is case-insensitive.
    A URL that ``urlparse`` rejects is reported as not being an image.
    """
    try:
        path = urlparse(url).path
    except ValueError:
        # urlparse raises ValueError for a malformed netloc (for example
        # unbalanced IPv6 brackets); such a URL cannot be classified.
        return False
    return splitext(path)[1].lower() in _IMAGE_EXTENSIONS
def _sanitize_html(html: str) -> str:
    """Clean oEmbed HTML against the module allow-lists to prevent XSS.

    The markup is passed through ``nh3.clean`` with ``_SANITIZE_TAGS`` as the
    permitted tags and ``_SANITIZE_ATTRS`` as the permitted attributes.
    """
    cleaned = nh3.clean(
        html,
        tags=_SANITIZE_TAGS,
        attributes=_SANITIZE_ATTRS,
    )
    return cleaned
class OEmbedLinkPattern(InlineProcessor):
    """Inline processor that replaces Markdown image links with oEmbed content.

    A matched ``![alt](url)`` is resolved through the configured oEmbed
    consumer. The resulting HTML is sanitized, optionally wrapped in a
    ``<figure>`` element, and stored in Markdown's HTML stash.
    """

    def __init__(
        self,
        pattern: str,
        md: markdown.Markdown | None = None,
        oembed_consumer: OEmbedConsumer | None = None,
        wrapper_class: str = "oembed",
    ) -> None:
        """Set up the processor.

        Args:
            pattern: Regular expression handed to ``InlineProcessor``.
            md: Markdown instance handed to ``InlineProcessor``.
            oembed_consumer: Object whose ``embed(url)`` method is used to
                fetch oEmbed data. With ``None`` no URL is ever embedded.
            wrapper_class: CSS class for the wrapping ``<figure>``. A falsy
                value disables the wrapper.
        """
        super().__init__(pattern, md)
        self.consumer = oembed_consumer
        self.wrapper_class = wrapper_class

    def handleMatch(  # noqa: N802
        self, m, data
    ) -> tuple[Element | None, int | None, int | None]:
        """Turn one matched image link into an oEmbed placeholder element.

        Args:
            m: Regex match; group 1 is the alt text, group 2 the URL.
            data: Text being processed (unused here).

        Returns:
            ``(element, start, end)`` when the URL was embedded, where the
            element is a ``<span>`` holding the HTML-stash placeholder.
            ``(None, None, None)`` when the URL is an image or no oEmbed HTML
            could be obtained.
        """
        url = m.group(2).strip()
        alt = m.group(1)

        # Skip image URLs — let Markdown's default image handler process them
        if _is_image_url(url):
            return None, None, None

        html = self._get_oembed_html(url, alt)
        if html is None:
            return None, None, None

        html = _sanitize_html(html)
        if self.wrapper_class:
            # The wrapper is added after sanitizing, so the class value must
            # be escaped here to keep it from breaking out of the attribute.
            wrapper = _html.escape(str(self.wrapper_class), quote=True)
            html = f'<figure class="{wrapper}">{html}</figure>'

        # Stash raw HTML so it survives Markdown's escaping; the placeholder
        # sits inside an inline element and is swapped for the real HTML
        # later in the Markdown pipeline.
        placeholder = self.md.htmlStash.store(html)
        el = Element("span")
        el.text = placeholder
        return el, m.start(0), m.end(0)

    def _get_oembed_html(self, url: str, alt: str = "") -> str | None:
        """Fetch oEmbed HTML for a URL, handling different response types.

        Args:
            url: URL to resolve through the oEmbed consumer.
            alt: Alt text used when an ``<img>`` tag has to be built.

        Returns:
            The response's ``html`` field if it is non-empty, otherwise an
            ``<img>`` tag built from its ``url`` field. ``None`` when no
            consumer is configured, the lookup fails, or the response has
            neither field. Failures are logged rather than raised.
        """
        if self.consumer is None:
            LOG.warning("No oEmbed consumer configured")
            return None

        try:
            response = self.consumer.embed(url)
        except OEmbedNoEndpoint:
            LOG.warning("No oEmbed endpoint for URL: %s", url)
            return None
        except Exception:
            # Boundary with the provider lookup: log with traceback and fall
            # back to "no embed" instead of aborting the Markdown render.
            LOG.exception("Error fetching oEmbed for URL: %s", url)
            return None

        # oEmbed 'video' and 'rich' types include an 'html' field
        html = response.get("html")
        if html:
            return html

        # oEmbed 'photo' type — construct an <img> tag
        photo_url = response.get("url")
        if photo_url:
            return self._photo_img_html(
                photo_url,
                alt,
                response.get("width"),
                response.get("height"),
            )

        LOG.warning("oEmbed response for %s has no 'html' or 'url' field", url)
        return None

    @staticmethod
    def _photo_img_html(photo_url, alt, width=None, height=None) -> str:
        """Build an escaped ``<img>`` tag for an oEmbed photo response.

        ``width`` and ``height`` are emitted only when they carry a value, so
        a missing or null dimension does not produce ``width=""`` or
        ``width="None"`` in the output.
        """
        attrs = [("src", photo_url), ("alt", alt)]
        attrs += [
            (name, value)
            for name, value in (("width", width), ("height", height))
            if value is not None and value != ""
        ]
        rendered = "".join(
            f' {name}="{_html.escape(str(value), quote=True)}"'
            for name, value in attrs
        )
        return f"<img{rendered} />"