python-markdown-oembed/mdx_oembed/oembed.py
Benedikt Willi 9e48edcfdf Refactor and Update python-markdown-oembed-extension
- **Removed legacy build files**:
  - Deleted obsolete `.travis.yml`, `flake.lock`, and `flake.nix` files that were no longer needed for the current build and dependency management setup.

- **Updated versioning**:
  - Incremented the package version from `0.4.0` to `0.5.0` in `version.py` and made adjustments in `pyproject.toml` to reflect the new versioning mechanism.

- **Refined package structure**:
  - Moved source files from `src/python_markdown_oembed_extension` to `mdx_oembed` and renamed references accordingly for better clarity and organization of the codebase.

- **Enhanced OEmbed functionality**:
  - Added dedicated endpoint handling in the new `endpoints.py`.
  - Refactored the `oembed.py` file to implement a minimal oEmbed consumer, replacing the earlier dependency on `python-oembed`.

- **Improved test coverage**:
  - Transitioned tests from `unittest` to `pytest` framework for better maintainability.
  - Expanded unit tests, including better error handling and validation for various media types.

- **Updated dependency requirements**:
  - Raised minimum Python version from `3.9` to `3.12` in `pyproject.toml`.
  - Removed non-essential dependencies and restructured the dependency declarations to streamline package management.

These changes focus on modernizing the codebase, improving adherence to current Python standards, and enhancing overall functionality and maintainability.
2026-03-03 14:26:52 +01:00

181 lines
6.1 KiB
Python

"""Minimal oEmbed consumer — replaces the python-oembed dependency.
Implements just the subset used by this extension:
- OEmbedEndpoint: pairs an API URL with URL-glob patterns
- OEmbedConsumer: resolves a URL against registered endpoints and
fetches the oEmbed JSON response
- OEmbedError / OEmbedNoEndpoint: exception hierarchy
"""
from __future__ import annotations
import fnmatch
import json
import logging
import re
import warnings
from typing import Any
from urllib.parse import urlencode
from urllib.request import Request, urlopen
from mdx_oembed.version import __version__
# Names re-exported by ``from mdx_oembed.oembed import *``.
__all__ = [
    "OEmbedEndpoint",
    "OEmbedConsumer",
    "OEmbedError",
    "OEmbedNoEndpoint",
    "REQUEST_TIMEOUT",
]

LOG = logging.getLogger(__name__)

# Seconds to wait on an outbound oEmbed HTTP request unless a consumer
# is constructed with its own timeout.
REQUEST_TIMEOUT = 10

# Sent as the User-Agent header on every oEmbed API request.
_USER_AGENT = f"python-markdown-oembed/{__version__}"

# Matches the literal ``https?://`` shorthand found in oEmbed URL patterns.
# Compiled once at import time instead of on every pattern conversion.
_SCHEME_RE = re.compile(r"https\?://")

# Glob-safe stand-in substituted for the scheme shorthand while a pattern is
# run through ``fnmatch.translate``.
_SCHEME_PLACEHOLDER = "__SCHEME__"
# -- Exceptions -------------------------------------------------------------
class OEmbedError(Exception):
    """Root of the oEmbed exception hierarchy.

    Catch this to handle every error raised by this module.
    """


class OEmbedNoEndpoint(OEmbedError):  # noqa: N818
    """Signals that no registered endpoint's patterns match the URL."""
# -- Endpoint ---------------------------------------------------------------
class OEmbedEndpoint:
    """An oEmbed provider endpoint.

    Pairs a provider's API URL with the URL-glob patterns that describe
    which content URLs the provider can embed.

    Parameters
    ----------
    api_url:
        The provider's oEmbed API URL (e.g. ``https://www.youtube.com/oembed``).
    url_patterns:
        Shell-style glob patterns (with ``https?://`` shorthand) that describe
        which content URLs this endpoint handles.  The ``?`` in ``https?``
        is treated specially: it makes the preceding ``s`` optional so a single
        pattern can match both ``http`` and ``https``.  A single pattern passed
        as a bare string is tolerated and treated as a one-element list.
    """

    def __init__(self, api_url: str, url_patterns: list[str]) -> None:
        # A bare string would be iterated character by character, and a lone
        # "*" character compiles to a regex that matches every URL.
        if isinstance(url_patterns, str):
            url_patterns = [url_patterns]
        self.api_url = api_url
        # Snapshot the patterns so that later mutation of the caller's list
        # cannot drift out of sync with the regexes compiled below.
        self.url_patterns = list(url_patterns)
        self._regexes: list[re.Pattern[str]] = [
            self._compile(p) for p in self.url_patterns
        ]

    def __repr__(self) -> str:
        return f"OEmbedEndpoint({self.api_url!r}, {self.url_patterns!r})"

    # -- internal helpers ----------------------------------------------------

    @staticmethod
    def _compile(pattern: str) -> re.Pattern[str]:
        """Convert a URL-glob pattern to a compiled, case-insensitive regex.

        Handles the ``https?://`` convention used by oEmbed providers: the
        shorthand is swapped for a placeholder *before* the pattern is
        translated via `fnmatch`, then restored as a real regex fragment so
        the ``s`` is optional.  Any other ``?`` in the pattern keeps its glob
        meaning of "exactly one character".
        """
        converted = _SCHEME_RE.sub(_SCHEME_PLACEHOLDER, pattern)
        # fnmatch.translate escapes literal characters, converts the */?/[]
        # globs and anchors the END of the pattern; the start is anchored by
        # matches() using re.Pattern.match().
        regex = fnmatch.translate(converted)
        # Put the scheme alternation back.
        regex = regex.replace(_SCHEME_PLACEHOLDER, r"https?://")
        return re.compile(regex, re.IGNORECASE)

    def matches(self, url: str) -> bool:
        """Return True if *url* matches any of this endpoint's patterns."""
        return any(r.match(url) for r in self._regexes)
# -- Consumer ---------------------------------------------------------------
class OEmbedConsumer:
    """Registry of `OEmbedEndpoint` objects that can resolve arbitrary URLs.

    Endpoints are consulted in registration order; the first one whose
    patterns match a URL is used to fetch that URL's oEmbed data.

    Parameters
    ----------
    timeout:
        HTTP request timeout in seconds.  Defaults to :data:`REQUEST_TIMEOUT`.
    """

    def __init__(self, timeout: int = REQUEST_TIMEOUT) -> None:
        self._endpoints: list[OEmbedEndpoint] = []
        self.timeout = timeout

    def __repr__(self) -> str:
        names = [ep.api_url for ep in self._endpoints]
        return f"OEmbedConsumer(endpoints={names!r})"

    def add_endpoint(self, endpoint: OEmbedEndpoint) -> None:
        """Register an oEmbed endpoint."""
        self._endpoints.append(endpoint)

    def addEndpoint(self, endpoint: OEmbedEndpoint) -> None:  # noqa: N802
        """Deprecated alias for :meth:`add_endpoint`."""
        warnings.warn(
            "addEndpoint() is deprecated, use add_endpoint() instead",
            DeprecationWarning,
            stacklevel=2,
        )
        self.add_endpoint(endpoint)

    def embed(self, url: str) -> dict[str, Any]:
        """Fetch the oEmbed response for *url*.

        Returns the parsed JSON as a ``dict``.

        Raises
        ------
        OEmbedNoEndpoint
            If none of the registered endpoints match *url*.
        OEmbedError
            On HTTP or JSON-parsing failures, or when the response body is
            valid JSON but not a JSON object.
        """
        endpoint = self._find_endpoint(url)
        if endpoint is None:
            raise OEmbedNoEndpoint(f"No oEmbed endpoint registered for {url}")
        return self._fetch(endpoint, url)

    # -- internal helpers ----------------------------------------------------

    def _find_endpoint(self, url: str) -> OEmbedEndpoint | None:
        """Return the first registered endpoint matching *url*, else ``None``."""
        return next((ep for ep in self._endpoints if ep.matches(url)), None)

    @staticmethod
    def _build_api_url(api_url: str, content_url: str) -> str:
        """Append the oEmbed ``url``/``format`` query parameters to *api_url*.

        Chooses the joining character from what *api_url* already contains,
        so an endpoint URL that carries its own query string (for example an
        API key) still yields a well-formed request URL.
        """
        params = urlencode({"url": content_url, "format": "json"})
        if "?" not in api_url:
            separator = "?"
        elif api_url.endswith(("?", "&")):
            # The query string is already open; nothing to insert.
            separator = ""
        else:
            separator = "&"
        return f"{api_url}{separator}{params}"

    def _fetch(self, endpoint: OEmbedEndpoint, content_url: str) -> dict[str, Any]:
        """Request *content_url*'s oEmbed data from *endpoint* and parse it.

        Raises `OEmbedError` for a non-2xx status, for any failure while
        opening, reading, decoding or parsing the response (the original
        exception is chained as the cause), and for a JSON payload that is
        not an object.
        """
        api_url = self._build_api_url(endpoint.api_url, content_url)
        request = Request(api_url, headers={  # noqa: S310
            "Accept": "application/json",
            "User-Agent": _USER_AGENT,
        })
        LOG.debug("Fetching oEmbed: %s", api_url)
        try:
            with urlopen(request, timeout=self.timeout) as resp:  # noqa: S310
                if resp.status is not None and not (200 <= resp.status < 300):
                    raise OEmbedError(
                        f"oEmbed request for {content_url} returned HTTP {resp.status}"
                    )
                charset = resp.headers.get_content_charset() or "utf-8"
                payload: Any = json.loads(resp.read().decode(charset))
        except OEmbedError:
            # Already in the module's own hierarchy; do not re-wrap.
            raise
        except Exception as exc:
            raise OEmbedError(
                f"Failed to fetch oEmbed for {content_url}: {exc}"
            ) from exc
        # json.loads happily returns lists, strings or numbers; callers are
        # promised a dict, so reject anything else with the module's own error.
        if not isinstance(payload, dict):
            raise OEmbedError(
                f"oEmbed response for {content_url} is not a JSON object"
            )
        return payload