refactor(slug): ensure generated slugs are valid Python identifiers

This commit is contained in:
Mike 2024-08-31 18:12:45 -07:00
parent ab23ba118d
commit 6f141ff4f2
2 changed files with 96 additions and 7 deletions

View file

@ -1,3 +1,5 @@
from __future__ import annotations

import keyword
import secrets
import string
from typing import Final
@ -6,17 +8,53 @@ from django.utils.text import slugify
SLUGFIELD_MAX_LENGTH: Final = 50
def non_identifier_chars() -> dict[str, str]:
    """Map printable non-identifier characters to a textual code point.

    Every character of ``string.printable`` that is neither alphanumeric,
    an underscore, nor a space is mapped to ``"u"`` followed by its code
    point as four lowercase hexadecimal digits (for example ``"#"`` maps to
    ``"u0023"``).

    Returns:
        dict[str, str]: A new dictionary on every call, keyed by the special
            character, with the ``uXXXX`` replacement as value.
    """
    # Underscore and space are kept out of the mapping on purpose: the
    # underscore is already legal in an identifier, and spaces are left for
    # the caller to treat as word separators.
    return {
        char: f"u{ord(char):04x}"
        for char in string.printable
        if not char.isalnum() and char not in "_ "
    }
def generate_slug(value: str) -> str:
    """Generate a Python-identifier-friendly slug from ``value``.

    Special ASCII characters are first replaced by the ``uXXXX`` codes from
    ``non_identifier_chars()`` so that inputs differing only in punctuation
    produce different slugs. The text is then slugified, hyphens are turned
    into underscores, and the slug is adjusted so that it neither starts
    with a digit nor equals a reserved Python keyword.

    Args:
        value (str): The input string to generate a slug from.

    Returns:
        str: A non-empty slug of at most SLUGFIELD_MAX_LENGTH characters,
            intended to be usable as a Python identifier.
    """
    # Every key is a single character and no replacement contains a key, so
    # one translate() pass gives the same result as replacing them in turn.
    value = value.translate(str.maketrans(non_identifier_chars()))

    # slugify() produces the base slug; its hyphen separators are swapped
    # for underscores because hyphens are not legal in identifiers.
    slug = slugify(value, allow_unicode=False).replace("-", "_")

    if not slug:
        # Nothing usable survived slugification: fall back to a random slug
        # so that the result is never empty.
        chars = string.ascii_lowercase + string.digits
        randstr = "".join(secrets.choice(chars) for _ in range(8))
        slug = f"rand_{randstr}"

    # An identifier must not start with a digit.
    if slug[0].isdigit():
        slug = "_" + slug

    # str.isidentifier() accepts reserved words such as "class" or "def",
    # yet they cannot be used as names; a trailing underscore fixes that.
    if keyword.iskeyword(slug):
        slug += "_"

    return slug[:SLUGFIELD_MAX_LENGTH]

View file

@ -1,3 +1,4 @@
import pytest
from hypothesis import given
from hypothesis import strategies as st
@ -18,3 +19,53 @@ def test_generate_long_slug_text(name: str) -> None:
slug = generate_slug(name)
assert len(slug) <= SLUGFIELD_MAX_LENGTH
def test_generate_slug_uniqueness() -> None:
    """Test that generate_slug() produces unique slugs for different inputs.

    Inputs that differ only in their trailing special character must not
    collapse into the same slug, and the number of distinct slugs must equal
    the number of inputs.
    """
    inputs = ["age #", "age %", "age $", "age @", "age!", "age?", "age 😊"]
    # Keyed by slug (value: the input that produced it) so that both the
    # membership check and the final count are about slugs, not inputs.
    inputs_by_slug: dict[str, str] = {}

    for input_str in inputs:
        slug = generate_slug(input_str)
        assert slug not in inputs_by_slug, (
            f"Duplicate slug '{slug}' generated for input '{input_str}'"
        )
        inputs_by_slug[slug] = input_str

    assert len(inputs_by_slug) == len(inputs), (
        "Number of unique slugs doesn't match number of inputs"
    )
@pytest.mark.parametrize(
    "input_str",
    [
        "01 age",
        "? age",
        "age 😊",
        "class",
        "def function",
        "2nd place",
        "@username",
        "user-name",
        "first.last",
        "snake_case",
        "CamelCase",
        " ",  # a single space: no usable characters
    ],
)
def test_generate_slug_valid_identifier(input_str: str) -> None:
    """Check that generate_slug() returns a valid Python identifier.

    The parametrized inputs cover leading digits, special characters, an
    emoji, keyword-like words, several naming conventions and a
    whitespace-only string.

    Args:
        input_str (str): The input string to test.
    """
    slug = generate_slug(input_str)
    failure_message = (
        f"Generated slug '{slug}' for input '{input_str}' "
        "is not a valid Python identifier"
    )
    # NOTE: str.isidentifier() also returns True for reserved words such as
    # "class", so this assertion does not reject keywords.
    assert slug.isidentifier(), failure_message