diff --git a/docs/index.md b/docs/index.md index 90993a4..2616574 100644 --- a/docs/index.md +++ b/docs/index.md @@ -96,6 +96,7 @@ embeddings/index plugins/index aliases python-api +schemas templates logging related-tools diff --git a/docs/schemas.md b/docs/schemas.md new file mode 100644 index 0000000..1889b46 --- /dev/null +++ b/docs/schemas.md @@ -0,0 +1,101 @@ +(schemas)= + +# Schemas + +Large Language Models are very good at producing structured output as JSON or other formats. LLM's **schemas** feature allows you to define the exact structure of JSON data you want to receive from a model. + +This feature is supported by models from OpenAI, Anthropic, Google Gemini and others {doc}`via plugins <plugins/index>`. + +## Understanding JSON schemas + +A [JSON schema](https://json-schema.org/) is a specification that describes the expected structure of a JSON object. It defines: + +- The data types of fields (string, number, array, object, etc.) +- Required vs. optional fields +- Nested data structures +- Constraints on values (minimum/maximum, patterns, etc.) +- Descriptions of those fields - these can be used to guide the language model + +Different models may support different subsets of the overall JSON schema language. You should experiment to figure out what works for the model you are using. + +## Using schemas with LLM + +LLM provides several ways to use schemas: + +1. Directly via the command line with the `--schema` option +2. Through stored schemas in the database +3. Via templates that include schemas +4. 
Through the {doc}`Python API <python-api>` + +### Basic usage with the command line + +To get structured data from a language model you can provide a JSON schema directly using the `--schema` option: + +```bash +curl https://www.nytimes.com/ | uvx strip-tags | llm --schema '{ + "type": "object", + "properties": { + "stories": { + "type": "array", + "items": { + "type": "object", + "properties": { + "headline": { + "type": "string" + }, + "short_summary": { + "type": "string" + }, + "key_people": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["headline", "short_summary", "key_people"] + } + } + }, + "required": ["stories"] +}' | jq +``` +This example uses [uvx](https://docs.astral.sh/uv/guides/tools/) to run [strip-tags](https://github.com/simonw/strip-tags) against the front page of the New York Times, runs GPT-4o mini with a schema to extract story headlines and summaries, then pipes the result through [jq](https://jqlang.org/) to format it. + +This will instruct the model to return an array of JSON objects with the specified structure, each containing a headline, summary, and array of key people mentioned. + +### Alternative schema syntax + +JSON schemas can be time-consuming to construct by hand. LLM also supports a concise alternative syntax for specifying a schema. + +A simple schema for an object with two string properties called `name` and `bio` looks like this: + + name, bio + +You can include type information by adding a type indicator after the property name, separated by a space. + + name, bio, age int + +Supported types are `int` for integers, `float` for floating point numbers, `str` for strings (the default) and `bool` for true/false booleans. 
def build_json_schema(schema_dsl: str) -> dict:
    """
    Build a JSON schema from a concise schema string.

    Each field has the form ``name [type][: description]``. Fields are
    separated by newlines if the string contains any newline, otherwise
    by commas. Blank fields are ignored.

    Args:
        schema_dsl: A string representing a schema in the concise format.
                    Can be comma-separated or newline-separated.

    Returns:
        A dictionary representing the JSON schema. Every field is listed
        in "required". Recognised type indicators are ``int``, ``float``,
        ``bool`` and ``str``; a missing or unrecognised indicator yields
        ``"string"``. If a field name is repeated, the last definition wins.

    Raises:
        ValueError: If a field has no name, for example ``": a description"``.
    """
    # Concise type indicators mapped to JSON schema type names
    type_mapping = {
        "int": "integer",
        "float": "number",
        "bool": "boolean",
        "str": "string",
    }

    # Newline mode lets descriptions contain commas, so it takes priority
    separator = "\n" if "\n" in schema_dsl else ","

    properties: dict = {}
    required: list = []

    for raw_field in schema_dsl.split(separator):
        field = raw_field.strip()
        if not field:
            continue

        # Only the first colon separates the description, so the
        # description itself may contain colons (e.g. URLs)
        field_info, _, description = field.partition(":")
        field_parts = field_info.split()
        if not field_parts:
            # Previously this surfaced as an opaque IndexError
            raise ValueError(f"Schema field is missing a name: {field!r}")

        field_name = field_parts[0]
        field_type = "string"
        if len(field_parts) > 1:
            field_type = type_mapping.get(field_parts[1], "string")

        field_schema = {"type": field_type}
        description = description.strip()
        if description:
            field_schema["description"] = description

        # JSON schema requires the entries of "required" to be unique
        if field_name not in properties:
            required.append(field_name)
        properties[field_name] = field_schema

    return {"type": "object", "properties": properties, "required": required}
def _object_schema(properties):
    """Return the object schema expected for *properties*, all of them required."""
    return {
        "type": "object",
        "properties": properties,
        "required": list(properties),
    }


# (concise schema string, expected JSON schema) pairs
_BUILD_JSON_SCHEMA_CASES = [
    # Comma-separated names default to the string type
    (
        "name, bio",
        _object_schema({"name": {"type": "string"}, "bio": {"type": "string"}}),
    ),
    # Comma-separated names with type indicators
    (
        "name, age int, balance float, active bool",
        _object_schema(
            {
                "name": {"type": "string"},
                "age": {"type": "integer"},
                "balance": {"type": "number"},
                "active": {"type": "boolean"},
            }
        ),
    ),
    # Comma-separated names with descriptions
    (
        "name: full name, age int: years old",
        _object_schema(
            {
                "name": {"type": "string", "description": "full name"},
                "age": {"type": "integer", "description": "years old"},
            }
        ),
    ),
    # Newline-separated fields
    (
        """
            name
            bio
            age int
            """,
        _object_schema(
            {
                "name": {"type": "string"},
                "bio": {"type": "string"},
                "age": {"type": "integer"},
            }
        ),
    ),
    # Newline-separated fields whose descriptions contain commas
    (
        """
            name: the person's name
            age int: their age in years, must be positive
            bio: a short bio, no more than three sentences
            """,
        _object_schema(
            {
                "name": {"type": "string", "description": "the person's name"},
                "age": {
                    "type": "integer",
                    "description": "their age in years, must be positive",
                },
                "bio": {
                    "type": "string",
                    "description": "a short bio, no more than three sentences",
                },
            }
        ),
    ),
    # Empty input produces an empty object schema
    ("", _object_schema({})),
    # Explicit str indicator
    (
        "name str, description str",
        _object_schema(
            {
                "name": {"type": "string"},
                "description": {"type": "string"},
            }
        ),
    ),
    # Surrounding whitespace is ignored
    (
        "  name  ,  age   int  :  person's age  ",
        _object_schema(
            {
                "name": {"type": "string"},
                "age": {"type": "integer", "description": "person's age"},
            }
        ),
    ),
]


@pytest.mark.parametrize("schema, expected", _BUILD_JSON_SCHEMA_CASES)
def test_build_json_schema(schema, expected):
    """Each concise schema string must expand to exactly the expected JSON schema."""
    assert build_json_schema(schema) == expected