mirror of
https://github.com/Hopiu/llm.git
synced 2026-03-31 11:50:23 +00:00
parent
523fc4f1a3
commit
321636e791
5 changed files with 302 additions and 1 deletions
|
|
@ -96,6 +96,7 @@ embeddings/index
|
|||
plugins/index
|
||||
aliases
|
||||
python-api
|
||||
schemas
|
||||
templates
|
||||
logging
|
||||
related-tools
|
||||
|
|
|
|||
101
docs/schemas.md
Normal file
101
docs/schemas.md
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
(schemas)=
|
||||
|
||||
# Schemas
|
||||
|
||||
Large Language Models are very good at producing structured output as JSON or other formats. LLM's **schemas** feature allows you to define the exact structure of JSON data you want to receive from a model.
|
||||
|
||||
This feature is supported by models from OpenAI, Anthropic, Google Gemini and others {ref}`via plugins <advanced-model-plugins-schemas>`.
|
||||
|
||||
## Understanding JSON schemas
|
||||
|
||||
A [JSON schema](https://json-schema.org/) is a specification that describes the expected structure of a JSON object. It defines:
|
||||
|
||||
- The data types of fields (string, number, array, object, etc.)
|
||||
- Required vs. optional fields
|
||||
- Nested data structures
|
||||
- Constraints on values (minimum/maximum, patterns, etc.)
|
||||
- Descriptions of those fields - these can be used to guide the language model
|
||||
|
||||
Different models may support different subsets of the overall JSON schema language. You should experiment to figure out what works for the model you are using.
|
||||
|
||||
## Using schemas with LLM
|
||||
|
||||
LLM provides several ways to use schemas:
|
||||
|
||||
1. Directly via the command line with the `--schema` option
|
||||
2. Through stored schemas in the database
|
||||
3. Via templates that include schemas
|
||||
4. Through the {ref}`Python API <python-api-schemas>`
|
||||
|
||||
### Basic usage with the command line
|
||||
|
||||
To get structured data from a language model you can provide a JSON schema directly using the `--schema` option:
|
||||
|
||||
```bash
|
||||
curl https://www.nytimes.com/ | uvx strip-tags | llm --schema '{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"stories": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"headline": {
|
||||
"type": "string"
|
||||
},
|
||||
"short_summary": {
|
||||
"type": "string"
|
||||
},
|
||||
"key_people": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["headline", "short_summary", "key_people"]
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["stories"]
|
||||
}' | jq
|
||||
```
|
||||
This example uses [uvx](https://docs.astral.sh/uv/guides/tools/) to run [strip-tags](https://github.com/simonw/strip-tags) against the front page of the New York Times, runs GPT-4o mini with a schema to extract story headlines and summaries, then pipes the result through [jq](https://jqlang.org/) to format it.
|
||||
|
||||
This will instruct the model to return an array of JSON objects with the specified structure, each containing a headline, summary, and array of key people mentioned.
|
||||
|
||||
### Alternative schema syntax
|
||||
|
||||
JSON schemas can be time-consuming to construct by hand. LLM also supports a concise alternative syntax for specifying a schema.
|
||||
|
||||
A simple schema for an object with two string properties called `name` and `bio` looks like this:
|
||||
|
||||
name, bio
|
||||
|
||||
You can include type information by adding a type indicator after the property name, separated by a space.
|
||||
|
||||
name, bio, age int
|
||||
|
||||
Supported types are `int` for integers, `float` for floating point numbers, `str` for strings (the default) and `bool` for true/false booleans.
|
||||
|
||||
To include a description of the field to act as a hint to the model, add one after a colon:
|
||||
|
||||
name: the person's name, age int: their age, bio: a short bio
|
||||
|
||||
If your schema is getting long you can switch from comma-separated to newline-separated, which also allows you to use commas in those descriptions:
|
||||
|
||||
name: the person's name
|
||||
age int: their age
|
||||
bio: a short bio, no more than three sentences
|
||||
|
||||
This format is supported by the `--schema` option. The format will be detected any time you provide a string that contains at least one space or comma and doesn't start with a `{` (indicating JSON):
|
||||
|
||||
```bash
|
||||
llm --schema 'name,description,fave_toy' 'invent a dog'
|
||||
```
|
||||
To return multiple items matching your schema, use the `--schema-multi` option. This is equivalent to using `--schema` with a JSON schema that specifies an `items` key containing multiple objects.
|
||||
|
||||
```bash
|
||||
llm --schema-multi 'name,description,fave_toy' 'invent 3 dogs'
|
||||
```
|
||||
The Python utility function `llm.utils.build_json_schema(schema)` can be used to convert this syntax into the equivalent JSON schema dictionary.
|
||||
67
llm/utils.py
67
llm/utils.py
|
|
@ -243,6 +243,9 @@ def resolve_schema_input(db, schema_input):
|
|||
return json.loads(schema_input)
|
||||
except ValueError:
|
||||
pass
|
||||
if " " in schema_input.strip() or "," in schema_input:
|
||||
# Treat it as schema DSL
|
||||
return build_json_schema(schema_input)
|
||||
# Is it a file on disk?
|
||||
path = pathlib.Path(schema_input)
|
||||
if path.exists():
|
||||
|
|
@ -298,3 +301,67 @@ def schema_summary(schema: dict) -> str:
|
|||
return schema_summary(items)
|
||||
|
||||
return ""
|
||||
|
||||
|
||||
def build_json_schema(schema_dsl: str) -> dict:
    """
    Expand a concise schema DSL string into a JSON schema dictionary.

    Each field takes the form ``name``, ``name type`` or
    ``name type: description`` where ``type`` is one of ``int``, ``float``,
    ``bool`` or ``str`` (the default). Fields are separated by newlines if
    the input contains any, otherwise by commas.

    Args:
        schema_dsl: A string representing a schema in the concise format.
            Can be comma-separated or newline-separated.

    Returns:
        A dictionary representing the JSON schema, with "type",
        "properties" and "required" keys.
    """
    # Map DSL type indicators to JSON schema type names
    dsl_to_json_type = {
        "int": "integer",
        "float": "number",
        "bool": "boolean",
        "str": "string",
    }

    # Newline-separated form allows commas inside descriptions
    separator = "\n" if "\n" in schema_dsl else ","

    properties = {}
    required = []

    for raw_field in schema_dsl.split(separator):
        entry = raw_field.strip()
        if not entry:
            continue

        # Everything after the first colon is a human-readable description
        name_and_type, _, description = entry.partition(":")
        description = description.strip()

        parts = name_and_type.strip().split()
        field_name = parts[0].strip()

        # Type defaults to string; unrecognized indicators also fall back
        json_type = "string"
        if len(parts) > 1:
            json_type = dsl_to_json_type.get(parts[1].strip(), "string")

        prop = {"type": json_type}
        if description:
            # Descriptions act as hints to the language model
            prop["description"] = description

        properties[field_name] = prop
        # Every declared field is required
        required.append(field_name)

    return {"type": "object", "properties": properties, "required": required}
|
||||
|
|
|
|||
|
|
@ -616,6 +616,26 @@ def test_schema_via_cli(mock_model, tmpdir, monkeypatch, use_filename):
|
|||
assert result2.exit_code == 0
|
||||
|
||||
|
||||
def test_schema_using_dsl(mock_model, tmpdir, monkeypatch):
    """The concise schema DSL should be expanded to a full JSON schema and logged."""
    user_path = tmpdir / "user"
    monkeypatch.setenv("LLM_USER_PATH", str(user_path))
    mock_model.enqueue([json.dumps(dog)])
    result = CliRunner().invoke(
        cli,
        ["--schema", "name, age int", "prompt", "-m", "mock"],
        catch_exceptions=False,
    )
    assert result.exit_code == 0
    assert result.output == '{"name": "Cleo", "age": 10}\n'
    # The stored schema should be the expanded JSON schema, not the raw DSL string
    db = sqlite_utils.Database(str(user_path / "logs.db"))
    first_row = list(db["schemas"].rows)[0]
    assert json.loads(first_row["content"]) == {
        "type": "object",
        "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
        "required": ["name", "age"],
    }
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("use_pydantic", (False, True))
|
||||
async def test_schema_async(async_mock_model, use_pydantic):
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import pytest
|
||||
from llm.utils import simplify_usage_dict, extract_fenced_code_block
|
||||
from llm.utils import simplify_usage_dict, extract_fenced_code_block, build_json_schema
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
|
@ -101,3 +101,115 @@ def test_simplify_usage_dict(input_data, expected_output):
|
|||
def test_extract_fenced_code_block(input, last, expected):
|
||||
actual = extract_fenced_code_block(input, last=last)
|
||||
assert actual == expected
|
||||
|
||||
|
||||
# (dsl_input, expected_schema) pairs exercising the concise schema DSL
BUILD_JSON_SCHEMA_CASES = [
    # Basic comma-separated fields default to the string type
    (
        "name, bio",
        {
            "type": "object",
            "properties": {"name": {"type": "string"}, "bio": {"type": "string"}},
            "required": ["name", "bio"],
        },
    ),
    # Comma-separated fields with explicit type indicators
    (
        "name, age int, balance float, active bool",
        {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "age": {"type": "integer"},
                "balance": {"type": "number"},
                "active": {"type": "boolean"},
            },
            "required": ["name", "age", "balance", "active"],
        },
    ),
    # Comma-separated fields with descriptions after a colon
    (
        "name: full name, age int: years old",
        {
            "type": "object",
            "properties": {
                "name": {"type": "string", "description": "full name"},
                "age": {"type": "integer", "description": "years old"},
            },
            "required": ["name", "age"],
        },
    ),
    # Newline-separated fields
    (
        """
        name
        bio
        age int
        """,
        {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "bio": {"type": "string"},
                "age": {"type": "integer"},
            },
            "required": ["name", "bio", "age"],
        },
    ),
    # Newline-separated form permits commas inside descriptions
    (
        """
        name: the person's name
        age int: their age in years, must be positive
        bio: a short bio, no more than three sentences
        """,
        {
            "type": "object",
            "properties": {
                "name": {"type": "string", "description": "the person's name"},
                "age": {
                    "type": "integer",
                    "description": "their age in years, must be positive",
                },
                "bio": {
                    "type": "string",
                    "description": "a short bio, no more than three sentences",
                },
            },
            "required": ["name", "age", "bio"],
        },
    ),
    # Empty input yields an empty schema
    ("", {"type": "object", "properties": {}, "required": []}),
    # Explicit str type indicator is accepted
    (
        "name str, description str",
        {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "description": {"type": "string"},
            },
            "required": ["name", "description"],
        },
    ),
    # Surrounding whitespace is stripped from names, types and descriptions
    (
        " name , age int : person's age ",
        {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "age": {"type": "integer", "description": "person's age"},
            },
            "required": ["name", "age"],
        },
    ),
]


@pytest.mark.parametrize("dsl_input, expected_schema", BUILD_JSON_SCHEMA_CASES)
def test_build_json_schema(dsl_input, expected_schema):
    """build_json_schema() should expand the concise DSL into a JSON schema."""
    assert build_json_schema(dsl_input) == expected_schema
|
||||
|
|
|
|||
Loading…
Reference in a new issue