llm logs --data-ids flag, closes #800

This commit is contained in:
Simon Willison 2025-02-27 20:31:50 -08:00
parent 1bebf8b34a
commit 48f67f4085
5 changed files with 92 additions and 4 deletions

View file

@ -307,6 +307,7 @@ Options:
--data Output newline-delimited JSON data for schema
--data-array Output JSON array of data for schema
--data-key TEXT Return JSON objects from array in this key
--data-ids Attach corresponding IDs to JSON objects
-t, --truncate Truncate long strings in output
-s, --short Shorter YAML output with truncated prompts
-u, --usage Include token usage

View file

@ -207,4 +207,18 @@ Output:
{"name": "Ziggy", "ten_word_bio": "Quirky pug who loves belly rubs and quirky outfits."},
{"name": "Robo", "ten_word_bio": "A cybernetic dog with laser eyes and super intelligence."},
{"name": "Flamepaw", "ten_word_bio": "Fire-resistant dog with a talent for agility and tricks."}]
```
```
Add `--data-ids` to include `"response_id"` and `"conversation_id"` fields in each of the returned objects reflecting the database IDs of the response and conversation they were a part of. This can be useful for tracking the source of each individual row.
```bash
llm logs --schema-multi 'name, ten_word_bio' --data-key items --data-ids
```
Output:
```json
{"name": "Nebula", "ten_word_bio": "A cosmic puppy with starry fur, loves adventures in space.", "response_id": "01jn4dawj8sq0c6t3emf4k5ryx", "conversation_id": "01jn4dawj8sq0c6t3emf4k5ryx"}
{"name": "Echo", "ten_word_bio": "A clever hound with extraordinary hearing, master of hide-and-seek.", "response_id": "01jn4dawj8sq0c6t3emf4k5ryx", "conversation_id": "01jn4dawj8sq0c6t3emf4k5ryx"}
{"name": "Biscuit", "ten_word_bio": "An adorable chef dog, bakes treats that everyone loves.", "response_id": "01jn4dawj8sq0c6t3emf4k5ryx", "conversation_id": "01jn4dawj8sq0c6t3emf4k5ryx"}
{"name": "Cosmo", "ten_word_bio": "Galactic explorer, loves adventures and chasing shooting stars.", "response_id": "01jn4daycb3svj0x7kvp7zrp4q", "conversation_id": "01jn4daycb3svj0x7kvp7zrp4q"}
{"name": "Pixel", "ten_word_bio": "Tech-savvy pup, builds gadgets and loves virtual playtime.", "response_id": "01jn4daycb3svj0x7kvp7zrp4q", "conversation_id": "01jn4daycb3svj0x7kvp7zrp4q"}
```
If a row already has a property called `"conversation_id"` or `"response_id"`, underscores will be appended to the name of the added ID key (for example `"response_id_"`) until it no longer collides with any of the row's existing keys.

View file

@ -48,6 +48,7 @@ from .utils import (
schema_summary,
multi_schema,
schema_dsl,
find_unused_key,
)
import base64
import httpx
@ -939,6 +940,9 @@ order by prompt_attachments."order"
)
@click.option("--data-array", is_flag=True, help="Output JSON array of data for schema")
@click.option("--data-key", help="Return JSON objects from array in this key")
@click.option(
"--data-ids", is_flag=True, help="Attach corresponding IDs to JSON objects"
)
@click.option("-t", "--truncate", is_flag=True, help="Truncate long strings in output")
@click.option(
"-s", "--short", is_flag=True, help="Shorter YAML output with truncated prompts"
@ -983,6 +987,7 @@ def logs_list(
data,
data_array,
data_key,
data_ids,
truncate,
short,
usage,
@ -1099,22 +1104,28 @@ def logs_list(
for attachment in attachments:
attachments_by_id.setdefault(attachment["response_id"], []).append(attachment)
if data or data_array or data_key:
if data or data_array or data_key or data_ids:
# Special case for --data to output valid JSON
to_output = []
for row in rows:
response = row["response"] or ""
try:
decoded = json.loads(response)
new_items = []
if (
isinstance(decoded, dict)
and (data_key in decoded)
and all(isinstance(item, dict) for item in decoded[data_key])
):
for item in decoded[data_key]:
to_output.append(item)
new_items.append(item)
else:
to_output.append(decoded)
new_items.append(decoded)
if data_ids:
for item in new_items:
item[find_unused_key(item, "response_id")] = row["id"]
item[find_unused_key(item, "conversation_id")] = row["id"]
to_output.extend(new_items)
except ValueError:
pass
click.echo(output_rows_as_json(to_output, not data_array))

View file

@ -384,3 +384,10 @@ def multi_schema(schema: dict) -> dict:
"properties": {"items": {"type": "array", "items": schema}},
"required": ["items"],
}
def find_unused_key(item: dict, key: str) -> str:
    """Return a variant of ``key`` that is not already present in ``item``.

    Underscores are appended one at a time until the candidate no longer
    collides with an existing key, e.g. for ``{"id": "1"}`` and key ``"id"``
    the result is ``"id_"``. ``item`` itself is not modified.
    """
    candidate = key
    while True:
        if candidate not in item:
            return candidate
        candidate = candidate + "_"

View file

@ -353,3 +353,58 @@ def test_logs_schema(schema_log_path, args, expected):
)
assert result.exit_code == 0
assert result.output == expected
def test_logs_schema_data_ids(schema_log_path):
    """Check that --data-ids adds ID keys without overwriting existing ones."""
    database = sqlite_utils.Database(schema_log_path)
    # ULID stamped 100 seconds ahead of now; the assertions below expect this
    # record to come back as the final row (presumably it sorts last - confirm
    # against the fixture).
    future_ulid = ULID.from_timestamp(time.time() + 100)
    # Response payload that already uses the key names --data-ids wants to add
    clashing_payload = {
        "name": "three",
        "response_id": 1,
        "conversation_id": 2,
        "conversation_id_": 3,
    }
    database["responses"].insert(
        {
            "id": str(future_ulid).lower(),
            "system": "system",
            "prompt": "prompt",
            "response": json.dumps(clashing_payload),
            "model": "davinci",
            "datetime_utc": future_ulid.datetime.isoformat(),
            "conversation_id": "abc123",
            "input_tokens": 2,
            "output_tokens": 5,
            "schema_id": SINGLE_ID,
        }
    )
    cli_args = [
        "logs",
        "-n",
        "0",
        "-p",
        str(schema_log_path),
        "--data-ids",
        "--data-key",
        "items",
        "--data-array",
    ]
    result = CliRunner().invoke(cli, cli_args, catch_exceptions=False)
    assert result.exit_code == 0
    output_rows = json.loads(result.output)
    final_row = output_rows[-1]
    earlier_rows = output_rows[:-1]
    # The clashing keys survive and the added IDs land on underscore-suffixed
    # names: "response_id_" and "conversation_id__".
    expected_final_keys = {
        "conversation_id_",
        "conversation_id",
        "response_id",
        "response_id_",
        "name",
        "conversation_id__",
    }
    assert set(final_row.keys()) == expected_final_keys
    # Every other row has no clash, so the IDs use their plain key names.
    plain_keys = {"conversation_id", "response_id", "name"}
    for earlier_row in earlier_rows:
        assert set(earlier_row.keys()) == plain_keys