llm logs --data-ids flag, closes #800

2026-05-21 20:11:52 +00:00 · 2025-02-27 20:31:50 -08:00 · 2025-02-27 20:31:50 -08:00 · 48f67f4085
commit 48f67f4085
parent 1bebf8b34a
5 changed files with 92 additions and 4 deletions
--- a/docs/help.md
+++ b/docs/help.md
@ -307,6 +307,7 @@ Options:
  --data                      Output newline-delimited JSON data for schema
  --data-array                Output JSON array of data for schema
  --data-key TEXT             Return JSON objects from array in this key
+  --data-ids                  Attach corresponding IDs to JSON objects
  -t, --truncate              Truncate long strings in output
  -s, --short                 Shorter YAML output with truncated prompts
  -u, --usage                 Include token usage
--- a/docs/schemas.md
+++ b/docs/schemas.md
@ -207,4 +207,18 @@ Output:
 {"name": "Ziggy", "ten_word_bio": "Quirky pug who loves belly rubs and quirky outfits."},
 {"name": "Robo", "ten_word_bio": "A cybernetic dog with laser eyes and super intelligence."},
 {"name": "Flamepaw", "ten_word_bio": "Fire-resistant dog with a talent for agility and tricks."}]
-```
+```
+Add `--data-ids` to include `"response_id"` and `"conversation_id"` fields in each returned object, holding the database IDs of the response and conversation that object came from. This can be useful for tracking the source of each individual row.
+
+```bash
+llm logs --schema-multi 'name, ten_word_bio' --data-key items --data-ids
+```
+Output:
+```json
+{"name": "Nebula", "ten_word_bio": "A cosmic puppy with starry fur, loves adventures in space.", "response_id": "01jn4dawj8sq0c6t3emf4k5ryx", "conversation_id": "01jn4dawj8sq0c6t3emf4k5ryx"}
+{"name": "Echo", "ten_word_bio": "A clever hound with extraordinary hearing, master of hide-and-seek.", "response_id": "01jn4dawj8sq0c6t3emf4k5ryx", "conversation_id": "01jn4dawj8sq0c6t3emf4k5ryx"}
+{"name": "Biscuit", "ten_word_bio": "An adorable chef dog, bakes treats that everyone loves.", "response_id": "01jn4dawj8sq0c6t3emf4k5ryx", "conversation_id": "01jn4dawj8sq0c6t3emf4k5ryx"}
+{"name": "Cosmo", "ten_word_bio": "Galactic explorer, loves adventures and chasing shooting stars.", "response_id": "01jn4daycb3svj0x7kvp7zrp4q", "conversation_id": "01jn4daycb3svj0x7kvp7zrp4q"}
+{"name": "Pixel", "ten_word_bio": "Tech-savvy pup, builds gadgets and loves virtual playtime.", "response_id": "01jn4daycb3svj0x7kvp7zrp4q", "conversation_id": "01jn4daycb3svj0x7kvp7zrp4q"}
+```
+If a row already has a property called `"conversation_id"` or `"response_id"`, additional underscores will be appended to the ID key until it no longer overlaps with the existing keys.
--- a/llm/cli.py
+++ b/llm/cli.py
@ -48,6 +48,7 @@ from .utils import (
    schema_summary,
    multi_schema,
    schema_dsl,
+    find_unused_key,
 )
 import base64
 import httpx
@ -939,6 +940,9 @@ order by prompt_attachments."order"
 )
@click.option("--data-array", is_flag=True, help="Output JSON array of data for schema")
@click.option("--data-key", help="Return JSON objects from array in this key")
+@click.option(
+    "--data-ids", is_flag=True, help="Attach corresponding IDs to JSON objects"
+)
@click.option("-t", "--truncate", is_flag=True, help="Truncate long strings in output")
@click.option(
    "-s", "--short", is_flag=True, help="Shorter YAML output with truncated prompts"
@ -983,6 +987,7 @@ def logs_list(
    data,
    data_array,
    data_key,
+    data_ids,
    truncate,
    short,
    usage,
@ -1099,22 +1104,28 @@ def logs_list(
    for attachment in attachments:
        attachments_by_id.setdefault(attachment["response_id"], []).append(attachment)

-    if data or data_array or data_key:
+    if data or data_array or data_key or data_ids:
        # Special case for --data to output valid JSON
        to_output = []
        for row in rows:
            response = row["response"] or ""
            try:
                decoded = json.loads(response)
+                new_items = []
                if (
                    isinstance(decoded, dict)
                    and (data_key in decoded)
                    and all(isinstance(item, dict) for item in decoded[data_key])
                ):
                    for item in decoded[data_key]:
-                        to_output.append(item)
+                        new_items.append(item)
                else:
-                    to_output.append(decoded)
+                    new_items.append(decoded)
+                if data_ids:
+                    # Tag each object with the IDs of the logged row it came from.
+                    for item in new_items:
+                        # Valid JSON that is not an object (a list, string or
+                        # number) cannot take extra keys; leave it untouched
+                        # rather than raising a TypeError the except below
+                        # does not catch.
+                        if not isinstance(item, dict):
+                            continue
+                        item[find_unused_key(item, "response_id")] = row["id"]
+                        # Previously row["id"], which repeated the response ID.
+                        # NOTE(review): assumes the logs query selects the
+                        # conversation_id column - confirm against the SQL.
+                        item[find_unused_key(item, "conversation_id")] = row[
+                            "conversation_id"
+                        ]
+                to_output.extend(new_items)
            except ValueError:
                pass
        click.echo(output_rows_as_json(to_output, not data_array))
--- a/llm/utils.py
+++ b/llm/utils.py
@ -384,3 +384,10 @@ def multi_schema(schema: dict) -> dict:
        "properties": {"items": {"type": "array", "items": schema}},
        "required": ["items"],
    }
+
+
+def find_unused_key(item: dict, key: str) -> str:
+    """Return key with underscores appended until it is not already in item.
+
+    For example, {"id": "1"} with key "id" gives "id_". A key that is not
+    present in item is returned unchanged.
+    """
+    candidate = key
+    while candidate in item:
+        candidate = candidate + "_"
+    return candidate
--- a/tests/test_llm_logs.py
+++ b/tests/test_llm_logs.py
@ -353,3 +353,58 @@ def test_logs_schema(schema_log_path, args, expected):
    )
    assert result.exit_code == 0
    assert result.output == expected
+
+
+def test_logs_schema_data_ids(schema_log_path):
+    """--data-ids adds ID keys without overwriting keys already in the data."""
+    db = sqlite_utils.Database(schema_log_path)
+    # Timestamp 100 seconds ahead - presumably so this row is the newest one
+    # and comes back last, which rows.pop(-1) below relies on.
+    ulid = ULID.from_timestamp(time.time() + 100)
+    response_id = str(ulid).lower()
+    # Logged JSON that already uses the key names --data-ids wants to add.
+    existing = {
+        "name": "three",
+        "response_id": 1,
+        "conversation_id": 2,
+        "conversation_id_": 3,
+    }
+    db["responses"].insert(
+        {
+            "id": response_id,
+            "system": "system",
+            "prompt": "prompt",
+            "response": json.dumps(existing),
+            "model": "davinci",
+            "datetime_utc": ulid.datetime.isoformat(),
+            "conversation_id": "abc123",
+            "input_tokens": 2,
+            "output_tokens": 5,
+            "schema_id": SINGLE_ID,
+        }
+    )
+    runner = CliRunner()
+    result = runner.invoke(
+        cli,
+        [
+            "logs",
+            "-n",
+            "0",
+            "-p",
+            str(schema_log_path),
+            "--data-ids",
+            "--data-key",
+            "items",
+            "--data-array",
+        ],
+        catch_exceptions=False,
+    )
+    assert result.exit_code == 0
+    rows = json.loads(result.output)
+    last_row = rows.pop(-1)
+    assert set(last_row.keys()) == {
+        "conversation_id_",
+        "conversation_id",
+        "response_id",
+        "response_id_",
+        "name",
+        "conversation_id__",
+    }
+    # The values that were already in the data must survive untouched.
+    for key, value in existing.items():
+        assert last_row[key] == value
+    # The row's own ID lands in the first free "response_id" variant.
+    assert last_row["response_id_"] == response_id
+    for row in rows:
+        assert set(row.keys()) == {"conversation_id", "response_id", "name"}