diff --git a/docs/help.md b/docs/help.md index 8210abf..4ec83fc 100644 --- a/docs/help.md +++ b/docs/help.md @@ -307,6 +307,7 @@ Options: --data Output newline-delimited JSON data for schema --data-array Output JSON array of data for schema --data-key TEXT Return JSON objects from array in this key + --data-ids Attach corresponding IDs to JSON objects -t, --truncate Truncate long strings in output -s, --short Shorter YAML output with truncated prompts -u, --usage Include token usage diff --git a/docs/schemas.md b/docs/schemas.md index bbabf79..5b3317a 100644 --- a/docs/schemas.md +++ b/docs/schemas.md @@ -207,4 +207,18 @@ Output: {"name": "Ziggy", "ten_word_bio": "Quirky pug who loves belly rubs and quirky outfits."}, {"name": "Robo", "ten_word_bio": "A cybernetic dog with laser eyes and super intelligence."}, {"name": "Flamepaw", "ten_word_bio": "Fire-resistant dog with a talent for agility and tricks."}] -``` \ No newline at end of file +``` +Add `--data-ids` to include `"response_id"` and `"conversation_id"` fields in each of the returned objects reflecting the database IDs of the response and conversation they were a part of. This can be useful for tracking the source of each individual row. 
+ +```bash +llm logs --schema-multi 'name, ten_word_bio' --data-key items --data-ids +``` +Output: +```json +{"name": "Nebula", "ten_word_bio": "A cosmic puppy with starry fur, loves adventures in space.", "response_id": "01jn4dawj8sq0c6t3emf4k5ryx", "conversation_id": "01jn4dawj8sq0c6t3emf4k5ryx"} +{"name": "Echo", "ten_word_bio": "A clever hound with extraordinary hearing, master of hide-and-seek.", "response_id": "01jn4dawj8sq0c6t3emf4k5ryx", "conversation_id": "01jn4dawj8sq0c6t3emf4k5ryx"} +{"name": "Biscuit", "ten_word_bio": "An adorable chef dog, bakes treats that everyone loves.", "response_id": "01jn4dawj8sq0c6t3emf4k5ryx", "conversation_id": "01jn4dawj8sq0c6t3emf4k5ryx"} +{"name": "Cosmo", "ten_word_bio": "Galactic explorer, loves adventures and chasing shooting stars.", "response_id": "01jn4daycb3svj0x7kvp7zrp4q", "conversation_id": "01jn4daycb3svj0x7kvp7zrp4q"} +{"name": "Pixel", "ten_word_bio": "Tech-savvy pup, builds gadgets and loves virtual playtime.", "response_id": "01jn4daycb3svj0x7kvp7zrp4q", "conversation_id": "01jn4daycb3svj0x7kvp7zrp4q"} +``` +If a row already has a property called `"conversation_id"` or `"response_id"` additional underscores will be appended to the ID key until it no longer overlaps with the existing keys. 
\ No newline at end of file
diff --git a/llm/cli.py b/llm/cli.py
index 7dd6543..16a0d65 100644
--- a/llm/cli.py
+++ b/llm/cli.py
@@ -48,6 +48,7 @@ from .utils import (
     schema_summary,
     multi_schema,
     schema_dsl,
+    find_unused_key,
 )
 import base64
 import httpx
@@ -939,6 +940,9 @@ order by prompt_attachments."order"
 )
 @click.option("--data-array", is_flag=True, help="Output JSON array of data for schema")
 @click.option("--data-key", help="Return JSON objects from array in this key")
+@click.option(
+    "--data-ids", is_flag=True, help="Attach corresponding IDs to JSON objects"
+)
 @click.option("-t", "--truncate", is_flag=True, help="Truncate long strings in output")
 @click.option(
     "-s", "--short", is_flag=True, help="Shorter YAML output with truncated prompts"
@@ -983,6 +987,7 @@ def logs_list(
     data,
     data_array,
     data_key,
+    data_ids,
     truncate,
     short,
     usage,
@@ -1099,22 +1104,32 @@ def logs_list(
     for attachment in attachments:
         attachments_by_id.setdefault(attachment["response_id"], []).append(attachment)
 
-    if data or data_array or data_key:
+    if data or data_array or data_key or data_ids:
         # Special case for --data to output valid JSON
         to_output = []
         for row in rows:
             response = row["response"] or ""
             try:
                 decoded = json.loads(response)
+                new_items = []
                 if (
                     isinstance(decoded, dict)
                     and (data_key in decoded)
                     and all(isinstance(item, dict) for item in decoded[data_key])
                 ):
                     for item in decoded[data_key]:
-                        to_output.append(item)
+                        new_items.append(item)
                 else:
-                    to_output.append(decoded)
+                    new_items.append(decoded)
+                if data_ids:
+                    for item in new_items:
+                        if not isinstance(item, dict):
+                            # Lists, strings and numbers cannot carry ID keys
+                            continue
+                        item[find_unused_key(item, "response_id")] = row["id"]
+                        conversation_key = find_unused_key(item, "conversation_id")
+                        item[conversation_key] = row["conversation_id"]
+                to_output.extend(new_items)
             except ValueError:
                 pass
         click.echo(output_rows_as_json(to_output, not data_array))
diff --git a/llm/utils.py b/llm/utils.py
index 946df69..dd4ecfb 100644
--- a/llm/utils.py
+++ b/llm/utils.py
@@ -384,3 +384,10 @@ def multi_schema(schema: dict) -> dict:
         "properties": {"items": {"type": "array", "items": schema}},
         "required": ["items"],
     }
+
+
+def find_unused_key(item: dict, key: str) -> str:
+    'Return unused key, e.g. for {"id": "1"} and key "id" returns "id_"'
+    while key in item:
+        key += "_"
+    return key
diff --git a/tests/test_llm_logs.py b/tests/test_llm_logs.py
index b7ef6a3..79f312e 100644
--- a/tests/test_llm_logs.py
+++ b/tests/test_llm_logs.py
@@ -353,3 +353,61 @@ def test_logs_schema(schema_log_path, args, expected):
     )
     assert result.exit_code == 0
     assert result.output == expected
+
+
+def test_logs_schema_data_ids(schema_log_path):
+    db = sqlite_utils.Database(schema_log_path)
+    ulid = ULID.from_timestamp(time.time() + 100)
+    db["responses"].insert(
+        {
+            "id": str(ulid).lower(),
+            "system": "system",
+            "prompt": "prompt",
+            "response": json.dumps(
+                {
+                    "name": "three",
+                    "response_id": 1,
+                    "conversation_id": 2,
+                    "conversation_id_": 3,
+                }
+            ),
+            "model": "davinci",
+            "datetime_utc": ulid.datetime.isoformat(),
+            "conversation_id": "abc123",
+            "input_tokens": 2,
+            "output_tokens": 5,
+            "schema_id": SINGLE_ID,
+        }
+    )
+    runner = CliRunner()
+    result = runner.invoke(
+        cli,
+        [
+            "logs",
+            "-n",
+            "0",
+            "-p",
+            str(schema_log_path),
+            "--data-ids",
+            "--data-key",
+            "items",
+            "--data-array",
+        ],
+        catch_exceptions=False,
+    )
+    assert result.exit_code == 0
+    rows = json.loads(result.output)
+    last_row = rows.pop(-1)
+    assert set(last_row.keys()) == {
+        "conversation_id_",
+        "conversation_id",
+        "response_id",
+        "response_id_",
+        "name",
+        "conversation_id__",
+    }
+    # Attached IDs must come from the database row and land in the first unused keys
+    assert last_row["response_id_"] == str(ulid).lower()
+    assert last_row["conversation_id__"] == "abc123"
+    for row in rows:
+        assert set(row.keys()) == {"conversation_id", "response_id", "name"}