llm/tests/test_llm_logs.py

from click.testing import CliRunner
from llm.cli import cli
from llm.migrations import migrate
from ulid import ULID
import datetime
import json
import pytest
import re
import sqlite_utils
import sys
import time


SINGLE_ID = "5843577700ba729bb14c327b30441885"
MULTI_ID = "4860edd987df587d042a9eb2b299ce5c"


@pytest.fixture
def log_path(user_path):
    log_path = str(user_path / "logs.db")
    db = sqlite_utils.Database(log_path)
    migrate(db)
    start = datetime.datetime.now(datetime.timezone.utc)
    db["responses"].insert_all(
        {
            "id": str(ULID()).lower(),
            "system": "system",
            "prompt": "prompt",
            "response": 'response\n```python\nprint("hello word")\n```',
            "model": "davinci",
            "datetime_utc": (start + datetime.timedelta(seconds=i)).isoformat(),
            "conversation_id": "abc123",
            "input_tokens": 2,
            "output_tokens": 5,
        }
        for i in range(100)
    )
    return log_path


@pytest.fixture
def schema_log_path(user_path):
    log_path = str(user_path / "logs_schema.db")
    db = sqlite_utils.Database(log_path)
    migrate(db)
    start = datetime.datetime.now(datetime.timezone.utc)
    db["schemas"].insert({"id": SINGLE_ID, "content": '{"name": "string"}'})
    db["schemas"].insert({"id": MULTI_ID, "content": '{"name": "array"}'})
    for i in range(2):
        db["responses"].insert(
            {
                "id": str(ULID.from_timestamp(time.time() + i)).lower(),
                "system": "system",
                "prompt": "prompt",
                "response": '{"name": "' + str(i) + '"}',
                "model": "davinci",
                "datetime_utc": (start + datetime.timedelta(seconds=i)).isoformat(),
                "conversation_id": "abc123",
                "input_tokens": 2,
                "output_tokens": 5,
                "schema_id": SINGLE_ID,
            }
        )
    for j in range(4):
        db["responses"].insert(
            {
                "id": str(ULID.from_timestamp(time.time() + j)).lower(),
                "system": "system",
                "prompt": "prompt",
                "response": '{"items": [{"name": "one"}, {"name": "two"}]}',
                "model": "davinci",
                "datetime_utc": (start + datetime.timedelta(seconds=i)).isoformat(),
                "conversation_id": "abc456",
                "input_tokens": 2,
                "output_tokens": 5,
                "schema_id": MULTI_ID,
            }
        )

    return log_path


datetime_re = re.compile(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}")
id_re = re.compile(r"id: \w+")


@pytest.mark.parametrize("usage", (False, True))
def test_logs_text(log_path, usage):
    runner = CliRunner()
    args = ["logs", "-p", str(log_path)]
    if usage:
        args.append("-u")
    result = runner.invoke(cli, args, catch_exceptions=False)
    assert result.exit_code == 0
    output = result.output
    # Replace 2023-08-17T20:53:58 with YYYY-MM-DDTHH:MM:SS
    output = datetime_re.sub("YYYY-MM-DDTHH:MM:SS", output)
    # Replace id: whatever with id: xxx
    output = id_re.sub("id: xxx", output)
    expected = (
        (
            "# YYYY-MM-DDTHH:MM:SS    conversation: abc123 id: xxx\n\n"
            "Model: **davinci**\n\n"
            "## Prompt\n\n"
            "prompt\n\n"
            "## System\n\n"
            "system\n\n"
            "## Response\n\n"
            'response\n```python\nprint("hello word")\n```\n\n'
        )
        + ("## Token usage:\n\n2 input, 5 output\n\n" if usage else "")
        + (
            "# YYYY-MM-DDTHH:MM:SS    conversation: abc123 id: xxx\n\n"
            "Model: **davinci**\n\n"
            "## Prompt\n\n"
            "prompt\n\n"
            "## Response\n\n"
            'response\n```python\nprint("hello word")\n```\n\n'
        )
        + ("## Token usage:\n\n2 input, 5 output\n\n" if usage else "")
        + (
            "# YYYY-MM-DDTHH:MM:SS    conversation: abc123 id: xxx\n\n"
            "Model: **davinci**\n\n"
            "## Prompt\n\n"
            "prompt\n\n"
            "## Response\n\n"
            'response\n```python\nprint("hello word")\n```\n\n'
        )
        + ("## Token usage:\n\n2 input, 5 output\n\n" if usage else "")
    )
    assert output == expected


@pytest.mark.parametrize("n", (None, 0, 2))
def test_logs_json(n, log_path):
    "Test that logs command correctly returns requested -n records"
    runner = CliRunner()
    args = ["logs", "-p", str(log_path), "--json"]
    if n is not None:
        args.extend(["-n", str(n)])
    result = runner.invoke(cli, args, catch_exceptions=False)
    assert result.exit_code == 0
    logs = json.loads(result.output)
    expected_length = 3
    if n is not None:
        if n == 0:
            expected_length = 100
        else:
            expected_length = n
    assert len(logs) == expected_length


@pytest.mark.parametrize(
    "args", (["-r"], ["--response"], ["list", "-r"], ["list", "--response"])
)
def test_logs_response_only(args, log_path):
    "Test that logs -r/--response returns just the last response"
    runner = CliRunner()
    result = runner.invoke(cli, ["logs"] + args, catch_exceptions=False)
    assert result.exit_code == 0
    assert result.output == 'response\n```python\nprint("hello word")\n```\n'


@pytest.mark.parametrize(
    "args",
    (
        ["-x"],
        ["--extract"],
        ["list", "-x"],
        ["list", "--extract"],
        # Using -xr together should have same effect as just -x
        ["-xr"],
        ["-x", "-r"],
        ["--extract", "--response"],
    ),
)
def test_logs_extract_first_code(args, log_path):
    "Test that logs -x/--extract returns the first code block"
    runner = CliRunner()
    result = runner.invoke(cli, ["logs"] + args, catch_exceptions=False)
    assert result.exit_code == 0
    assert result.output == 'print("hello word")\n\n'


@pytest.mark.parametrize(
    "args",
    (
        ["--xl"],
        ["--extract-last"],
        ["list", "--xl"],
        ["list", "--extract-last"],
        ["--xl", "-r"],
        ["-x", "--xl"],
    ),
)
def test_logs_extract_last_code(args, log_path):
    "Test that logs --xl/--extract-last returns the last code block"
    runner = CliRunner()
    result = runner.invoke(cli, ["logs"] + args, catch_exceptions=False)
    assert result.exit_code == 0
    assert result.output == 'print("hello word")\n\n'


@pytest.mark.parametrize("arg", ("-s", "--short"))
@pytest.mark.parametrize("usage", (None, "-u", "--usage"))
def test_logs_short(log_path, arg, usage):
    runner = CliRunner()
    args = ["logs", arg, "-p", str(log_path)]
    if usage:
        args.append(usage)
    result = runner.invoke(cli, args)
    assert result.exit_code == 0
    output = datetime_re.sub("YYYY-MM-DDTHH:MM:SS", result.output)
    expected_usage = ""
    if usage:
        expected_usage = "  usage:\n    input: 2\n    output: 5\n"
    expected = (
        "- model: davinci\n"
        "  datetime: 'YYYY-MM-DDTHH:MM:SS'\n"
        "  conversation: abc123\n"
        "  system: system\n"
        f"  prompt: prompt\n{expected_usage}"
        "- model: davinci\n"
        "  datetime: 'YYYY-MM-DDTHH:MM:SS'\n"
        "  conversation: abc123\n"
        "  system: system\n"
        f"  prompt: prompt\n{expected_usage}"
        "- model: davinci\n"
        "  datetime: 'YYYY-MM-DDTHH:MM:SS'\n"
        "  conversation: abc123\n"
        "  system: system\n"
        f"  prompt: prompt\n{expected_usage}"
    )
    assert output == expected


@pytest.mark.xfail(sys.platform == "win32", reason="Expected to fail on Windows")
@pytest.mark.parametrize("env", ({}, {"LLM_USER_PATH": "/tmp/llm-user-path"}))
def test_logs_path(monkeypatch, env, user_path):
    for key, value in env.items():
        monkeypatch.setenv(key, value)
    runner = CliRunner()
    result = runner.invoke(cli, ["logs", "path"])
    assert result.exit_code == 0
    if env:
        expected = env["LLM_USER_PATH"] + "/logs.db"
    else:
        expected = str(user_path) + "/logs.db"
    assert result.output.strip() == expected


@pytest.mark.parametrize("model", ("davinci", "curie"))
def test_logs_filtered(user_path, model):
    log_path = str(user_path / "logs.db")
    db = sqlite_utils.Database(log_path)
    migrate(db)
    db["responses"].insert_all(
        {
            "id": str(ULID()).lower(),
            "system": "system",
            "prompt": "prompt",
            "response": "response",
            "model": "davinci" if i % 2 == 0 else "curie",
        }
        for i in range(100)
    )
    runner = CliRunner()
    result = runner.invoke(cli, ["logs", "list", "-m", model, "--json"])
    assert result.exit_code == 0
    records = json.loads(result.output.strip())
    assert all(record["model"] == model for record in records)


@pytest.mark.parametrize(
    "query,extra_args,expected",
    (
        # With no search term order should be by datetime
        ("", [], ["doc1", "doc2", "doc3"]),
        # With a search it's order by rank instead
        ("llama", [], ["doc1", "doc3"]),
        ("alpaca", [], ["doc2"]),
        # Model filter should work too
        ("llama", ["-m", "davinci"], ["doc1", "doc3"]),
        ("llama", ["-m", "davinci2"], []),
    ),
)
def test_logs_search(user_path, query, extra_args, expected):
    log_path = str(user_path / "logs.db")
    db = sqlite_utils.Database(log_path)
    migrate(db)

    def _insert(id, text):
        db["responses"].insert(
            {
                "id": id,
                "system": "system",
                "prompt": text,
                "response": "response",
                "model": "davinci",
            }
        )

    _insert("doc1", "llama")
    _insert("doc2", "alpaca")
    _insert("doc3", "llama llama")
    runner = CliRunner()
    result = runner.invoke(cli, ["logs", "list", "-q", query, "--json"] + extra_args)
    assert result.exit_code == 0
    records = json.loads(result.output.strip())
    assert [record["id"] for record in records] == expected


@pytest.mark.parametrize(
    "args,expected",
    (
        (["--data", "--schema", SINGLE_ID], '{"name": "1"}\n{"name": "0"}\n'),
        (
            ["--data", "--schema", MULTI_ID],
            (
                '{"items": [{"name": "one"}, {"name": "two"}]}\n'
                '{"items": [{"name": "one"}, {"name": "two"}]}\n'
                '{"items": [{"name": "one"}, {"name": "two"}]}\n'
                '{"items": [{"name": "one"}, {"name": "two"}]}\n'
            ),
        ),
        (
            ["--data-array", "--schema", MULTI_ID],
            (
                '[{"items": [{"name": "one"}, {"name": "two"}]},\n'
                ' {"items": [{"name": "one"}, {"name": "two"}]},\n'
                ' {"items": [{"name": "one"}, {"name": "two"}]},\n'
                ' {"items": [{"name": "one"}, {"name": "two"}]}]\n'
            ),
        ),
        (
            ["--schema", MULTI_ID, "--data-key", "items"],
            (
                '{"name": "one"}\n'
                '{"name": "two"}\n'
                '{"name": "one"}\n'
                '{"name": "two"}\n'
                '{"name": "one"}\n'
                '{"name": "two"}\n'
                '{"name": "one"}\n'
                '{"name": "two"}\n'
            ),
        ),
    ),
)
def test_logs_schema(schema_log_path, args, expected):
    runner = CliRunner()
    result = runner.invoke(
        cli,
        ["logs", "-n", "0", "-p", str(schema_log_path)] + args,
        catch_exceptions=False,
    )
    assert result.exit_code == 0
    assert result.output == expected


def test_logs_schema_data_ids(schema_log_path):
    db = sqlite_utils.Database(schema_log_path)
    ulid = ULID.from_timestamp(time.time() + 100)
    db["responses"].insert(
        {
            "id": str(ulid).lower(),
            "system": "system",
            "prompt": "prompt",
            "response": json.dumps(
                {
                    "name": "three",
                    "response_id": 1,
                    "conversation_id": 2,
                    "conversation_id_": 3,
                }
            ),
            "model": "davinci",
            "datetime_utc": ulid.datetime.isoformat(),
            "conversation_id": "abc123",
            "input_tokens": 2,
            "output_tokens": 5,
            "schema_id": SINGLE_ID,
        }
    )
    runner = CliRunner()
    result = runner.invoke(
        cli,
        [
            "logs",
            "-n",
            "0",
            "-p",
            str(schema_log_path),
            "--data-ids",
            "--data-key",
            "items",
            "--data-array",
        ],
        catch_exceptions=False,
    )
    assert result.exit_code == 0
    rows = json.loads(result.output)
    last_row = rows.pop(-1)
    assert set(last_row.keys()) == {
        "conversation_id_",
        "conversation_id",
        "response_id",
        "response_id_",
        "name",
        "conversation_id__",
    }
    for row in rows:
        assert set(row.keys()) == {"conversation_id", "response_id", "name"}