From 31d264d9a9150284ac572f3d892243a0cf9c70ab Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 9 Mar 2025 18:56:20 -0500 Subject: [PATCH] Improved llm embed-multi docs, closes #824 --- docs/help.md | 41 +++++++++++++++++++++++++++++------------ llm/cli.py | 44 +++++++++++++++++++++++++++++++++----------- 2 files changed, 62 insertions(+), 23 deletions(-) diff --git a/docs/help.md b/docs/help.md index 059cce5..7749cab 100644 --- a/docs/help.md +++ b/docs/help.md @@ -70,7 +70,7 @@ Commands: collections View and manage collections of embeddings embed Embed text and store or return the result embed-models Manage available embedding models - embed-multi Store embeddings for multiple strings at once + embed-multi Store embeddings for multiple strings at once in the... install Install packages from PyPI into the same environment as LLM keys Manage stored API keys for different models logs Tools for exploring logged prompts and responses @@ -621,25 +621,42 @@ Options: ``` Usage: llm embed-multi [OPTIONS] COLLECTION [INPUT_PATH] - Store embeddings for multiple strings at once - - Input can be CSV, TSV or a JSON list of objects. - - The first column is treated as an ID - all other columns are assumed to be - text that should be concatenated together in order to calculate the - embeddings. + Store embeddings for multiple strings at once in the specified collection. Input data can come from one of three sources: - 1. A CSV, JSON, TSV or JSON-nl file (including on standard input) - 2. A SQL query against a SQLite database - 3. A directory of files + 1. A CSV, TSV, JSON or JSONL file: + - CSV/TSV: First column is ID, remaining columns concatenated as content + - JSON: Array of objects with "id" field and content fields + - JSONL: Newline-delimited JSON objects + + Examples: + llm embed-multi docs input.csv + cat data.json | llm embed-multi docs - + llm embed-multi docs input.json --format json + + 2. A SQL query against a SQLite database: + - First column returned is used as ID + - Other columns concatenated to form content + + Examples: + llm embed-multi docs --sql "SELECT id, title, body FROM posts" + llm embed-multi docs --attach blog blog.db --sql "SELECT id, content FROM blog.posts" + + 3. Files in directories matching glob patterns: + - Each file becomes one embedding + - Relative file paths become IDs + + Examples: + llm embed-multi docs --files docs '**/*.md' + llm embed-multi images --files photos '*.jpg' --binary + llm embed-multi texts --files texts '*.txt' --encoding utf-8 --encoding latin-1 Options: --format [json|csv|tsv|nl] Format of input file - defaults to auto-detect --files ... Embed files in this directory - specify directory and glob pattern - --encoding TEXT Encoding to use when reading --files + --encoding TEXT Encodings to try when reading --files --binary Treat --files as binary data --sql TEXT Read input using this SQL query --attach ... Additional databases to attach - specify alias diff --git a/llm/cli.py b/llm/cli.py index 0723030..b021c5f 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -1889,7 +1889,7 @@ def embed( @click.option( "encodings", "--encoding", - help="Encoding to use when reading --files", + help="Encodings to try when reading --files", multiple=True, ) @click.option("--binary", is_flag=True, help="Treat --files as binary data") @@ -1933,20 +1933,42 @@ def embed_multi( database, ): """ - Store embeddings for multiple strings at once - - Input can be CSV, TSV or a JSON list of objects. - - The first column is treated as an ID - all other columns - are assumed to be text that should be concatenated together - in order to calculate the embeddings. + Store embeddings for multiple strings at once in the specified collection. Input data can come from one of three sources: \b - 1. A CSV, JSON, TSV or JSON-nl file (including on standard input) - 2. A SQL query against a SQLite database - 3. A directory of files + 1. A CSV, TSV, JSON or JSONL file: + - CSV/TSV: First column is ID, remaining columns concatenated as content + - JSON: Array of objects with "id" field and content fields + - JSONL: Newline-delimited JSON objects + + \b + Examples: + llm embed-multi docs input.csv + cat data.json | llm embed-multi docs - + llm embed-multi docs input.json --format json + + \b + 2. A SQL query against a SQLite database: + - First column returned is used as ID + - Other columns concatenated to form content + + \b + Examples: + llm embed-multi docs --sql "SELECT id, title, body FROM posts" + llm embed-multi docs --attach blog blog.db --sql "SELECT id, content FROM blog.posts" + + \b + 3. Files in directories matching glob patterns: + - Each file becomes one embedding + - Relative file paths become IDs + + \b + Examples: + llm embed-multi docs --files docs '**/*.md' + llm embed-multi images --files photos '*.jpg' --binary + llm embed-multi texts --files texts '*.txt' --encoding utf-8 --encoding latin-1 """ if binary and not files: raise click.UsageError("--binary must be used with --files")