From 31d264d9a9150284ac572f3d892243a0cf9c70ab Mon Sep 17 00:00:00 2001
From: Simon Willison <swillison@gmail.com>
Date: Sun, 9 Mar 2025 18:56:20 -0500
Subject: [PATCH] Improved llm embed-multi docs, closes #824

---
 docs/help.md | 41 +++++++++++++++++++++++++++++------------
 llm/cli.py   | 44 +++++++++++++++++++++++++++++++++-----------
 2 files changed, 62 insertions(+), 23 deletions(-)
diff --git a/docs/help.md b/docs/help.md
index 059cce5..7749cab 100644
--- a/docs/help.md
+++ b/docs/help.md
@@ -70,7 +70,7 @@ Commands:
   collections   View and manage collections of embeddings
   embed         Embed text and store or return the result
   embed-models  Manage available embedding models
-  embed-multi   Store embeddings for multiple strings at once
+  embed-multi   Store embeddings for multiple strings at once in the...
   install       Install packages from PyPI into the same environment as LLM
   keys          Manage stored API keys for different models
   logs          Tools for exploring logged prompts and responses
@@ -621,25 +621,42 @@ Options:
 ```
 Usage: llm embed-multi [OPTIONS] COLLECTION [INPUT_PATH]
 
-  Store embeddings for multiple strings at once
-
-  Input can be CSV, TSV or a JSON list of objects.
-
-  The first column is treated as an ID - all other columns are assumed to be
-  text that should be concatenated together in order to calculate the
-  embeddings.
+  Store embeddings for multiple strings at once in the specified collection.
 
   Input data can come from one of three sources:
 
-  1. A CSV, JSON, TSV or JSON-nl file (including on standard input)
-  2. A SQL query against a SQLite database
-  3. A directory of files
+  1. A CSV, TSV, JSON or JSONL file:
+     - CSV/TSV: First column is ID, remaining columns concatenated as content
+     - JSON: Array of objects with "id" field and content fields
+     - JSONL: Newline-delimited JSON objects
+
+     Examples:
+       llm embed-multi docs input.csv
+       cat data.json | llm embed-multi docs -
+       llm embed-multi docs input.json --format json
+
+  2. A SQL query against a SQLite database:
+     - First column returned is used as ID
+     - Other columns concatenated to form content
+
+     Examples:
+       llm embed-multi docs --sql "SELECT id, title, body FROM posts"
+       llm embed-multi docs --attach blog blog.db --sql "SELECT id, content FROM blog.posts"
+
+  3. Files in directories matching glob patterns:
+     - Each file becomes one embedding
+     - Relative file paths become IDs
+
+     Examples:
+       llm embed-multi docs --files docs '**/*.md'
+       llm embed-multi images --files photos '*.jpg' --binary
+       llm embed-multi texts --files texts '*.txt' --encoding utf-8 --encoding latin-1
 
 Options:
   --format [json|csv|tsv|nl]   Format of input file - defaults to auto-detect
   --files <DIRECTORY TEXT>...  Embed files in this directory - specify directory
                                and glob pattern
-  --encoding TEXT              Encoding to use when reading --files
+  --encoding TEXT              Encodings to try when reading --files
   --binary                     Treat --files as binary data
   --sql TEXT                   Read input using this SQL query
   --attach <TEXT FILE>...      Additional databases to attach - specify alias
diff --git a/llm/cli.py b/llm/cli.py
index 0723030..b021c5f 100644
--- a/llm/cli.py
+++ b/llm/cli.py
@@ -1889,7 +1889,7 @@ def embed(
 @click.option(
     "encodings",
     "--encoding",
-    help="Encoding to use when reading --files",
+    help="Encodings to try when reading --files",
     multiple=True,
 )
 @click.option("--binary", is_flag=True, help="Treat --files as binary data")
@@ -1933,20 +1933,42 @@ def embed_multi(
     database,
 ):
     """
-    Store embeddings for multiple strings at once
-
-    Input can be CSV, TSV or a JSON list of objects.
-
-    The first column is treated as an ID - all other columns
-    are assumed to be text that should be concatenated together
-    in order to calculate the embeddings.
+    Store embeddings for multiple strings at once in the specified collection.
 
     Input data can come from one of three sources:
 
     \b
-    1. A CSV, JSON, TSV or JSON-nl file (including on standard input)
-    2. A SQL query against a SQLite database
-    3. A directory of files
+    1. A CSV, TSV, JSON or JSONL file:
+       - CSV/TSV: First column is ID, remaining columns concatenated as content
+       - JSON: Array of objects with "id" field and content fields
+       - JSONL: Newline-delimited JSON objects
+
+    \b
+       Examples:
+         llm embed-multi docs input.csv
+         cat data.json | llm embed-multi docs -
+         llm embed-multi docs input.json --format json
+
+    \b
+    2. A SQL query against a SQLite database:
+       - First column returned is used as ID
+       - Other columns concatenated to form content
+
+    \b
+       Examples:
+         llm embed-multi docs --sql "SELECT id, title, body FROM posts"
+         llm embed-multi docs --attach blog blog.db --sql "SELECT id, content FROM blog.posts"
+
+    \b
+    3. Files in directories matching glob patterns:
+       - Each file becomes one embedding
+       - Relative file paths become IDs
+
+    \b
+       Examples:
+         llm embed-multi docs --files docs '**/*.md'
+         llm embed-multi images --files photos '*.jpg' --binary
+         llm embed-multi texts --files texts '*.txt' --encoding utf-8 --encoding latin-1
     """
     if binary and not files:
         raise click.UsageError("--binary must be used with --files")