llm embed-multi --prepend option (#746)

* llm embed-multi --prepend option Closes #745
2026-05-02 19:04:52 +00:00 · 2025-02-12 15:19:18 -08:00 · 2025-02-12 15:19:18 -08:00 · 9a1374b447
commit 9a1374b447
parent f67c21522b
4 changed files with 61 additions and 25 deletions
--- a/docs/embeddings/cli.md
+++ b/docs/embeddings/cli.md
@ -148,8 +148,11 @@ All three mechanisms support these options:
 - `-d database.db` to specify a different database file to store the embeddings in
 - `--store` to store the original content in the embeddings table in addition to the embedding vector
 - `--prefix` to prepend a prefix to the stored ID of each item
+- `--prepend` to prepend a string to the content before embedding 
 - `--batch-size SIZE` to process embeddings in batches of the specified size

+The `--prepend` option is useful for embedding models that require you to prepend a special token to the content before embedding it. [nomic-embed-text-v2-moe](https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe), for example, requires documents to be prepended with `'search_document: '` and search queries to be prepended with `'search_query: '`.
+
 (embeddings-cli-embed-multi-csv-etc)=
 ### Embedding data from a CSV, TSV or JSON file

--- a/docs/help.md
+++ b/docs/help.md
@ -573,6 +573,8 @@ Options:
  --batch-size INTEGER         Batch size to use when running embeddings
  --prefix TEXT                Prefix to add to the IDs
  -m, --model TEXT             Embedding model to use
+  --prepend TEXT               Prepend this string to all content before
+                               embedding
  --store                      Store the text itself in the database
  -d, --database FILE
  --help                       Show this message and exit.
--- a/llm/cli.py
+++ b/llm/cli.py
@ -1574,6 +1574,10 @@ def embed(
 )
@click.option("--prefix", help="Prefix to add to the IDs", default="")
@click.option("-m", "--model", help="Embedding model to use")
+@click.option(
+    "--prepend",
+    help="Prepend this string to all content before embedding",
+)
@click.option("--store", is_flag=True, help="Store the text itself in the database")
@click.option(
    "-d",
@ -1593,6 +1597,7 @@ def embed_multi(
    batch_size,
    prefix,
    model,
+    prepend,
    store,
    database,
 ):
@ -1715,11 +1720,15 @@ def embed_multi(
        def tuples() -> Iterable[Tuple[str, Union[bytes, str]]]:
            for row in rows:
                values = list(row.values())
-                id = prefix + str(values[0])
+                id: str = prefix + str(values[0])
+                content: Optional[Union[bytes, str]] = None
                if binary:
-                    yield id, cast(bytes, values[1])
+                    content = cast(bytes, values[1])
                else:
-                    yield id, " ".join(v or "" for v in values[1:])
+                    content = " ".join(v or "" for v in values[1:])
+                if prepend and isinstance(content, str):
+                    content = prepend + content
+                yield id, content or ""

        embed_kwargs = {"store": store}
        if batch_size:
--- a/tests/test_embed_cli.py
+++ b/tests/test_embed_cli.py
@ -250,6 +250,7 @@ def test_similar_by_content_cli(tmpdir, user_path_with_embeddings, scenario):

@pytest.mark.parametrize("use_stdin", (False, True))
@pytest.mark.parametrize("prefix", (None, "prefix"))
+@pytest.mark.parametrize("prepend", (None, "search_document: "))
@pytest.mark.parametrize(
    "filename,content",
    (
@ -265,7 +266,7 @@ def test_similar_by_content_cli(tmpdir, user_path_with_embeddings, scenario):
        ),
    ),
 )
-def test_embed_multi_file_input(tmpdir, use_stdin, prefix, filename, content):
+def test_embed_multi_file_input(tmpdir, use_stdin, prefix, prepend, filename, content):
    db_path = tmpdir / "embeddings.db"
    args = ["embed-multi", "phrases", "-d", str(db_path), "-m", "embed-demo"]
    input = None
@ -278,6 +279,8 @@ def test_embed_multi_file_input(tmpdir, use_stdin, prefix, filename, content):
        args.append(str(path))
    if prefix:
        args.extend(("--prefix", prefix))
+    if prepend:
+        args.extend(("--prepend", prepend))
    # Auto-detection can't detect JSON-nl, so make that explicit
    if filename.endswith(".jsonl"):
        args.extend(("--format", "nl"))
@ -325,7 +328,8 @@ def test_embed_multi_files_binary_store(tmpdir):

@pytest.mark.parametrize("use_other_db", (True, False))
@pytest.mark.parametrize("prefix", (None, "prefix"))
-def test_embed_multi_sql(tmpdir, use_other_db, prefix):
+@pytest.mark.parametrize("prepend", (None, "search_document: "))
+def test_embed_multi_sql(tmpdir, use_other_db, prefix, prepend):
    db_path = str(tmpdir / "embeddings.db")
    db = sqlite_utils.Database(db_path)
    extra_args = []
@ -336,6 +340,8 @@ def test_embed_multi_sql(tmpdir, use_other_db, prefix):

    if prefix:
        extra_args.extend(("--prefix", prefix))
+    if prepend:
+        extra_args.extend(("--prepend", prepend))

    db["content"].insert_all(
        [
@ -365,8 +371,14 @@ def test_embed_multi_sql(tmpdir, use_other_db, prefix):
    assert embeddings_db["embeddings"].count == 2
    rows = list(embeddings_db.query("select id, content from embeddings order by id"))
    assert rows == [
-        {"id": (prefix or "") + "1", "content": "cli Command line interface"},
-        {"id": (prefix or "") + "2", "content": "sql Structured query language"},
+        {
+            "id": (prefix or "") + "1",
+            "content": (prepend or "") + "cli Command line interface",
+        },
+        {
+            "id": (prefix or "") + "2",
+            "content": (prepend or "") + "sql Structured query language",
+        },
    ]


@ -425,7 +437,8 @@ def multi_files(tmpdir):

@pytest.mark.xfail(sys.platform == "win32", reason="Expected to fail on Windows")
@pytest.mark.parametrize("scenario", ("single", "multi"))
-def test_embed_multi_files(multi_files, scenario):
+@pytest.mark.parametrize("prepend", (None, "search_document: "))
+def test_embed_multi_files(multi_files, scenario, prepend):
    db_path, files = multi_files
    for filename, content in (
        ("file1.txt", b"hello world"),
@ -440,17 +453,23 @@ def test_embed_multi_files(multi_files, scenario):
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_bytes(content)

+    extra_args = []
+
+    if prepend:
+        extra_args.extend(("--prepend", prepend))
    if scenario == "single":
-        extra_args = ["--files", str(files), "**/*.txt"]
+        extra_args.extend(["--files", str(files), "**/*.txt"])
    else:
-        extra_args = [
-            "--files",
-            str(files / "nested" / "more"),
-            "**/*.ini",
-            "--files",
-            str(files / "nested"),
-            "*.txt",
-        ]
+        extra_args.extend(
+            [
+                "--files",
+                str(files / "nested" / "more"),
+                "**/*.ini",
+                "--files",
+                str(files / "nested"),
+                "*.txt",
+            ]
+        )

    runner = CliRunner()
    result = runner.invoke(
@ -471,17 +490,20 @@ def test_embed_multi_files(multi_files, scenario):
    rows = list(embeddings_db.query("select id, content from embeddings order by id"))
    if scenario == "single":
        assert rows == [
-            {"id": "file1.txt", "content": "hello world"},
-            {"id": "file2.txt", "content": "goodbye world"},
-            {"id": "nested/more/three.txt", "content": "three"},
-            {"id": "nested/one.txt", "content": "one"},
-            {"id": "nested/two.txt", "content": "two"},
+            {"id": "file1.txt", "content": (prepend or "") + "hello world"},
+            {"id": "file2.txt", "content": (prepend or "") + "goodbye world"},
+            {"id": "nested/more/three.txt", "content": (prepend or "") + "three"},
+            {"id": "nested/one.txt", "content": (prepend or "") + "one"},
+            {"id": "nested/two.txt", "content": (prepend or "") + "two"},
        ]
    else:
        assert rows == [
-            {"id": "ignored.ini", "content": "Has weird \x96 character"},
-            {"id": "one.txt", "content": "one"},
-            {"id": "two.txt", "content": "two"},
+            {
+                "id": "ignored.ini",
+                "content": (prepend or "") + "Has weird \x96 character",
+            },
+            {"id": "one.txt", "content": (prepend or "") + "one"},
+            {"id": "two.txt", "content": (prepend or "") + "two"},
        ]