mirror of
https://github.com/Hopiu/llm.git
synced 2026-05-02 19:04:52 +00:00
llm embed-multi --prepend option (#746)
* llm embed-multi --prepend option Closes #745
This commit is contained in:
parent
f67c21522b
commit
9a1374b447
4 changed files with 61 additions and 25 deletions
|
|
@ -148,8 +148,11 @@ All three mechanisms support these options:
|
|||
- `-d database.db` to specify a different database file to store the embeddings in
|
||||
- `--store` to store the original content in the embeddings table in addition to the embedding vector
|
||||
- `--prefix` to prepend a prefix to the stored ID of each item
|
||||
- `--prepend` to prepend a string to the content before embedding
|
||||
- `--batch-size SIZE` to process embeddings in batches of the specified size
|
||||
|
||||
The `--prepend` option is useful for embedding models that require you to prepend a special token to the content before embedding it. For example, [nomic-embed-text-v2-moe](https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe) requires documents to be prepended with `'search_document: '` and search queries to be prepended with `'search_query: '`.
|
||||
|
||||
(embeddings-cli-embed-multi-csv-etc)=
|
||||
### Embedding data from a CSV, TSV or JSON file
|
||||
|
||||
|
|
|
|||
|
|
@ -573,6 +573,8 @@ Options:
|
|||
--batch-size INTEGER Batch size to use when running embeddings
|
||||
--prefix TEXT Prefix to add to the IDs
|
||||
-m, --model TEXT Embedding model to use
|
||||
--prepend TEXT Prepend this string to all content before
|
||||
embedding
|
||||
--store Store the text itself in the database
|
||||
-d, --database FILE
|
||||
--help Show this message and exit.
|
||||
|
|
|
|||
15
llm/cli.py
15
llm/cli.py
|
|
@ -1574,6 +1574,10 @@ def embed(
|
|||
)
|
||||
@click.option("--prefix", help="Prefix to add to the IDs", default="")
|
||||
@click.option("-m", "--model", help="Embedding model to use")
|
||||
@click.option(
|
||||
"--prepend",
|
||||
help="Prepend this string to all content before embedding",
|
||||
)
|
||||
@click.option("--store", is_flag=True, help="Store the text itself in the database")
|
||||
@click.option(
|
||||
"-d",
|
||||
|
|
@ -1593,6 +1597,7 @@ def embed_multi(
|
|||
batch_size,
|
||||
prefix,
|
||||
model,
|
||||
prepend,
|
||||
store,
|
||||
database,
|
||||
):
|
||||
|
|
@ -1715,11 +1720,15 @@ def embed_multi(
|
|||
def tuples() -> Iterable[Tuple[str, Union[bytes, str]]]:
|
||||
for row in rows:
|
||||
values = list(row.values())
|
||||
id = prefix + str(values[0])
|
||||
id: str = prefix + str(values[0])
|
||||
content: Optional[Union[bytes, str]] = None
|
||||
if binary:
|
||||
yield id, cast(bytes, values[1])
|
||||
content = cast(bytes, values[1])
|
||||
else:
|
||||
yield id, " ".join(v or "" for v in values[1:])
|
||||
content = " ".join(v or "" for v in values[1:])
|
||||
if prepend and isinstance(content, str):
|
||||
content = prepend + content
|
||||
yield id, content or ""
|
||||
|
||||
embed_kwargs = {"store": store}
|
||||
if batch_size:
|
||||
|
|
|
|||
|
|
@ -250,6 +250,7 @@ def test_similar_by_content_cli(tmpdir, user_path_with_embeddings, scenario):
|
|||
|
||||
@pytest.mark.parametrize("use_stdin", (False, True))
|
||||
@pytest.mark.parametrize("prefix", (None, "prefix"))
|
||||
@pytest.mark.parametrize("prepend", (None, "search_document: "))
|
||||
@pytest.mark.parametrize(
|
||||
"filename,content",
|
||||
(
|
||||
|
|
@ -265,7 +266,7 @@ def test_similar_by_content_cli(tmpdir, user_path_with_embeddings, scenario):
|
|||
),
|
||||
),
|
||||
)
|
||||
def test_embed_multi_file_input(tmpdir, use_stdin, prefix, filename, content):
|
||||
def test_embed_multi_file_input(tmpdir, use_stdin, prefix, prepend, filename, content):
|
||||
db_path = tmpdir / "embeddings.db"
|
||||
args = ["embed-multi", "phrases", "-d", str(db_path), "-m", "embed-demo"]
|
||||
input = None
|
||||
|
|
@ -278,6 +279,8 @@ def test_embed_multi_file_input(tmpdir, use_stdin, prefix, filename, content):
|
|||
args.append(str(path))
|
||||
if prefix:
|
||||
args.extend(("--prefix", prefix))
|
||||
if prepend:
|
||||
args.extend(("--prepend", prepend))
|
||||
# Auto-detection can't detect JSON-nl, so make that explicit
|
||||
if filename.endswith(".jsonl"):
|
||||
args.extend(("--format", "nl"))
|
||||
|
|
@ -325,7 +328,8 @@ def test_embed_multi_files_binary_store(tmpdir):
|
|||
|
||||
@pytest.mark.parametrize("use_other_db", (True, False))
|
||||
@pytest.mark.parametrize("prefix", (None, "prefix"))
|
||||
def test_embed_multi_sql(tmpdir, use_other_db, prefix):
|
||||
@pytest.mark.parametrize("prepend", (None, "search_document: "))
|
||||
def test_embed_multi_sql(tmpdir, use_other_db, prefix, prepend):
|
||||
db_path = str(tmpdir / "embeddings.db")
|
||||
db = sqlite_utils.Database(db_path)
|
||||
extra_args = []
|
||||
|
|
@ -336,6 +340,8 @@ def test_embed_multi_sql(tmpdir, use_other_db, prefix):
|
|||
|
||||
if prefix:
|
||||
extra_args.extend(("--prefix", prefix))
|
||||
if prepend:
|
||||
extra_args.extend(("--prepend", prepend))
|
||||
|
||||
db["content"].insert_all(
|
||||
[
|
||||
|
|
@ -365,8 +371,14 @@ def test_embed_multi_sql(tmpdir, use_other_db, prefix):
|
|||
assert embeddings_db["embeddings"].count == 2
|
||||
rows = list(embeddings_db.query("select id, content from embeddings order by id"))
|
||||
assert rows == [
|
||||
{"id": (prefix or "") + "1", "content": "cli Command line interface"},
|
||||
{"id": (prefix or "") + "2", "content": "sql Structured query language"},
|
||||
{
|
||||
"id": (prefix or "") + "1",
|
||||
"content": (prepend or "") + "cli Command line interface",
|
||||
},
|
||||
{
|
||||
"id": (prefix or "") + "2",
|
||||
"content": (prepend or "") + "sql Structured query language",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
|
|
@ -425,7 +437,8 @@ def multi_files(tmpdir):
|
|||
|
||||
@pytest.mark.xfail(sys.platform == "win32", reason="Expected to fail on Windows")
|
||||
@pytest.mark.parametrize("scenario", ("single", "multi"))
|
||||
def test_embed_multi_files(multi_files, scenario):
|
||||
@pytest.mark.parametrize("prepend", (None, "search_document: "))
|
||||
def test_embed_multi_files(multi_files, scenario, prepend):
|
||||
db_path, files = multi_files
|
||||
for filename, content in (
|
||||
("file1.txt", b"hello world"),
|
||||
|
|
@ -440,17 +453,23 @@ def test_embed_multi_files(multi_files, scenario):
|
|||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_bytes(content)
|
||||
|
||||
extra_args = []
|
||||
|
||||
if prepend:
|
||||
extra_args.extend(("--prepend", prepend))
|
||||
if scenario == "single":
|
||||
extra_args = ["--files", str(files), "**/*.txt"]
|
||||
extra_args.extend(["--files", str(files), "**/*.txt"])
|
||||
else:
|
||||
extra_args = [
|
||||
"--files",
|
||||
str(files / "nested" / "more"),
|
||||
"**/*.ini",
|
||||
"--files",
|
||||
str(files / "nested"),
|
||||
"*.txt",
|
||||
]
|
||||
extra_args.extend(
|
||||
[
|
||||
"--files",
|
||||
str(files / "nested" / "more"),
|
||||
"**/*.ini",
|
||||
"--files",
|
||||
str(files / "nested"),
|
||||
"*.txt",
|
||||
]
|
||||
)
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(
|
||||
|
|
@ -471,17 +490,20 @@ def test_embed_multi_files(multi_files, scenario):
|
|||
rows = list(embeddings_db.query("select id, content from embeddings order by id"))
|
||||
if scenario == "single":
|
||||
assert rows == [
|
||||
{"id": "file1.txt", "content": "hello world"},
|
||||
{"id": "file2.txt", "content": "goodbye world"},
|
||||
{"id": "nested/more/three.txt", "content": "three"},
|
||||
{"id": "nested/one.txt", "content": "one"},
|
||||
{"id": "nested/two.txt", "content": "two"},
|
||||
{"id": "file1.txt", "content": (prepend or "") + "hello world"},
|
||||
{"id": "file2.txt", "content": (prepend or "") + "goodbye world"},
|
||||
{"id": "nested/more/three.txt", "content": (prepend or "") + "three"},
|
||||
{"id": "nested/one.txt", "content": (prepend or "") + "one"},
|
||||
{"id": "nested/two.txt", "content": (prepend or "") + "two"},
|
||||
]
|
||||
else:
|
||||
assert rows == [
|
||||
{"id": "ignored.ini", "content": "Has weird \x96 character"},
|
||||
{"id": "one.txt", "content": "one"},
|
||||
{"id": "two.txt", "content": "two"},
|
||||
{
|
||||
"id": "ignored.ini",
|
||||
"content": (prepend or "") + "Has weird \x96 character",
|
||||
},
|
||||
{"id": "one.txt", "content": (prepend or "") + "one"},
|
||||
{"id": "two.txt", "content": (prepend or "") + "two"},
|
||||
]
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue