mirror of
https://github.com/Hopiu/llm.git
synced 2026-03-23 07:50:24 +00:00
Default embedding model finishing touches, closes #222
This commit is contained in:
parent
8ce7046d13
commit
0eda99e91c
6 changed files with 87 additions and 23 deletions
|
|
@ -13,21 +13,24 @@ The `llm embed` command can be used to calculate embedding vectors for a string
|
|||
The simplest way to use this command is to pass content to it using the `-c/--content` option, like this:
|
||||
|
||||
```bash
|
||||
llm embed -c 'This is some content'
|
||||
llm embed -c 'This is some content' -m ada-002
|
||||
```
|
||||
The command will return a JSON array of floating point numbers directly to the terminal:
|
||||
`-m ada-002` specifies the OpenAI `ada-002` model. You will need to have set an OpenAI API key using `llm keys set openai` for this to work.
|
||||
|
||||
You can install plugins to access other models. The [llm-sentence-transformers](https://github.com/simonw/llm-sentence-transformers) plugin can be used to run models on your own laptop, such as the [MiniLM-L6](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) model:
|
||||
|
||||
```bash
|
||||
llm install llm-sentence-transformers
|
||||
llm embed -c 'This is some content' -m sentence-transformers/all-MiniLM-L6-v2
|
||||
```
|
||||
|
||||
The `llm embed` command returns a JSON array of floating point numbers directly to the terminal:
|
||||
|
||||
```json
|
||||
[0.123, 0.456, 0.789...]
|
||||
```
|
||||
By default it uses the {ref}`default embedding model <embeddings-cli-embed-models-default>`.
|
||||
You can omit the `-m/--model` option if you set a {ref}`default embedding model <embeddings-cli-embed-models-default>`.
|
||||
|
||||
Use the `-m/--model` option to specify a different model:
|
||||
|
||||
```bash
|
||||
llm -m sentence-transformers/all-MiniLM-L6-v2 \
|
||||
-c 'This is some content'
|
||||
```
|
||||
See {ref}`embeddings-binary` for options to get back embeddings in formats other than JSON.
|
||||
|
||||
(embeddings-collections)=
|
||||
|
|
@ -37,6 +40,11 @@ Embeddings are much more useful if you store them somewhere, so you can calculat
|
|||
|
||||
LLM includes the concept of a "collection" of embeddings. A collection groups together a set of stored embeddings created using the same model, each with a unique ID within that collection.
|
||||
|
||||
First, we'll set a default model so we don't have to keep repeating it:
|
||||
```bash
|
||||
llm embed-models default ada-002
|
||||
```
|
||||
|
||||
The `llm embed` command can store results directly in a named collection like this:
|
||||
|
||||
```bash
|
||||
|
|
|
|||
|
|
@ -106,6 +106,13 @@ A collection instance has the following properties and methods:
|
|||
- `similar_by_vector(vector: List[float], number: int=10, skip_id: str=None)` - returns a list of entries that are most similar to the given embedding vector, optionally skipping the entry with the given ID
|
||||
- `delete()` - deletes the collection and its embeddings from the database
|
||||
|
||||
There is also a `Collection.exists(db, name)` class method which returns a boolean value and can be used to determine if a collection exists or not in a database:
|
||||
|
||||
```python
|
||||
if Collection.exists(db, "entries"):
|
||||
print("The entries collection exists")
|
||||
```
|
||||
|
||||
(embeddings-python-similar)=
|
||||
## Retrieving similar items
|
||||
|
||||
|
|
|
|||
|
|
@ -115,7 +115,7 @@ def get_embedding_model(name):
|
|||
try:
|
||||
return aliases[name]
|
||||
except KeyError:
|
||||
raise UnknownModelError("Unknown model: " + name)
|
||||
raise UnknownModelError("Unknown model: " + str(name))
|
||||
|
||||
|
||||
def get_embedding_model_aliases() -> Dict[str, EmbeddingModel]:
|
||||
|
|
|
|||
28
llm/cli.py
28
llm/cli.py
|
|
@ -929,20 +929,28 @@ def embed(collection, id, input, model, store, database, content, metadata, form
|
|||
model_obj = None
|
||||
if collection:
|
||||
db = get_db()
|
||||
collection_obj = Collection(collection, db, model_id=model)
|
||||
model_obj = collection_obj.model()
|
||||
if Collection.exists(db, collection):
|
||||
# Load existing collection and use its model
|
||||
collection_obj = Collection(collection, db)
|
||||
model_obj = collection_obj.model()
|
||||
else:
|
||||
# We will create a new one, but that means model is required
|
||||
if not model:
|
||||
model = get_default_embedding_model()
|
||||
if model is None:
|
||||
raise click.ClickException(
|
||||
"You need to specify a model (no default model is set)"
|
||||
)
|
||||
collection_obj = Collection(collection, db=db, model_id=model)
|
||||
model_obj = collection_obj.model()
|
||||
|
||||
if model_obj is None:
|
||||
if not model:
|
||||
model = get_default_embedding_model()
|
||||
if model is None:
|
||||
raise click.ClickException(
|
||||
"You need to specify a model (no default model is set)"
|
||||
)
|
||||
try:
|
||||
model_obj = get_embedding_model(model)
|
||||
except UnknownModelError as ex:
|
||||
raise click.ClickException(str(ex))
|
||||
except UnknownModelError:
|
||||
raise click.ClickException(
|
||||
"You need to specify a model (no default model is set)"
|
||||
)
|
||||
|
||||
show_output = True
|
||||
if collection and (format_ is None):
|
||||
|
|
|
|||
|
|
@ -288,6 +288,17 @@ class Collection:
|
|||
comparison_vector = self.model().embed(text)
|
||||
return self.similar_by_vector(comparison_vector, number)
|
||||
|
||||
@classmethod
|
||||
def exists(cls, db: Database, name: str) -> bool:
|
||||
"""
|
||||
Does this collection exist in the database?
|
||||
|
||||
Args:
|
||||
name (str): Name of the collection
|
||||
"""
|
||||
rows = list(db["collections"].rows_where("name = ?", [name]))
|
||||
return bool(rows)
|
||||
|
||||
def delete(self):
|
||||
"""
|
||||
Delete the collection and its embeddings from the database
|
||||
|
|
|
|||
|
|
@ -102,9 +102,8 @@ def test_embed_store(user_path, metadata, metadata_error):
|
|||
assert embeddings_db.exists()
|
||||
# Check the contents
|
||||
db = sqlite_utils.Database(str(embeddings_db))
|
||||
assert list(db["collections"].rows) == [
|
||||
{"id": 1, "name": "items", "model": "embed-demo"}
|
||||
]
|
||||
rows = list(db["collections"].rows)
|
||||
assert rows == [{"id": 1, "name": "items", "model": "embed-demo"}]
|
||||
expected_metadata = None
|
||||
if metadata and not metadata_error:
|
||||
expected_metadata = metadata
|
||||
|
|
@ -390,3 +389,34 @@ def test_default_embedding_model():
|
|||
result5 = runner.invoke(cli, ["embed-models", "default"])
|
||||
assert result5.exit_code == 0
|
||||
assert result5.output == "<No default embedding model set>\n"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("default_is_set", (False, True))
|
||||
@pytest.mark.parametrize("command", ("embed", "embed-multi"))
|
||||
def test_default_embed_model_errors(user_path, default_is_set, command):
|
||||
runner = CliRunner()
|
||||
if default_is_set:
|
||||
(user_path / "default_embedding_model.txt").write_text(
|
||||
"embed-demo", encoding="utf8"
|
||||
)
|
||||
args = []
|
||||
input = None
|
||||
if command == "embed-multi":
|
||||
args = ["embed-multi", "example", "-"]
|
||||
input = "id,name\n1,hello"
|
||||
else:
|
||||
args = ["embed", "example", "1", "-c", "hello world"]
|
||||
result = runner.invoke(cli, args, input=input, catch_exceptions=False)
|
||||
if default_is_set:
|
||||
assert result.exit_code == 0
|
||||
else:
|
||||
assert result.exit_code == 1
|
||||
assert "You need to specify a model (no default model is set)" in result.output
|
||||
# Now set the default model and try again
|
||||
result2 = runner.invoke(cli, ["embed-models", "default", "embed-demo"])
|
||||
assert result2.exit_code == 0
|
||||
result3 = runner.invoke(cli, args, input=input, catch_exceptions=False)
|
||||
assert result3.exit_code == 0
|
||||
# At the end of this, there should be 2 embeddings
|
||||
db = sqlite_utils.Database(str(user_path / "embeddings.db"))
|
||||
assert db["embeddings"].count == 1
|
||||
|
|
|
|||
Loading…
Reference in a new issue