From b039765d50bea25c9d9fe6f922e686b4ed4f7709 Mon Sep 17 00:00:00 2001 From: Ayush Chaurasia Date: Thu, 11 Apr 2024 17:30:45 +0530 Subject: [PATCH] docs : Embedding functions quickstart and minor fixes (#1217) --- .../embeddings/default_embedding_functions.md | 37 +++++++---- docs/src/embeddings/index.md | 62 ++++++++++++++++++- python/pyproject.toml | 1 - 3 files changed, 87 insertions(+), 13 deletions(-) diff --git a/docs/src/embeddings/default_embedding_functions.md b/docs/src/embeddings/default_embedding_functions.md index 67422025..d04be52e 100644 --- a/docs/src/embeddings/default_embedding_functions.md +++ b/docs/src/embeddings/default_embedding_functions.md @@ -154,9 +154,12 @@ Allows you to set parameters when registering a `sentence-transformers` object. !!! note "BAAI Embeddings example" Here is an example that uses BAAI embedding model from the HuggingFace Hub [supported models](https://huggingface.co/models?library=sentence-transformers) ```python + import lancedb + from lancedb.pydantic import LanceModel, Vector + from lancedb.embeddings import get_registry + db = lancedb.connect("/tmp/db") - registry = EmbeddingFunctionRegistry.get_instance() - model = registry.get("sentence-transformers").create(name="BAAI/bge-small-en-v1.5", device="cpu") + model = get_registry().get("sentence-transformers").create(name="BAAI/bge-small-en-v1.5", device="cpu") class Words(LanceModel): text: str = model.SourceField() @@ -165,7 +168,7 @@ Allows you to set parameters when registering a `sentence-transformers` object. 
table = db.create_table("words", schema=Words) table.add( [ - {"text": "hello world"} + {"text": "hello world"}, {"text": "goodbye world"} ] ) @@ -213,18 +216,21 @@ LanceDB registers the OpenAI embeddings function in the registry by default, as ```python +import lancedb +from lancedb.pydantic import LanceModel, Vector +from lancedb.embeddings import get_registry + db = lancedb.connect("/tmp/db") -registry = EmbeddingFunctionRegistry.get_instance() -func = registry.get("openai").create() +func = get_registry().get("openai").create(name="text-embedding-ada-002") class Words(LanceModel): text: str = func.SourceField() vector: Vector(func.ndims()) = func.VectorField() -table = db.create_table("words", schema=Words) +table = db.create_table("words", schema=Words, mode="overwrite") table.add( [ - {"text": "hello world"} + {"text": "hello world"}, {"text": "goodbye world"} ] ) @@ -353,6 +359,10 @@ Supported parameters (to be passed in `create` method) are: Usage Example: ```python +import lancedb +from lancedb.pydantic import LanceModel, Vector +from lancedb.embeddings import get_registry + model = get_registry().get("bedrock-text").create() class TextModel(LanceModel): @@ -387,10 +397,12 @@ This embedding function supports ingesting images as both bytes and urls. You ca LanceDB supports ingesting images directly from accessible links. 
```python +import lancedb +from lancedb.pydantic import LanceModel, Vector +from lancedb.embeddings import get_registry db = lancedb.connect(tmp_path) -registry = EmbeddingFunctionRegistry.get_instance() -func = registry.get("open-clip").create() +func = get_registry().get("open-clip").create() class Images(LanceModel): label: str @@ -465,9 +477,12 @@ This function is registered as `imagebind` and supports Audio, Video and Text mo Below is an example demonstrating how the API works: ```python +import lancedb +from lancedb.pydantic import LanceModel, Vector +from lancedb.embeddings import get_registry + db = lancedb.connect(tmp_path) -registry = EmbeddingFunctionRegistry.get_instance() -func = registry.get("imagebind").create() +func = get_registry().get("imagebind").create() class ImageBindModel(LanceModel): text: str diff --git a/docs/src/embeddings/index.md b/docs/src/embeddings/index.md index bd4bbbf9..0752cabe 100644 --- a/docs/src/embeddings/index.md +++ b/docs/src/embeddings/index.md @@ -11,4 +11,64 @@ LanceDB supports 3 methods of working with embeddings. that extends the default embedding functions. For python users, there is also a legacy [with_embeddings API](./legacy.md). -It is retained for compatibility and will be removed in a future version. \ No newline at end of file +It is retained for compatibility and will be removed in a future version. + +## Quickstart + +To get started with embeddings, you can use the built-in embedding functions. + +### OpenAI Embedding function +LanceDB registers the OpenAI embeddings function in the registry as `openai`. You can pass any supported model name to the `create` method. By default it uses `"text-embedding-ada-002"`. 
+ +```python +import lancedb +from lancedb.pydantic import LanceModel, Vector +from lancedb.embeddings import get_registry + +db = lancedb.connect("/tmp/db") +func = get_registry().get("openai").create(name="text-embedding-ada-002") + +class Words(LanceModel): + text: str = func.SourceField() + vector: Vector(func.ndims()) = func.VectorField() + +table = db.create_table("words", schema=Words, mode="overwrite") +table.add( + [ + {"text": "hello world"}, + {"text": "goodbye world"} + ] + ) + +query = "greetings" +actual = table.search(query).limit(1).to_pydantic(Words)[0] +print(actual.text) +``` + +### Sentence Transformers Embedding function +LanceDB registers the Sentence Transformers embeddings function in the registry as `sentence-transformers`. You can pass any supported model name to the `create` method. By default it uses `"sentence-transformers/paraphrase-MiniLM-L6-v2"`. + +```python +import lancedb +from lancedb.pydantic import LanceModel, Vector +from lancedb.embeddings import get_registry + +db = lancedb.connect("/tmp/db") +model = get_registry().get("sentence-transformers").create(name="BAAI/bge-small-en-v1.5", device="cpu") + +class Words(LanceModel): + text: str = model.SourceField() + vector: Vector(model.ndims()) = model.VectorField() + +table = db.create_table("words", schema=Words, mode="overwrite") +table.add( + [ + {"text": "hello world"}, + {"text": "goodbye world"} + ] +) + +query = "greetings" +actual = table.search(query).limit(1).to_pydantic(Words)[0] +print(actual.text) +``` \ No newline at end of file diff --git a/python/pyproject.toml b/python/pyproject.toml index 4338373a..29a8b800 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -65,7 +65,6 @@ docs = [ "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]", - "mkdocs-ultralytics-plugin==0.0.44", ] clip = ["torch", "pillow", "open-clip"] embeddings = [