From 484a1218660f3d9b115e89702752e70c97bb846a Mon Sep 17 00:00:00 2001 From: Chang She <759245+changhiskhan@users.noreply.github.com> Date: Sat, 17 Feb 2024 10:39:28 -0800 Subject: [PATCH] doc: improve embedding functions documentation (#983) Got some user feedback that the `implicit` / `explicit` distinction is confusing. Instead I was thinking we would just deprecate the `with_embeddings` API and then organize working with embeddings into 3 buckets: 1. manually generate embeddings 2. use a provided embedding function 3. define your own custom embedding function --- .../{api.md => custom_embedding_function.md} | 0 docs/src/embeddings/embedding_explicit.md | 141 ----------------- docs/src/embeddings/embedding_functions.md | 144 +++++++++++++----- docs/src/embeddings/index.md | 16 +- docs/src/embeddings/legacy.md | 99 ++++++++++++ docs/src/examples/modal_langchain.py | 1 - docs/src/notebooks/diffusiondb/datagen.py | 2 - python/lancedb/embeddings/utils.py | 3 +- python/lancedb/util.py | 24 +++ 9 files changed, 241 insertions(+), 189 deletions(-) rename docs/src/embeddings/{api.md => custom_embedding_function.md} (100%) delete mode 100644 docs/src/embeddings/embedding_explicit.md create mode 100644 docs/src/embeddings/legacy.md diff --git a/docs/src/embeddings/api.md b/docs/src/embeddings/custom_embedding_function.md similarity index 100% rename from docs/src/embeddings/api.md rename to docs/src/embeddings/custom_embedding_function.md diff --git a/docs/src/embeddings/embedding_explicit.md b/docs/src/embeddings/embedding_explicit.md deleted file mode 100644 index 1549a1a0..00000000 --- a/docs/src/embeddings/embedding_explicit.md +++ /dev/null @@ -1,141 +0,0 @@ -In this workflow, you define your own embedding function and pass it as a callable to LanceDB, invoking it in your code to generate the embeddings. Let's look at some examples. - -### Hugging Face - -!!! note - Currently, the Hugging Face method is only supported in the Python SDK. - -=== "Python" - The most popular open source option is to use the [sentence-transformers](https://www.sbert.net/) - library, which can be installed via pip. - - ```bash - pip install sentence-transformers - ``` - - The example below shows how to use the `paraphrase-albert-small-v2` model to generate embeddings - for a given document. - - ```python - from sentence_transformers import SentenceTransformer - - name="paraphrase-albert-small-v2" - model = SentenceTransformer(name) - - # used for both training and querying - def embed_func(batch): - return [model.encode(sentence) for sentence in batch] - ``` - -### OpenAI - -Another popular alternative is to use an external API like OpenAI's [embeddings API](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings). - -=== "Python" - ```python - import openai - import os - - # Configuring the environment variable OPENAI_API_KEY - if "OPENAI_API_KEY" not in os.environ: - # OR set the key here as a variable - openai.api_key = "sk-..." - - # verify that the API key is working - assert len(openai.Model.list()["data"]) > 0 - - def embed_func(c): - rs = openai.Embedding.create(input=c, engine="text-embedding-ada-002") - return [record["embedding"] for record in rs["data"]] - ``` - -=== "JavaScript" - ```javascript - const lancedb = require("vectordb"); - - // You need to provide an OpenAI API key - const apiKey = "sk-..." 
-    // The embedding function will create embeddings for the 'text' column
-    const embedding = new lancedb.OpenAIEmbeddingFunction('text', apiKey)
-    ```
-
-## Applying an embedding function to data
-
-=== "Python"
-    Using an embedding function, you can apply it to raw data
-    to generate embeddings for each record.
-
-    Say you have a pandas DataFrame with a `text` column that you want embedded,
-    you can use the `with_embeddings` function to generate embeddings and add them to
-    an existing table.
-
-    ```python
-    import pandas as pd
-    from lancedb.embeddings import with_embeddings
-
-    df = pd.DataFrame(
-        [
-            {"text": "pepperoni"},
-            {"text": "pineapple"}
-        ]
-    )
-    data = with_embeddings(embed_func, df)
-
-    # The output is used to create / append to a table
-    # db.create_table("my_table", data=data)
-    ```
-
-    If your data is in a different column, you can specify the `column` kwarg to `with_embeddings`.
-
-    By default, LanceDB calls the function with batches of 1000 rows. This can be configured
-    using the `batch_size` parameter to `with_embeddings`.
-
-    LanceDB automatically wraps the function with retry and rate-limit logic to ensure the OpenAI
-    API call is reliable.
-
-=== "JavaScript"
-    Using an embedding function, you can apply it to raw data
-    to generate embeddings for each record.
-
-    Simply pass the embedding function created above and LanceDB will use it to generate
-    embeddings for your data.
-
-    ```javascript
-    const db = await lancedb.connect("data/sample-lancedb");
-    const data = [
-        { text: "pepperoni"},
-        { text: "pineapple"}
-    ]
-
-    const table = await db.createTable("vectors", data, embedding)
-    ```
-
-## Querying using an embedding function
-
-!!! warning
-    At query time, you **must** use the same embedding function you used to vectorize your data.
-    If you use a different embedding function, the embeddings will not reside in the same vector
-    space and the results will be nonsensical.
-
-=== "Python"
-    ```python
-    query = "What's the best pizza topping?"
-    query_vector = embed_func([query])[0]
-    results = (
-        tbl.search(query_vector)
-        .limit(10)
-        .to_pandas()
-    )
-    ```
-
-    The above snippet returns a pandas DataFrame with the 10 closest vectors to the query.
-
-=== "JavaScript"
-    ```javascript
-    const results = await table
-        .search("What's the best pizza topping?")
-        .limit(10)
-        .execute()
-    ```
-
-    The above snippet returns an array of records with the top 10 nearest neighbors to the query.
\ No newline at end of file
diff --git a/docs/src/embeddings/embedding_functions.md b/docs/src/embeddings/embedding_functions.md
index ab0c90bd..aac82e8c 100644
--- a/docs/src/embeddings/embedding_functions.md
+++ b/docs/src/embeddings/embedding_functions.md
@@ -3,61 +3,126 @@ Representing multi-modal data as vector embeddings is becoming a standard practi
 
 For this purpose, LanceDB introduces an **embedding functions API** that allows you to set everything up once, during the configuration stage of your project. After this, the table remembers it, effectively making the embedding functions *disappear into the background*, so you don't have to worry about manually passing callables around and can instead focus on the rest of your data engineering pipeline.
 
 !!! warning
-    Using the implicit embeddings management approach means that you can forget about the manually passing around embedding
-    functions in your code, as long as you don't intend to change it at a later time. If your embedding function changes,
-    you'll have to re-configure your table with the new embedding function and regenerate the embeddings.
+    Using the embedding function registry means that you don't have to explicitly generate the embeddings yourself.
+    However, if your embedding function changes, you'll have to re-configure your table with the new embedding function
+    and regenerate the embeddings. In the future, we plan to support the ability to change the embedding function via
+    table metadata and have LanceDB automatically take care of regenerating the embeddings.
+
 
 ## 1. Define the embedding function
-We have some pre-defined embedding functions in the global registry, with more coming soon. Here's let's an implementation of CLIP as example.
-```
-registry = EmbeddingFunctionRegistry.get_instance()
-clip = registry.get("open-clip").create()
-```
-You can also define your own embedding function by implementing the `EmbeddingFunction` abstract base interface. It subclasses Pydantic Model which can be utilized to write complex schemas simply as we'll see next!
+=== "Python"
+    In the LanceDB Python SDK, we define a global embedding function registry with
+    many different embedding models and even more coming soon.
+    As an example, here's how to get the CLIP embedding function from the registry.
+
+    ```python
+    from lancedb.embeddings import get_registry
+
+    registry = get_registry()
+    clip = registry.get("open-clip").create()
+    ```
+
+    You can also define your own embedding function by implementing the `EmbeddingFunction`
+    abstract base interface. It subclasses Pydantic's `BaseModel`, which can be used to write complex schemas simply, as we'll see next!
+
+=== "JavaScript"
+    In the TypeScript SDK, the choices are more limited. For now, only the OpenAI
+    embedding function is available.
+
+    ```javascript
+    const lancedb = require("vectordb");
+
+    // You need to provide an OpenAI API key
+    const apiKey = "sk-..."
+    // The embedding function will create embeddings for the 'text' column
+    const embedding = new lancedb.OpenAIEmbeddingFunction('text', apiKey)
+    ```
 
 ## 2. Define the data model or schema
-The embedding function defined above abstracts away all the details about the models and dimensions required to define the schema. You can simply set a field as **source** or **vector** column. Here's how:
-```python
-class Pets(LanceModel):
-    vector: Vector(clip.ndims) = clip.VectorField()
-    image_uri: str = clip.SourceField()
-```
+=== "Python"
+    The embedding function defined above abstracts away all the details about the models and dimensions required to define the schema. You can simply designate a field as the **source** or **vector** column. Here's how:
 
-`VectorField` tells LanceDB to use the clip embedding function to generate query embeddings for the `vector` column and `SourceField` ensures that when adding data, we automatically use the specified embedding function to encode `image_uri`.
+    ```python
+    class Pets(LanceModel):
+        vector: Vector(clip.ndims) = clip.VectorField()
+        image_uri: str = clip.SourceField()
+    ```
+
+    `VectorField` tells LanceDB to use the CLIP embedding function to generate query embeddings for the `vector` column, and `SourceField` ensures that when adding data, we automatically use the specified embedding function to encode `image_uri`. The same pattern works for any registered model, as sketched below.
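+
+    As a quick sketch of the same pattern for text data (assuming a
+    `sentence-transformers` function is available in your registry under that
+    name; the model and schema below are illustrative only), the source column
+    holds raw text and the vector size comes from the model:
+
+    ```python
+    from lancedb.embeddings import get_registry
+    from lancedb.pydantic import LanceModel, Vector
+
+    # hypothetical text-embedding variant, for illustration only
+    st = get_registry().get("sentence-transformers").create()
+
+    class Documents(LanceModel):
+        vector: Vector(st.ndims) = st.VectorField()  # vector column
+        text: str = st.SourceField()  # source column to embed
+    ```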
-
 
-## 3. Create LanceDB table
-Now that we have chosen/defined our embedding function and the schema, we can create the table:
+=== "JavaScript"
+
+    For the TypeScript SDK, a schema can be inferred from input data, or an explicit
+    Arrow schema can be provided.
 
-```python
-db = lancedb.connect("~/lancedb")
-table = db.create_table("pets", schema=Pets)
+## 3. Create table and add data
 
-```
+Now that we have chosen/defined our embedding function and the schema,
+we can create the table and ingest data without needing to explicitly generate
+the embeddings at all:
 
-That's it! We've provided all the information needed to embed the source and query inputs. We can now forget about the model and dimension details and start to build our VectorDB pipeline.
+=== "Python"
+    ```python
+    db = lancedb.connect("~/lancedb")
+    table = db.create_table("pets", schema=Pets)
 
-## 4. Ingest lots of data and query your table
-Any new or incoming data can just be added and it'll be vectorized automatically.
+    table.add([{"image_uri": u} for u in uris])
+    ```
 
-```python
-table.add([{"image_uri": u} for u in uris])
-```
+=== "JavaScript"
 
-Our OpenCLIP query embedding function supports querying via both text and images:
+    ```javascript
+    const db = await lancedb.connect("data/sample-lancedb");
+    const data = [
+        { text: "pepperoni"},
+        { text: "pineapple"}
+    ]
 
-```python
-result = table.search("dog")
-```
+    const table = await db.createTable("vectors", data, embedding)
+    ```
+
+## 4. Querying your table
+Not only can you forget about the embeddings during ingestion, you also don't
+need to worry about them when you query the table:
 
-Let's query an image:
+=== "Python"
+
+    Our OpenCLIP query embedding function supports querying via both text and images:
 
-```python
-p = Path("path/to/images/samoyed_100.jpg")
-query_image = Image.open(p)
-table.search(query_image)
-```
+    ```python
+    results = (
+        table.search("dog")
+        .limit(10)
+        .to_pandas()
+    )
+    ```
+
+    Or we can search using an image:
+
+    ```python
+    p = Path("path/to/images/samoyed_100.jpg")
+    query_image = Image.open(p)
+    results = (
+        table.search(query_image)
+        .limit(10)
+        .to_pandas()
+    )
+    ```
+
+    Both of the above snippets return a pandas DataFrame with the 10 closest vectors to the query.
+
+=== "JavaScript"
+
+    ```javascript
+    const results = await table
+        .search("What's the best pizza topping?")
+        .limit(10)
+        .execute()
+    ```
+
+    The above snippet returns an array of records with the top 10 nearest neighbors to the query.
 
 ---
 
@@ -100,4 +165,5 @@ rs[2].image
 
 ![](../assets/dog_clip_output.png)
 
-Now that you have the basic idea about implicit management via embedding functions, let's dive deeper into a [custom API](./api.md) that you can use to implement your own embedding functions.
\ No newline at end of file
+Now that you have the basic idea about LanceDB embedding functions and the embedding function registry,
+let's dive deeper into defining your own [custom functions](./custom_embedding_function.md).
\ No newline at end of file
diff --git a/docs/src/embeddings/index.md b/docs/src/embeddings/index.md
index d11202c9..bd4bbbf9 100644
--- a/docs/src/embeddings/index.md
+++ b/docs/src/embeddings/index.md
@@ -1,8 +1,14 @@
-Due to the nature of vector embeddings, they can be used to represent any kind of data, from text to images to audio. This makes them a very powerful tool for machine learning practitioners. However, there's no one-size-fits-all solution for generating embeddings - there are many different libraries and APIs (both commercial and open source) that can be used to generate embeddings from structured/unstructured data.
+Due to the nature of vector embeddings, they can be used to represent any kind of data, from text to images to audio.
+This makes them a very powerful tool for machine learning practitioners.
+However, there's no one-size-fits-all solution for generating embeddings - there are many different libraries and APIs
+(both commercial and open source) that can be used to generate embeddings from structured/unstructured data.
 
-LanceDB supports 2 methods of vectorizing your raw data into embeddings.
+LanceDB supports 3 methods of working with embeddings.
 
-1. **Explicit**: By manually calling LanceDB's `with_embedding` function to vectorize your data via an `embed_func` of your choice
-2. **Implicit**: Allow LanceDB to embed the data and queries in the background as they come in, by using the table's `EmbeddingRegistry` information
+1. You can manually generate embeddings for the data and queries. This is done outside of LanceDB.
+2. You can use the built-in [embedding functions](./embedding_functions.md) to embed the data and queries in the background.
+3. For Python users, you can define your own [custom embedding function](./custom_embedding_function.md)
+   that extends the default embedding functions.
 
-See the [explicit](embedding_explicit.md) and [implicit](embedding_functions.md) embedding sections for more details.
\ No newline at end of file
+For Python users, there is also a legacy [with_embeddings API](./legacy.md).
+It is retained for compatibility and will be removed in a future version.
\ No newline at end of file
diff --git a/docs/src/embeddings/legacy.md b/docs/src/embeddings/legacy.md
new file mode 100644
index 00000000..a22ab0f4
--- /dev/null
+++ b/docs/src/embeddings/legacy.md
@@ -0,0 +1,99 @@
+The legacy `with_embeddings` API is for Python only and is deprecated.
+
+### Hugging Face
+
+The most popular open source option is to use the [sentence-transformers](https://www.sbert.net/)
+library, which can be installed via pip.
+
+```bash
+pip install sentence-transformers
+```
+
+The example below shows how to use the `paraphrase-albert-small-v2` model to generate embeddings
+for a given document.
+
+```python
+from sentence_transformers import SentenceTransformer
+
+name = "paraphrase-albert-small-v2"
+model = SentenceTransformer(name)
+
+# used for both training and querying
+def embed_func(batch):
+    return [model.encode(sentence) for sentence in batch]
+```
+
+
+### OpenAI
+
+Another popular alternative is to use an external API like OpenAI's [embeddings API](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings).
+
+```python
+import openai
+import os
+
+# Configure the environment variable OPENAI_API_KEY
+if "OPENAI_API_KEY" not in os.environ:
+    # OR set the key here as a variable
+    openai.api_key = "sk-..."
+
+client = openai.OpenAI()
+
+def embed_func(c):
+    rs = client.embeddings.create(input=c, model="text-embedding-ada-002")
+    return [record.embedding for record in rs.data]
+```
+
+
+## Applying an embedding function to data
+
+Using an embedding function, you can apply it to raw data
+to generate embeddings for each record.
+
+Say you have a pandas DataFrame with a `text` column that you want embedded,
+you can use the `with_embeddings` function to generate embeddings and add them to
+an existing table.
+
+```python
+import pandas as pd
+from lancedb.embeddings import with_embeddings
+
+df = pd.DataFrame(
+    [
+        {"text": "pepperoni"},
+        {"text": "pineapple"}
+    ]
+)
+data = with_embeddings(embed_func, df)
+
+# The output is used to create / append to a table
+tbl = db.create_table("my_table", data=data)
+```
+
+If your data is in a different column, you can specify the `column` kwarg to `with_embeddings`.
+
+By default, LanceDB calls the function with batches of 1000 rows. This can be configured
+using the `batch_size` parameter to `with_embeddings`.
+
+LanceDB automatically wraps the function with retry and rate-limit logic to ensure the OpenAI
+API call is reliable.
+
+## Querying using an embedding function
+
+!!! warning
+    At query time, you **must** use the same embedding function you used to vectorize your data.
+    If you use a different embedding function, the embeddings will not reside in the same vector
+    space and the results will be nonsensical.
+
+```python
+query = "What's the best pizza topping?"
+query_vector = embed_func([query])[0]
+results = (
+    tbl.search(query_vector)
+    .limit(10)
+    .to_pandas()
+)
+```
+
+The above snippet returns a pandas DataFrame with the 10 closest vectors to the query.
diff --git a/docs/src/examples/modal_langchain.py b/docs/src/examples/modal_langchain.py
index 20c9960a..c664547e 100644
--- a/docs/src/examples/modal_langchain.py
+++ b/docs/src/examples/modal_langchain.py
@@ -1,6 +1,5 @@
 import pickle
 import re
-import sys
 import zipfile
 
 from pathlib import Path
diff --git a/docs/src/notebooks/diffusiondb/datagen.py b/docs/src/notebooks/diffusiondb/datagen.py
index 9218d9cf..42bcdf39 100755
--- a/docs/src/notebooks/diffusiondb/datagen.py
+++ b/docs/src/notebooks/diffusiondb/datagen.py
@@ -23,10 +23,8 @@ from multiprocessing import Pool
 import lance
 import pyarrow as pa
 from datasets import load_dataset
-from PIL import Image
 from transformers import CLIPModel, CLIPProcessor, CLIPTokenizerFast
 
-import lancedb
 
 MODEL_ID = "openai/clip-vit-base-patch32"
diff --git a/python/lancedb/embeddings/utils.py b/python/lancedb/embeddings/utils.py
index 325145f4..ed9162ba 100644
--- a/python/lancedb/embeddings/utils.py
+++ b/python/lancedb/embeddings/utils.py
@@ -26,7 +26,7 @@ import pyarrow as pa
 from lance.vector import vec_to_table
 from retry import retry
 
-from ..util import safe_import_pandas
+from ..util import deprecated, safe_import_pandas
 from ..utils.general import LOGGER
 
 pd = safe_import_pandas()
@@ -38,6 +38,7 @@ IMAGES = Union[
 ]
 
 
+@deprecated
 def with_embeddings(
     func: Callable,
     data: DATA,
diff --git a/python/lancedb/util.py b/python/lancedb/util.py
index 14f9e530..640dc4da 100644
--- a/python/lancedb/util.py
+++ b/python/lancedb/util.py
@@ -11,9 +11,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import functools
 import importlib
 import os
 import pathlib
+import warnings
 from datetime import date, datetime
 from functools import singledispatch
 from typing import Tuple, Union
@@ -239,3 +241,25 @@ def _(value: list):
 @value_to_sql.register(np.ndarray)
 def _(value: np.ndarray):
     return value_to_sql(value.tolist())
+
+
+def deprecated(func):
+    """This is a decorator which can be used to mark functions
+    as deprecated.
It will result in a warning being emitted + when the function is used.""" + + @functools.wraps(func) + def new_func(*args, **kwargs): + warnings.simplefilter("always", DeprecationWarning) # turn off filter + warnings.warn( + ( + f"Function {func.__name__} is deprecated and will be " + "removed in a future version" + ), + category=DeprecationWarning, + stacklevel=2, + ) + warnings.simplefilter("default", DeprecationWarning) # reset filter + return func(*args, **kwargs) + + return new_func
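
For illustration, here's a minimal sketch of what callers see once a function is
wrapped with this decorator (the `old_helper` function below is hypothetical,
not part of this patch):

```python
import warnings

from lancedb.util import deprecated


@deprecated
def old_helper():
    """A stand-in for any function marked with @deprecated."""
    return 42


# Capture warnings so we can verify the decorator's behavior
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    old_helper()

# The decorator emits a DeprecationWarning naming the wrapped function
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```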