From ae85008714792a6b724c75793b63273c51caba88 Mon Sep 17 00:00:00 2001 From: Rithik Kumar <46047011+rithikJha@users.noreply.github.com> Date: Tue, 27 Aug 2024 17:14:35 +0530 Subject: [PATCH] docs: revamp embedding models (#1568) before: ![Screenshot 2024-08-27 151525](https://github.com/user-attachments/assets/d4f8f2b9-37e6-4a31-b144-01b804019e11) After: ![Screenshot 2024-08-27 151550](https://github.com/user-attachments/assets/79fe7d27-8f14-4d80-9b41-a1e91f8c708f) --------- Co-authored-by: Ayush Chaurasia --- docs/mkdocs.yml | 36 +- .../imagebind_embedding.md | 67 ++ .../jina_multimodal_embedding.md | 51 ++ .../openclip_embedding.md | 82 ++ .../aws_bedrock_embedding.md | 51 ++ .../cohere_embedding.md | 62 ++ .../gemini_embedding.md | 35 + .../huggingface_embedding.md | 24 + .../ibm_watsonx_ai_embedding.md | 75 ++ .../instructor_embedding.md | 50 ++ .../jina_embedding.md | 39 + .../ollama_embedding.md | 37 + .../openai_embedding.md | 34 + .../sentence_transformers.md | 174 ++++ .../embeddings/default_embedding_functions.md | 802 +----------------- docs/test/md_testing.py | 2 + 16 files changed, 833 insertions(+), 788 deletions(-) create mode 100644 docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md create mode 100644 docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md create mode 100644 docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md create mode 100644 docs/src/embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md create mode 100644 docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md create mode 100644 docs/src/embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md create mode 100644 docs/src/embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md create mode 100644 
docs/src/embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md create mode 100644 docs/src/embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md create mode 100644 docs/src/embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md create mode 100644 docs/src/embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md create mode 100644 docs/src/embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md create mode 100644 docs/src/embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 387db5c3..e4346bcd 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -119,7 +119,23 @@ nav: - 🧬 Managing embeddings: - Overview: embeddings/index.md - Embedding functions: embeddings/embedding_functions.md - - Available models: embeddings/default_embedding_functions.md + - Available models: + - Overview: embeddings/default_embedding_functions.md + - Text Embedding Functions: + - Sentence Transformers: embeddings\available_embedding_models\text_embedding_functions\sentence_transformers.md + - Huggingface Embedding Models: embeddings\available_embedding_models\text_embedding_functions\huggingface_embedding.md + - Ollama Embeddings: embeddings\available_embedding_models\text_embedding_functions\ollama_embedding.md + - OpenAI Embeddings: embeddings\available_embedding_models\text_embedding_functions\openai_embedding.md + - Instructor Embeddings: embeddings\available_embedding_models\text_embedding_functions\instructor_embedding.md + - Gemini Embeddings: embeddings\available_embedding_models\text_embedding_functions\gemini_embedding.md + - Cohere Embeddings: embeddings\available_embedding_models\text_embedding_functions\cohere_embedding.md + - Jina Embeddings: embeddings\available_embedding_models\text_embedding_functions\jina_embedding.md + - AWS Bedrock Text 
Embedding Functions: embeddings\available_embedding_models\text_embedding_functions\aws_bedrock_embedding.md + - IBM watsonx.ai Embeddings: embeddings\available_embedding_models\text_embedding_functions\ibm_watsonx_ai_embedding.md + - Multimodal Embedding Functions: + - OpenClip embeddings: embeddings\available_embedding_models\multimodal_embedding_functions\openclip_embedding.md + - Imagebind embeddings: embeddings\available_embedding_models\multimodal_embedding_functions\imagebind_embedding.md + - Jina Embeddings: embeddings\available_embedding_models\multimodal_embedding_functions\jina_multimodal_embedding.md - User-defined embedding functions: embeddings/custom_embedding_function.md - "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb - "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb @@ -215,7 +231,23 @@ nav: - Managing Embeddings: - Overview: embeddings/index.md - Embedding functions: embeddings/embedding_functions.md - - Available models: embeddings/default_embedding_functions.md + - Available models: + - Overview: embeddings/default_embedding_functions.md + - Text Embedding Functions: + - Sentence Transformers: embeddings\available_embedding_models\text_embedding_functions\sentence_transformers.md + - Huggingface Embedding Models: embeddings\available_embedding_models\text_embedding_functions\huggingface_embedding.md + - Ollama Embeddings: embeddings\available_embedding_models\text_embedding_functions\ollama_embedding.md + - OpenAI Embeddings: embeddings\available_embedding_models\text_embedding_functions\openai_embedding.md + - Instructor Embeddings: embeddings\available_embedding_models\text_embedding_functions\instructor_embedding.md + - Gemini Embeddings: embeddings\available_embedding_models\text_embedding_functions\gemini_embedding.md + - Cohere Embeddings: embeddings\available_embedding_models\text_embedding_functions\cohere_embedding.md + - Jina Embeddings: 
embeddings\available_embedding_models\text_embedding_functions\jina_embedding.md + - AWS Bedrock Text Embedding Functions: embeddings\available_embedding_models\text_embedding_functions\aws_bedrock_embedding.md + - IBM watsonx.ai Embeddings: embeddings\available_embedding_models\text_embedding_functions\ibm_watsonx_ai_embedding.md + - Multimodal Embedding Functions: + - OpenClip embeddings: embeddings\available_embedding_models\multimodal_embedding_functions\openclip_embedding.md + - Imagebind embeddings: embeddings\available_embedding_models\multimodal_embedding_functions\imagebind_embedding.md + - Jina Embeddings: embeddings\available_embedding_models\multimodal_embedding_functions\jina_multimodal_embedding.md - User-defined embedding functions: embeddings/custom_embedding_function.md - "Example: Multi-lingual semantic search": notebooks/multi_lingual_example.ipynb - "Example: MultiModal CLIP Embeddings": notebooks/DisappearingEmbeddingFunction.ipynb diff --git a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md new file mode 100644 index 00000000..4aa8b3db --- /dev/null +++ b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md @@ -0,0 +1,67 @@ +# Imagebind embeddings +We have support for [imagebind](https://github.com/facebookresearch/ImageBind) model embeddings. You can download our version of the packaged model via - `pip install imagebind-packaged==0.1.2`. + +This function is registered as `imagebind` and supports Audio, Video and Text modalities(extending to Thermal,Depth,IMU data): + +| Parameter | Type | Default Value | Description | +|---|---|---|---| +| `name` | `str` | `"imagebind_huge"` | Name of the model. | +| `device` | `str` | `"cpu"` | The device to run the model on. Can be `"cpu"` or `"gpu"`. 
| +| `normalize` | `bool` | `False` | set to `True` to normalize your inputs before model ingestion. | + +Below is an example demonstrating how the API works: + +```python +import lancedb +from lancedb.pydantic import LanceModel, Vector +from lancedb.embeddings import get_registry + +db = lancedb.connect(tmp_path) +func = get_registry.get("imagebind").create() + +class ImageBindModel(LanceModel): + text: str + image_uri: str = func.SourceField() + audio_path: str + vector: Vector(func.ndims()) = func.VectorField() + +# add locally accessible image paths +text_list=["A dog.", "A car", "A bird"] +image_paths=[".assets/dog_image.jpg", ".assets/car_image.jpg", ".assets/bird_image.jpg"] +audio_paths=[".assets/dog_audio.wav", ".assets/car_audio.wav", ".assets/bird_audio.wav"] + +# Load data +inputs = [ + {"text": a, "audio_path": b, "image_uri": c} + for a, b, c in zip(text_list, audio_paths, image_paths) +] + +#create table and add data +table = db.create_table("img_bind", schema=ImageBindModel) +table.add(inputs) +``` + +Now, we can search using any modality: + +#### image search +```python +query_image = "./assets/dog_image2.jpg" #download an image and enter that path here +actual = table.search(query_image).limit(1).to_pydantic(ImageBindModel)[0] +print(actual.text == "dog") +``` +#### audio search + +```python +query_audio = "./assets/car_audio2.wav" #download an audio clip and enter path here +actual = table.search(query_audio).limit(1).to_pydantic(ImageBindModel)[0] +print(actual.text == "car") +``` +#### Text search +You can add any input query and fetch the result as follows: +```python +query = "an animal which flies and tweets" +actual = table.search(query).limit(1).to_pydantic(ImageBindModel)[0] +print(actual.text == "bird") +``` + +If you have any questions about the embeddings API, supported models, or see a relevant model missing, please raise an issue [on GitHub](https://github.com/lancedb/lancedb/issues). 
diff --git a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md new file mode 100644 index 00000000..918c1509 --- /dev/null +++ b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md @@ -0,0 +1,51 @@ +# Jina Embeddings : Multimodal + +Jina embeddings can also be used to embed both text and image data, only some of the models support image data and you can check the list +under [https://jina.ai/embeddings/](https://jina.ai/embeddings/) + +Supported parameters (to be passed in `create` method) are: + +| Parameter | Type | Default Value | Description | +|---|---|---|---| +| `name` | `str` | `"jina-clip-v1"` | The model ID of the jina model to use | + +Usage Example: + +```python + import os + import requests + import lancedb + from lancedb.pydantic import LanceModel, Vector + from lancedb.embeddings import get_registry + import pandas as pd + + os.environ['JINA_API_KEY'] = 'jina_*' + + db = lancedb.connect("~/.lancedb") + func = get_registry().get("jina").create() + + + class Images(LanceModel): + label: str + image_uri: str = func.SourceField() # image uri as the source + image_bytes: bytes = func.SourceField() # image bytes as the source + vector: Vector(func.ndims()) = func.VectorField() # vector column + vec_from_bytes: Vector(func.ndims()) = func.VectorField() # Another vector column + + + table = db.create_table("images", schema=Images) + labels = ["cat", "cat", "dog", "dog", "horse", "horse"] + uris = [ + "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg", + "http://farm1.staticflickr.com/134/332220238_da527d8140_z.jpg", + "http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg", + "http://farm5.staticflickr.com/4092/5017326486_1f46057f5f_z.jpg", + "http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg", + 
"http://farm6.staticflickr.com/5142/5835678453_4f3a4edb45_z.jpg", + ] + # get each uri as bytes + image_bytes = [requests.get(uri).content for uri in uris] + table.add( + pd.DataFrame({"label": labels, "image_uri": uris, "image_bytes": image_bytes}) + ) +``` diff --git a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md new file mode 100644 index 00000000..bf50dfd2 --- /dev/null +++ b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md @@ -0,0 +1,82 @@ +# OpenClip embeddings +We support CLIP model embeddings using the open source alternative, [open-clip](https://github.com/mlfoundations/open_clip) which supports various customizations. It is registered as `open-clip` and supports the following customizations: + +| Parameter | Type | Default Value | Description | +|---|---|---|---| +| `name` | `str` | `"ViT-B-32"` | The name of the model. | +| `pretrained` | `str` | `"laion2b_s34b_b79k"` | The name of the pretrained model to load. | +| `device` | `str` | `"cpu"` | The device to run the model on. Can be `"cpu"` or `"gpu"`. | +| `batch_size` | `int` | `64` | The number of images to process in a batch. | +| `normalize` | `bool` | `True` | Whether to normalize the input images before feeding them to the model. | + +This embedding function supports ingesting images as both bytes and urls. You can query them using both text and other images. + +!!! info + LanceDB supports ingesting images directly from accessible links. 
+ +```python +import lancedb +from lancedb.pydantic import LanceModel, Vector +from lancedb.embeddings import get_registry + +db = lancedb.connect(tmp_path) +func = get_registry.get("open-clip").create() + +class Images(LanceModel): + label: str + image_uri: str = func.SourceField() # image uri as the source + image_bytes: bytes = func.SourceField() # image bytes as the source + vector: Vector(func.ndims()) = func.VectorField() # vector column + vec_from_bytes: Vector(func.ndims()) = func.VectorField() # Another vector column + +table = db.create_table("images", schema=Images) +labels = ["cat", "cat", "dog", "dog", "horse", "horse"] +uris = [ + "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg", + "http://farm1.staticflickr.com/134/332220238_da527d8140_z.jpg", + "http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg", + "http://farm5.staticflickr.com/4092/5017326486_1f46057f5f_z.jpg", + "http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg", + "http://farm6.staticflickr.com/5142/5835678453_4f3a4edb45_z.jpg", +] +# get each uri as bytes +image_bytes = [requests.get(uri).content for uri in uris] +table.add( + pd.DataFrame({"label": labels, "image_uri": uris, "image_bytes": image_bytes}) +) +``` +Now we can search using text from both the default vector column and the custom vector column +```python + +# text search +actual = table.search("man's best friend").limit(1).to_pydantic(Images)[0] +print(actual.label) # prints "dog" + +frombytes = ( + table.search("man's best friend", vector_column_name="vec_from_bytes") + .limit(1) + .to_pydantic(Images)[0] +) +print(frombytes.label) + +``` + +Because we're using a multi-modal embedding function, we can also search using images + +```python +# image search +query_image_uri = "http://farm1.staticflickr.com/200/467715466_ed4a31801f_z.jpg" +image_bytes = requests.get(query_image_uri).content +query_image = Image.open(io.BytesIO(image_bytes)) +actual = 
table.search(query_image).limit(1).to_pydantic(Images)[0] +print(actual.label == "dog") + +# image search using a custom vector column +other = ( + table.search(query_image, vector_column_name="vec_from_bytes") + .limit(1) + .to_pydantic(Images)[0] +) +print(actual.label) + +``` diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md new file mode 100644 index 00000000..036d4b82 --- /dev/null +++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md @@ -0,0 +1,51 @@ +# AWS Bedrock Text Embedding Functions + +AWS Bedrock supports multiple base models for generating text embeddings. You need to setup the AWS credentials to use this embedding function. +You can do so by using `awscli` and also add your session_token: +```shell +aws configure +aws configure set aws_session_token "" +``` +to ensure that the credentials are set up correctly, you can run the following command: +```shell +aws sts get-caller-identity +``` + +Supported Embedding modelIDs are: +* `amazon.titan-embed-text-v1` +* `cohere.embed-english-v3` +* `cohere.embed-multilingual-v3` + +Supported parameters (to be passed in `create` method) are: + +| Parameter | Type | Default Value | Description | +|---|---|---|---| +| **name** | str | "amazon.titan-embed-text-v1" | The model ID of the bedrock model to use. Supported base models for Text Embeddings: amazon.titan-embed-text-v1, cohere.embed-english-v3, cohere.embed-multilingual-v3 | +| **region** | str | "us-east-1" | Optional name of the AWS Region in which the service should be called (e.g., "us-east-1"). | +| **profile_name** | str | None | Optional name of the AWS profile to use for calling the Bedrock service. If not specified, the default profile will be used. 
| +| **assumed_role** | str | None | Optional ARN of an AWS IAM role to assume for calling the Bedrock service. If not specified, the current active credentials will be used. | +| **role_session_name** | str | "lancedb-embeddings" | Optional name of the AWS IAM role session to use for calling the Bedrock service. If not specified, a "lancedb-embeddings" name will be used. | +| **runtime** | bool | True | Optional choice of getting different client to perform operations with the Amazon Bedrock service. | +| **max_retries** | int | 7 | Optional number of retries to perform when a request fails. | + +Usage Example: + +```python +import lancedb +from lancedb.pydantic import LanceModel, Vector +from lancedb.embeddings import get_registry +import pandas as pd + +model = get_registry().get("bedrock-text").create() + +class TextModel(LanceModel): + text: str = model.SourceField() + vector: Vector(model.ndims()) = model.VectorField() + +df = pd.DataFrame({"text": ["hello world", "goodbye world"]}) +db = lancedb.connect("tmp_path") +tbl = db.create_table("test", schema=TextModel, mode="overwrite") + +tbl.add(df) +rs = tbl.search("hello").limit(1).to_pandas() +``` diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md new file mode 100644 index 00000000..39eba18c --- /dev/null +++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md @@ -0,0 +1,62 @@ +# Cohere Embeddings + +Using cohere API requires cohere package, which can be installed using `pip install cohere`. Cohere embeddings are used to generate embeddings for text data. The embeddings can be used for various tasks like semantic search, clustering, and classification. +You also need to set the `COHERE_API_KEY` environment variable to use the Cohere API. 
+ +Supported models are: +* embed-english-v3.0 +* embed-multilingual-v3.0 +* embed-english-light-v3.0 +* embed-multilingual-light-v3.0 +* embed-english-v2.0 +* embed-english-light-v2.0 +* embed-multilingual-v2.0 + + +Supported parameters (to be passed in `create` method) are: + +| Parameter | Type | Default Value | Description | +|---|---|--------|---------| +| `name` | `str` | `"embed-english-v2.0"` | The model ID of the cohere model to use. Supported base models for Text Embeddings: embed-english-v3.0, embed-multilingual-v3.0, embed-english-light-v3.0, embed-multilingual-light-v3.0, embed-english-v2.0, embed-english-light-v2.0, embed-multilingual-v2.0 | +| `source_input_type` | `str` | `"search_document"` | The type of input data to be used for the source column. | +| `query_input_type` | `str` | `"search_query"` | The type of input data to be used for the query. | + +Cohere supports following input types: + +| Input Type | Description | +|-------------------------|---------------------------------------| +| "`search_document`" | Used for embeddings stored in a vector| +| | database for search use-cases. | +| "`search_query`" | Used for embeddings of search queries | +| | run against a vector DB | +| "`semantic_similarity`" | Specifies the given text will be used | +| | for Semantic Textual Similarity (STS) | +| "`classification`" | Used for embeddings passed through a | +| | text classifier. 
| +| "`clustering`" | Used for the embeddings run through a | +| | clustering algorithm | + +Usage Example: + +```python + import lancedb + from lancedb.pydantic import LanceModel, Vector + from lancedb.embeddings import EmbeddingFunctionRegistry + + cohere = EmbeddingFunctionRegistry + .get_instance() + .get("cohere") + .create(name="embed-multilingual-v2.0") + + class TextModel(LanceModel): + text: str = cohere.SourceField() + vector: Vector(cohere.ndims()) = cohere.VectorField() + + data = [ { "text": "hello world" }, + { "text": "goodbye world" }] + + db = lancedb.connect("~/.lancedb") + tbl = db.create_table("test", schema=TextModel, mode="overwrite") + + tbl.add(data) +``` \ No newline at end of file diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md new file mode 100644 index 00000000..551c8327 --- /dev/null +++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/gemini_embedding.md @@ -0,0 +1,35 @@ +# Gemini Embeddings +With Google's Gemini, you can represent text (words, sentences, and blocks of text) in a vectorized form, making it easier to compare and contrast embeddings. For example, two texts that share a similar subject matter or sentiment should have similar embeddings, which can be identified through mathematical comparison techniques such as cosine similarity. For more on how and why you should use embeddings, refer to the Embeddings guide. +The Gemini Embedding Model API supports various task types: + +| Task Type | Description | +|-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------| +| "`retrieval_query`" | Specifies the given text is a query in a search/retrieval setting. 
| +| "`retrieval_document`" | Specifies the given text is a document in a search/retrieval setting. Using this task type requires a title but is automatically provided by Embeddings API | +| "`semantic_similarity`" | Specifies the given text will be used for Semantic Textual Similarity (STS). | +| "`classification`" | Specifies that the embeddings will be used for classification. | +| "`clustering`" | Specifies that the embeddings will be used for clustering. | + + +Usage Example: + +```python +import lancedb +import pandas as pd +from lancedb.pydantic import LanceModel, Vector +from lancedb.embeddings import get_registry + + +model = get_registry().get("gemini-text").create() + +class TextModel(LanceModel): + text: str = model.SourceField() + vector: Vector(model.ndims()) = model.VectorField() + +df = pd.DataFrame({"text": ["hello world", "goodbye world"]}) +db = lancedb.connect("~/.lancedb") +tbl = db.create_table("test", schema=TextModel, mode="overwrite") + +tbl.add(df) +rs = tbl.search("hello").limit(1).to_pandas() +``` \ No newline at end of file diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md new file mode 100644 index 00000000..80502b49 --- /dev/null +++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/huggingface_embedding.md @@ -0,0 +1,24 @@ +# Huggingface embedding models +We offer support for all huggingface models (which can be loaded via [transformers](https://huggingface.co/docs/transformers/en/index) library). 
The default model is `colbert-ir/colbertv2.0` which also has its own special callout - `registry.get("colbert")` + +Example usage - +```python +import lancedb +import pandas as pd + +from lancedb.embeddings import get_registry +from lancedb.pydantic import LanceModel, Vector + +model = get_registry().get("huggingface").create(name='facebook/bart-base') + +class Words(LanceModel): + text: str = model.SourceField() + vector: Vector(model.ndims()) = model.VectorField() + +df = pd.DataFrame({"text": ["hi hello sayonara", "goodbye world"]}) +table = db.create_table("greets", schema=Words) +table.add(df) +query = "old greeting" +actual = table.search(query).limit(1).to_pydantic(Words)[0] +print(actual.text) +``` \ No newline at end of file diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md new file mode 100644 index 00000000..d98fdeef --- /dev/null +++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md @@ -0,0 +1,75 @@ +# IBM watsonx.ai Embeddings + +Generate text embeddings using IBM's watsonx.ai platform. + +## Supported Models + +You can find a list of supported models at [IBM watsonx.ai Documentation](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models-embed.html?context=wx). 
The currently supported model names are: + +- `ibm/slate-125m-english-rtrvr` +- `ibm/slate-30m-english-rtrvr` +- `sentence-transformers/all-minilm-l12-v2` +- `intfloat/multilingual-e5-large` + +## Parameters + +The following parameters can be passed to the `create` method: + +| Parameter | Type | Default Value | Description | +|------------|----------|----------------------------------|-----------------------------------------------------------| +| name | str | "ibm/slate-125m-english-rtrvr" | The model ID of the watsonx.ai model to use | +| api_key | str | None | Optional IBM Cloud API key (or set `WATSONX_API_KEY`) | +| project_id | str | None | Optional watsonx project ID (or set `WATSONX_PROJECT_ID`) | +| url | str | None | Optional custom URL for the watsonx.ai instance | +| params | dict | None | Optional additional parameters for the embedding model | + +## Usage Example + +First, the watsonx.ai library is an optional dependency, so it must be installed separately: + +``` +pip install ibm-watsonx-ai +``` + +Optionally set environment variables (if not passing credentials to `create` directly): + +```sh +export WATSONX_API_KEY="YOUR_WATSONX_API_KEY" +export WATSONX_PROJECT_ID="YOUR_WATSONX_PROJECT_ID" +``` + +```python +import os +import lancedb +from lancedb.pydantic import LanceModel, Vector +from lancedb.embeddings import EmbeddingFunctionRegistry + +watsonx_embed = EmbeddingFunctionRegistry + .get_instance() + .get("watsonx") + .create( + name="ibm/slate-125m-english-rtrvr", + # Uncomment and set these if not using environment variables + # api_key="your_api_key_here", + # project_id="your_project_id_here", + # url="your_watsonx_url_here", + # params={...}, + ) + +class TextModel(LanceModel): + text: str = watsonx_embed.SourceField() + vector: Vector(watsonx_embed.ndims()) = watsonx_embed.VectorField() + +data = [ + {"text": "hello world"}, + {"text": "goodbye world"}, +] + +db = lancedb.connect("~/.lancedb") +tbl = db.create_table("watsonx_test", 
schema=TextModel, mode="overwrite") + +tbl.add(data) + +rs = tbl.search("hello").limit(1).to_pandas() +print(rs) +``` \ No newline at end of file diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md new file mode 100644 index 00000000..30662f21 --- /dev/null +++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/instructor_embedding.md @@ -0,0 +1,50 @@ +# Instructor Embeddings +[Instructor](https://instructor-embedding.github.io/) is an instruction-finetuned text embedding model that can generate text embeddings tailored to any task (e.g. classification, retrieval, clustering, text evaluation, etc.) and domains (e.g. science, finance, etc.) by simply providing the task instruction, without any finetuning. + +If you want to calculate customized embeddings for specific sentences, you can follow the unified template to write instructions. + +!!! info + Represent the `domain` `text_type` for `task_objective`: + + * `domain` is optional, and it specifies the domain of the text, e.g. science, finance, medicine, etc. + * `text_type` is required, and it specifies the encoding unit, e.g. sentence, document, paragraph, etc. + * `task_objective` is optional, and it specifies the objective of embedding, e.g. retrieve a document, classify the sentence, etc. + +More information about the model can be found at the [source URL](https://github.com/xlang-ai/instructor-embedding). 
+ +| Argument | Type | Default | Description | +|---|---|---|---| +| `name` | `str` | "hkunlp/instructor-base" | The name of the model to use | +| `batch_size` | `int` | `32` | The batch size to use when generating embeddings | +| `device` | `str` | `"cpu"` | The device to use when generating embeddings | +| `show_progress_bar` | `bool` | `True` | Whether to show a progress bar when generating embeddings | +| `normalize_embeddings` | `bool` | `True` | Whether to normalize the embeddings | +| `quantize` | `bool` | `False` | Whether to quantize the model | +| `source_instruction` | `str` | `"represent the docuement for retreival"` | The instruction for the source column | +| `query_instruction` | `str` | `"represent the document for retreiving the most similar documents"` | The instruction for the query | + + + +```python +import lancedb +from lancedb.pydantic import LanceModel, Vector +from lancedb.embeddings import get_registry, InstuctorEmbeddingFunction + +instructor = get_registry().get("instructor").create( + source_instruction="represent the docuement for retreival", + query_instruction="represent the document for retreiving the most similar documents" + ) + +class Schema(LanceModel): + vector: Vector(instructor.ndims()) = instructor.VectorField() + text: str = instructor.SourceField() + +db = lancedb.connect("~/.lancedb") +tbl = db.create_table("test", schema=Schema, mode="overwrite") + +texts = [{"text": "Capitalism has been dominant in the Western world since the end of feudalism, but most feel[who?] 
that..."}, + {"text": "The disparate impact theory is especially controversial under the Fair Housing Act because the Act..."}, + {"text": "Disparate impact in United States labor law refers to practices in employment, housing, and other areas that.."}] + +tbl.add(texts) +``` \ No newline at end of file diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md new file mode 100644 index 00000000..dc194c5d --- /dev/null +++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md @@ -0,0 +1,39 @@ +# Jina Embeddings + +Jina embeddings are used to generate embeddings for text and image data. +You also need to set the `JINA_API_KEY` environment variable to use the Jina API. + +You can find a list of supported models under [https://jina.ai/embeddings/](https://jina.ai/embeddings/) + +Supported parameters (to be passed in `create` method) are: + +| Parameter | Type | Default Value | Description | +|---|---|---|---| +| `name` | `str` | `"jina-clip-v1"` | The model ID of the jina model to use | + +Usage Example: + +```python + import os + import lancedb + from lancedb.pydantic import LanceModel, Vector + from lancedb.embeddings import EmbeddingFunctionRegistry + + os.environ['JINA_API_KEY'] = 'jina_*' + + jina_embed = EmbeddingFunctionRegistry.get_instance().get("jina").create(name="jina-embeddings-v2-base-en") + + + class TextModel(LanceModel): + text: str = jina_embed.SourceField() + vector: Vector(jina_embed.ndims()) = jina_embed.VectorField() + + + data = [{"text": "hello world"}, + {"text": "goodbye world"}] + + db = lancedb.connect("~/.lancedb-2") + tbl = db.create_table("test", schema=TextModel, mode="overwrite") + + tbl.add(data) +``` diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md 
b/docs/src/embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md new file mode 100644 index 00000000..3b8cfcce --- /dev/null +++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/ollama_embedding.md @@ -0,0 +1,37 @@ +# Ollama embeddings + +Generate embeddings via the [ollama](https://github.com/ollama/ollama-python) Python library. More details: + +- [Ollama docs on embeddings](https://github.com/ollama/ollama/blob/main/docs/api.md#generate-embeddings) +- [Ollama blog on embeddings](https://ollama.com/blog/embedding-models) + +| Parameter | Type | Default Value | Description | +|------------------------|----------------------------|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| +| `name` | `str` | `nomic-embed-text` | The name of the model. | +| `host` | `str` | `http://localhost:11434` | The Ollama host to connect to. | +| `options` | `ollama.Options` or `dict` | `None` | Additional model parameters listed in the documentation for the Modelfile such as `temperature`. | +| `keep_alive` | `float` or `str` | `"5m"` | Controls how long the model will stay loaded into memory following the request. | +| `ollama_client_kwargs` | `dict` | `{}` | kwargs that can be passed to the `ollama.Client`. 
| + +```python +import lancedb +from lancedb.pydantic import LanceModel, Vector +from lancedb.embeddings import get_registry + +db = lancedb.connect("/tmp/db") +func = get_registry().get("ollama").create(name="nomic-embed-text") + +class Words(LanceModel): + text: str = func.SourceField() + vector: Vector(func.ndims()) = func.VectorField() + +table = db.create_table("words", schema=Words, mode="overwrite") +table.add([ + {"text": "hello world"}, + {"text": "goodbye world"} +]) + +query = "greetings" +actual = table.search(query).limit(1).to_pydantic(Words)[0] +print(actual.text) +``` diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md new file mode 100644 index 00000000..87fd28f1 --- /dev/null +++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/openai_embedding.md @@ -0,0 +1,34 @@ +# OpenAI embeddings + +LanceDB registers the OpenAI embeddings function in the registry by default, as `openai`. Below are the parameters that you can customize when creating the instances: + +| Parameter | Type | Default Value | Description | +|---|---|---|---| +| `name` | `str` | `"text-embedding-ada-002"` | The name of the model. | +| `dim` | `int` | Model default | For OpenAI's newer text-embedding-3 model, we can specify a dimensionality that is smaller than the 1536 size. 
Use this parameter to request that smaller dimensionality. | + + +```python +import lancedb +from lancedb.pydantic import LanceModel, Vector +from lancedb.embeddings import get_registry + +db = lancedb.connect("/tmp/db") +func = get_registry().get("openai").create(name="text-embedding-ada-002") + +class Words(LanceModel): + text: str = func.SourceField() + vector: Vector(func.ndims()) = func.VectorField() + +table = db.create_table("words", schema=Words, mode="overwrite") +table.add( + [ + {"text": "hello world"}, + {"text": "goodbye world"} + ] + ) + +query = "greetings" +actual = table.search(query).limit(1).to_pydantic(Words)[0] +print(actual.text) +``` \ No newline at end of file diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md new file mode 100644 index 00000000..1adff158 --- /dev/null +++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/sentence_transformers.md @@ -0,0 +1,174 @@ +# Sentence transformers +Allows you to set parameters when registering a `sentence-transformers` object. + +!!! info + Sentence transformer embeddings are normalized by default. It is recommended to use normalized embeddings for similarity search. + +| Parameter | Type | Default Value | Description | +|---|---|---|---| +| `name` | `str` | `all-MiniLM-L6-v2` | The name of the model | +| `device` | `str` | `cpu` | The device to run the model on (can be `cpu` or `gpu`) | +| `normalize` | `bool` | `True` | Whether to normalize the embeddings produced by the model | +| `trust_remote_code` | `bool` | `False` | Whether to trust and execute remote code from the model's Huggingface repository | + + +??? "Check out available sentence-transformer models here!" 
+ ```markdown + - sentence-transformers/all-MiniLM-L12-v2 + - sentence-transformers/paraphrase-mpnet-base-v2 + - sentence-transformers/gtr-t5-base + - sentence-transformers/LaBSE + - sentence-transformers/all-MiniLM-L6-v2 + - sentence-transformers/bert-base-nli-max-tokens + - sentence-transformers/bert-base-nli-mean-tokens + - sentence-transformers/bert-base-nli-stsb-mean-tokens + - sentence-transformers/bert-base-wikipedia-sections-mean-tokens + - sentence-transformers/bert-large-nli-cls-token + - sentence-transformers/bert-large-nli-max-tokens + - sentence-transformers/bert-large-nli-mean-tokens + - sentence-transformers/bert-large-nli-stsb-mean-tokens + - sentence-transformers/distilbert-base-nli-max-tokens + - sentence-transformers/distilbert-base-nli-mean-tokens + - sentence-transformers/distilbert-base-nli-stsb-mean-tokens + - sentence-transformers/distilroberta-base-msmarco-v1 + - sentence-transformers/distilroberta-base-msmarco-v2 + - sentence-transformers/nli-bert-base-cls-pooling + - sentence-transformers/nli-bert-base-max-pooling + - sentence-transformers/nli-bert-base + - sentence-transformers/nli-bert-large-cls-pooling + - sentence-transformers/nli-bert-large-max-pooling + - sentence-transformers/nli-bert-large + - sentence-transformers/nli-distilbert-base-max-pooling + - sentence-transformers/nli-distilbert-base + - sentence-transformers/nli-roberta-base + - sentence-transformers/nli-roberta-large + - sentence-transformers/roberta-base-nli-mean-tokens + - sentence-transformers/roberta-base-nli-stsb-mean-tokens + - sentence-transformers/roberta-large-nli-mean-tokens + - sentence-transformers/roberta-large-nli-stsb-mean-tokens + - sentence-transformers/stsb-bert-base + - sentence-transformers/stsb-bert-large + - sentence-transformers/stsb-distilbert-base + - sentence-transformers/stsb-roberta-base + - sentence-transformers/stsb-roberta-large + - sentence-transformers/xlm-r-100langs-bert-base-nli-mean-tokens + - 
sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens + - sentence-transformers/xlm-r-base-en-ko-nli-ststb + - sentence-transformers/xlm-r-bert-base-nli-mean-tokens + - sentence-transformers/xlm-r-bert-base-nli-stsb-mean-tokens + - sentence-transformers/xlm-r-large-en-ko-nli-ststb + - sentence-transformers/bert-base-nli-cls-token + - sentence-transformers/all-distilroberta-v1 + - sentence-transformers/multi-qa-MiniLM-L6-dot-v1 + - sentence-transformers/multi-qa-distilbert-cos-v1 + - sentence-transformers/multi-qa-distilbert-dot-v1 + - sentence-transformers/multi-qa-mpnet-base-cos-v1 + - sentence-transformers/multi-qa-mpnet-base-dot-v1 + - sentence-transformers/nli-distilroberta-base-v2 + - sentence-transformers/all-MiniLM-L6-v1 + - sentence-transformers/all-mpnet-base-v1 + - sentence-transformers/all-mpnet-base-v2 + - sentence-transformers/all-roberta-large-v1 + - sentence-transformers/allenai-specter + - sentence-transformers/average_word_embeddings_glove.6B.300d + - sentence-transformers/average_word_embeddings_glove.840B.300d + - sentence-transformers/average_word_embeddings_komninos + - sentence-transformers/average_word_embeddings_levy_dependency + - sentence-transformers/clip-ViT-B-32-multilingual-v1 + - sentence-transformers/clip-ViT-B-32 + - sentence-transformers/distilbert-base-nli-stsb-quora-ranking + - sentence-transformers/distilbert-multilingual-nli-stsb-quora-ranking + - sentence-transformers/distilroberta-base-paraphrase-v1 + - sentence-transformers/distiluse-base-multilingual-cased-v1 + - sentence-transformers/distiluse-base-multilingual-cased-v2 + - sentence-transformers/distiluse-base-multilingual-cased + - sentence-transformers/facebook-dpr-ctx_encoder-multiset-base + - sentence-transformers/facebook-dpr-ctx_encoder-single-nq-base + - sentence-transformers/facebook-dpr-question_encoder-multiset-base + - sentence-transformers/facebook-dpr-question_encoder-single-nq-base + - sentence-transformers/gtr-t5-large + - 
sentence-transformers/gtr-t5-xl + - sentence-transformers/gtr-t5-xxl + - sentence-transformers/msmarco-MiniLM-L-12-v3 + - sentence-transformers/msmarco-MiniLM-L-6-v3 + - sentence-transformers/msmarco-MiniLM-L12-cos-v5 + - sentence-transformers/msmarco-MiniLM-L6-cos-v5 + - sentence-transformers/msmarco-bert-base-dot-v5 + - sentence-transformers/msmarco-bert-co-condensor + - sentence-transformers/msmarco-distilbert-base-dot-prod-v3 + - sentence-transformers/msmarco-distilbert-base-tas-b + - sentence-transformers/msmarco-distilbert-base-v2 + - sentence-transformers/msmarco-distilbert-base-v3 + - sentence-transformers/msmarco-distilbert-base-v4 + - sentence-transformers/msmarco-distilbert-cos-v5 + - sentence-transformers/msmarco-distilbert-dot-v5 + - sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-lng-aligned + - sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-trained-scratch + - sentence-transformers/msmarco-distilroberta-base-v2 + - sentence-transformers/msmarco-roberta-base-ance-firstp + - sentence-transformers/msmarco-roberta-base-v2 + - sentence-transformers/msmarco-roberta-base-v3 + - sentence-transformers/multi-qa-MiniLM-L6-cos-v1 + - sentence-transformers/nli-mpnet-base-v2 + - sentence-transformers/nli-roberta-base-v2 + - sentence-transformers/nq-distilbert-base-v1 + - sentence-transformers/paraphrase-MiniLM-L12-v2 + - sentence-transformers/paraphrase-MiniLM-L3-v2 + - sentence-transformers/paraphrase-MiniLM-L6-v2 + - sentence-transformers/paraphrase-TinyBERT-L6-v2 + - sentence-transformers/paraphrase-albert-base-v2 + - sentence-transformers/paraphrase-albert-small-v2 + - sentence-transformers/paraphrase-distilroberta-base-v1 + - sentence-transformers/paraphrase-distilroberta-base-v2 + - sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 + - sentence-transformers/paraphrase-multilingual-mpnet-base-v2 + - sentence-transformers/paraphrase-xlm-r-multilingual-v1 + - sentence-transformers/quora-distilbert-base + - 
sentence-transformers/quora-distilbert-multilingual + - sentence-transformers/sentence-t5-base + - sentence-transformers/sentence-t5-large + - sentence-transformers/sentence-t5-xxl + - sentence-transformers/sentence-t5-xl + - sentence-transformers/stsb-distilroberta-base-v2 + - sentence-transformers/stsb-mpnet-base-v2 + - sentence-transformers/stsb-roberta-base-v2 + - sentence-transformers/stsb-xlm-r-multilingual + - sentence-transformers/xlm-r-distilroberta-base-paraphrase-v1 + - sentence-transformers/clip-ViT-L-14 + - sentence-transformers/clip-ViT-B-16 + - sentence-transformers/use-cmlm-multilingual + - sentence-transformers/all-MiniLM-L12-v1 + ``` + +!!! info + You can also load many other model architectures from the library. For example, you can use models from sources such as BAAI, Nomic, Salesforce Research, etc. + See the Hugging Face Hub page for all [supported models](https://huggingface.co/models?library=sentence-transformers). + +!!! note "BAAI Embeddings example" + Here is an example that uses a BAAI embedding model from the Hugging Face Hub [supported models](https://huggingface.co/models?library=sentence-transformers): + ```python + import lancedb + from lancedb.pydantic import LanceModel, Vector + from lancedb.embeddings import get_registry + + db = lancedb.connect("/tmp/db") + model = get_registry().get("sentence-transformers").create(name="BAAI/bge-small-en-v1.5", device="cpu") + + class Words(LanceModel): + text: str = model.SourceField() + vector: Vector(model.ndims()) = model.VectorField() + + table = db.create_table("words", schema=Words) + table.add( + [ + {"text": "hello world"}, + {"text": "goodbye world"} + ] + ) + + query = "greetings" + actual = table.search(query).limit(1).to_pydantic(Words)[0] + print(actual.text) + ``` +Visit the sentence-transformers [Hugging Face Hub](https://huggingface.co/sentence-transformers) page for more information on the available models. 
+ diff --git a/docs/src/embeddings/default_embedding_functions.md b/docs/src/embeddings/default_embedding_functions.md index 95122437..ced97048 100644 --- a/docs/src/embeddings/default_embedding_functions.md +++ b/docs/src/embeddings/default_embedding_functions.md @@ -6,795 +6,25 @@ Contains the text embedding functions registered by default. * Embedding functions have an inbuilt rate limit handler wrapper for source and query embedding function calls that retry with exponential backoff. * Each `EmbeddingFunction` implementation automatically takes `max_retries` as an argument which has the default value of 7. -### Sentence transformers -Allows you to set parameters when registering a `sentence-transformers` object. +**Available Text Embeddings**: + +- [Sentence Transformers](available_embedding_models/text_embedding_functions/sentence_transformers.md) +- [Huggingface Embedding Models](available_embedding_models/text_embedding_functions/huggingface_embedding.md) +- [Ollama Embeddings](available_embedding_models/text_embedding_functions/ollama_embedding.md) +- [OpenAI Embeddings](available_embedding_models/text_embedding_functions/openai_embedding.md) +- [Instructor Embeddings](available_embedding_models/text_embedding_functions/instructor_embedding.md) +- [Gemini Embeddings](available_embedding_models/text_embedding_functions/gemini_embedding.md) +- [Cohere Embeddings](available_embedding_models/text_embedding_functions/cohere_embedding.md) +- [Jina Embeddings](available_embedding_models/text_embedding_functions/jina_embedding.md) +- [AWS Bedrock Text Embedding Functions](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md) +- [IBM Watsonx.ai Embeddings](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md) -!!! info - Sentence transformer embeddings are normalized by default. It is recommended to use normalized embeddings for similarity search. 
- -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| `name` | `str` | `all-MiniLM-L6-v2` | The name of the model | -| `device` | `str` | `cpu` | The device to run the model on (can be `cpu` or `gpu`) | -| `normalize` | `bool` | `True` | Whether to normalize the input text before feeding it to the model | -| `trust_remote_code` | `bool` | `False` | Whether to trust and execute remote code from the model's Huggingface repository | - - -??? "Check out available sentence-transformer models here!" - ```markdown - - sentence-transformers/all-MiniLM-L12-v2 - - sentence-transformers/paraphrase-mpnet-base-v2 - - sentence-transformers/gtr-t5-base - - sentence-transformers/LaBSE - - sentence-transformers/all-MiniLM-L6-v2 - - sentence-transformers/bert-base-nli-max-tokens - - sentence-transformers/bert-base-nli-mean-tokens - - sentence-transformers/bert-base-nli-stsb-mean-tokens - - sentence-transformers/bert-base-wikipedia-sections-mean-tokens - - sentence-transformers/bert-large-nli-cls-token - - sentence-transformers/bert-large-nli-max-tokens - - sentence-transformers/bert-large-nli-mean-tokens - - sentence-transformers/bert-large-nli-stsb-mean-tokens - - sentence-transformers/distilbert-base-nli-max-tokens - - sentence-transformers/distilbert-base-nli-mean-tokens - - sentence-transformers/distilbert-base-nli-stsb-mean-tokens - - sentence-transformers/distilroberta-base-msmarco-v1 - - sentence-transformers/distilroberta-base-msmarco-v2 - - sentence-transformers/nli-bert-base-cls-pooling - - sentence-transformers/nli-bert-base-max-pooling - - sentence-transformers/nli-bert-base - - sentence-transformers/nli-bert-large-cls-pooling - - sentence-transformers/nli-bert-large-max-pooling - - sentence-transformers/nli-bert-large - - sentence-transformers/nli-distilbert-base-max-pooling - - sentence-transformers/nli-distilbert-base - - sentence-transformers/nli-roberta-base - - sentence-transformers/nli-roberta-large - - 
sentence-transformers/roberta-base-nli-mean-tokens - - sentence-transformers/roberta-base-nli-stsb-mean-tokens - - sentence-transformers/roberta-large-nli-mean-tokens - - sentence-transformers/roberta-large-nli-stsb-mean-tokens - - sentence-transformers/stsb-bert-base - - sentence-transformers/stsb-bert-large - - sentence-transformers/stsb-distilbert-base - - sentence-transformers/stsb-roberta-base - - sentence-transformers/stsb-roberta-large - - sentence-transformers/xlm-r-100langs-bert-base-nli-mean-tokens - - sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens - - sentence-transformers/xlm-r-base-en-ko-nli-ststb - - sentence-transformers/xlm-r-bert-base-nli-mean-tokens - - sentence-transformers/xlm-r-bert-base-nli-stsb-mean-tokens - - sentence-transformers/xlm-r-large-en-ko-nli-ststb - - sentence-transformers/bert-base-nli-cls-token - - sentence-transformers/all-distilroberta-v1 - - sentence-transformers/multi-qa-MiniLM-L6-dot-v1 - - sentence-transformers/multi-qa-distilbert-cos-v1 - - sentence-transformers/multi-qa-distilbert-dot-v1 - - sentence-transformers/multi-qa-mpnet-base-cos-v1 - - sentence-transformers/multi-qa-mpnet-base-dot-v1 - - sentence-transformers/nli-distilroberta-base-v2 - - sentence-transformers/all-MiniLM-L6-v1 - - sentence-transformers/all-mpnet-base-v1 - - sentence-transformers/all-mpnet-base-v2 - - sentence-transformers/all-roberta-large-v1 - - sentence-transformers/allenai-specter - - sentence-transformers/average_word_embeddings_glove.6B.300d - - sentence-transformers/average_word_embeddings_glove.840B.300d - - sentence-transformers/average_word_embeddings_komninos - - sentence-transformers/average_word_embeddings_levy_dependency - - sentence-transformers/clip-ViT-B-32-multilingual-v1 - - sentence-transformers/clip-ViT-B-32 - - sentence-transformers/distilbert-base-nli-stsb-quora-ranking - - sentence-transformers/distilbert-multilingual-nli-stsb-quora-ranking - - sentence-transformers/distilroberta-base-paraphrase-v1 - - 
sentence-transformers/distiluse-base-multilingual-cased-v1 - - sentence-transformers/distiluse-base-multilingual-cased-v2 - - sentence-transformers/distiluse-base-multilingual-cased - - sentence-transformers/facebook-dpr-ctx_encoder-multiset-base - - sentence-transformers/facebook-dpr-ctx_encoder-single-nq-base - - sentence-transformers/facebook-dpr-question_encoder-multiset-base - - sentence-transformers/facebook-dpr-question_encoder-single-nq-base - - sentence-transformers/gtr-t5-large - - sentence-transformers/gtr-t5-xl - - sentence-transformers/gtr-t5-xxl - - sentence-transformers/msmarco-MiniLM-L-12-v3 - - sentence-transformers/msmarco-MiniLM-L-6-v3 - - sentence-transformers/msmarco-MiniLM-L12-cos-v5 - - sentence-transformers/msmarco-MiniLM-L6-cos-v5 - - sentence-transformers/msmarco-bert-base-dot-v5 - - sentence-transformers/msmarco-bert-co-condensor - - sentence-transformers/msmarco-distilbert-base-dot-prod-v3 - - sentence-transformers/msmarco-distilbert-base-tas-b - - sentence-transformers/msmarco-distilbert-base-v2 - - sentence-transformers/msmarco-distilbert-base-v3 - - sentence-transformers/msmarco-distilbert-base-v4 - - sentence-transformers/msmarco-distilbert-cos-v5 - - sentence-transformers/msmarco-distilbert-dot-v5 - - sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-lng-aligned - - sentence-transformers/msmarco-distilbert-multilingual-en-de-v2-tmp-trained-scratch - - sentence-transformers/msmarco-distilroberta-base-v2 - - sentence-transformers/msmarco-roberta-base-ance-firstp - - sentence-transformers/msmarco-roberta-base-v2 - - sentence-transformers/msmarco-roberta-base-v3 - - sentence-transformers/multi-qa-MiniLM-L6-cos-v1 - - sentence-transformers/nli-mpnet-base-v2 - - sentence-transformers/nli-roberta-base-v2 - - sentence-transformers/nq-distilbert-base-v1 - - sentence-transformers/paraphrase-MiniLM-L12-v2 - - sentence-transformers/paraphrase-MiniLM-L3-v2 - - sentence-transformers/paraphrase-MiniLM-L6-v2 - - 
sentence-transformers/paraphrase-TinyBERT-L6-v2 - - sentence-transformers/paraphrase-albert-base-v2 - - sentence-transformers/paraphrase-albert-small-v2 - - sentence-transformers/paraphrase-distilroberta-base-v1 - - sentence-transformers/paraphrase-distilroberta-base-v2 - - sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 - - sentence-transformers/paraphrase-multilingual-mpnet-base-v2 - - sentence-transformers/paraphrase-xlm-r-multilingual-v1 - - sentence-transformers/quora-distilbert-base - - sentence-transformers/quora-distilbert-multilingual - - sentence-transformers/sentence-t5-base - - sentence-transformers/sentence-t5-large - - sentence-transformers/sentence-t5-xxl - - sentence-transformers/sentence-t5-xl - - sentence-transformers/stsb-distilroberta-base-v2 - - sentence-transformers/stsb-mpnet-base-v2 - - sentence-transformers/stsb-roberta-base-v2 - - sentence-transformers/stsb-xlm-r-multilingual - - sentence-transformers/xlm-r-distilroberta-base-paraphrase-v1 - - sentence-transformers/clip-ViT-L-14 - - sentence-transformers/clip-ViT-B-16 - - sentence-transformers/use-cmlm-multilingual - - sentence-transformers/all-MiniLM-L12-v1 - ``` - -!!! info - You can also load many other model architectures from the library. For example models from sources such as BAAI, nomic, salesforce research, etc. - See this HF hub page for all [supported models](https://huggingface.co/models?library=sentence-transformers). - -!!! 
note "BAAI Embeddings example" - Here is an example that uses BAAI embedding model from the HuggingFace Hub [supported models](https://huggingface.co/models?library=sentence-transformers) - ```python - import lancedb - from lancedb.pydantic import LanceModel, Vector - from lancedb.embeddings import get_registry - - db = lancedb.connect("/tmp/db") - model = get_registry().get("sentence-transformers").create(name="BAAI/bge-small-en-v1.5", device="cpu") - - class Words(LanceModel): - text: str = model.SourceField() - vector: Vector(model.ndims()) = model.VectorField() - - table = db.create_table("words", schema=Words) - table.add( - [ - {"text": "hello world"}, - {"text": "goodbye world"} - ] - ) - - query = "greetings" - actual = table.search(query).limit(1).to_pydantic(Words)[0] - print(actual.text) - ``` -Visit sentence-transformers [HuggingFace HUB](https://huggingface.co/sentence-transformers) page for more information on the available models. - - -### Huggingface embedding models -We offer support for all huggingface models (which can be loaded via [transformers](https://huggingface.co/docs/transformers/en/index) library). The default model is `colbert-ir/colbertv2.0` which also has its own special callout - `registry.get("colbert")` - -Example usage - -```python -import lancedb -import pandas as pd - -from lancedb.embeddings import get_registry -from lancedb.pydantic import LanceModel, Vector - -model = get_registry().get("huggingface").create(name='facebook/bart-base') - -class Words(LanceModel): - text: str = model.SourceField() - vector: Vector(model.ndims()) = model.VectorField() - -df = pd.DataFrame({"text": ["hi hello sayonara", "goodbye world"]}) -table = db.create_table("greets", schema=Words) -table.add(df) -query = "old greeting" -actual = table.search(query).limit(1).to_pydantic(Words)[0] -print(actual.text) -``` - - -### Ollama embeddings -Generate embeddings via the [ollama](https://github.com/ollama/ollama-python) python library. 
More details: - -- [Ollama docs on embeddings](https://github.com/ollama/ollama/blob/main/docs/api.md#generate-embeddings) -- [Ollama blog on embeddings](https://ollama.com/blog/embedding-models) - -| Parameter | Type | Default Value | Description | -|------------------------|----------------------------|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| -| `name` | `str` | `nomic-embed-text` | The name of the model. | -| `host` | `str` | `http://localhost:11434` | The Ollama host to connect to. | -| `options` | `ollama.Options` or `dict` | `None` | Additional model parameters listed in the documentation for the Modelfile such as `temperature`. | -| `keep_alive` | `float` or `str` | `"5m"` | Controls how long the model will stay loaded into memory following the request. | -| `ollama_client_kwargs` | `dict` | `{}` | kwargs that can be past to the `ollama.Client`. | - -```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry - -db = lancedb.connect("/tmp/db") -func = get_registry().get("ollama").create(name="nomic-embed-text") - -class Words(LanceModel): - text: str = func.SourceField() - vector: Vector(func.ndims()) = func.VectorField() - -table = db.create_table("words", schema=Words, mode="overwrite") -table.add([ - {"text": "hello world"}, - {"text": "goodbye world"} -]) - -query = "greetings" -actual = table.search(query).limit(1).to_pydantic(Words)[0] -print(actual.text) -``` - - -### OpenAI embeddings -LanceDB registers the OpenAI embeddings function in the registry by default, as `openai`. Below are the parameters that you can customize when creating the instances: - -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| `name` | `str` | `"text-embedding-ada-002"` | The name of the model. 
| -| `dim` | `int` | Model default | For OpenAI's newer text-embedding-3 model, we can specify a dimensionality that is smaller than the 1536 size. This feature supports it | - - -```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry - -db = lancedb.connect("/tmp/db") -func = get_registry().get("openai").create(name="text-embedding-ada-002") - -class Words(LanceModel): - text: str = func.SourceField() - vector: Vector(func.ndims()) = func.VectorField() - -table = db.create_table("words", schema=Words, mode="overwrite") -table.add( - [ - {"text": "hello world"}, - {"text": "goodbye world"} - ] - ) - -query = "greetings" -actual = table.search(query).limit(1).to_pydantic(Words)[0] -print(actual.text) -``` - -### Instructor Embeddings -[Instructor](https://instructor-embedding.github.io/) is an instruction-finetuned text embedding model that can generate text embeddings tailored to any task (e.g. classification, retrieval, clustering, text evaluation, etc.) and domains (e.g. science, finance, etc.) by simply providing the task instruction, without any finetuning. - -If you want to calculate customized embeddings for specific sentences, you can follow the unified template to write instructions. - -!!! info - Represent the `domain` `text_type` for `task_objective`: - - * `domain` is optional, and it specifies the domain of the text, e.g. science, finance, medicine, etc. - * `text_type` is required, and it specifies the encoding unit, e.g. sentence, document, paragraph, etc. - * `task_objective` is optional, and it specifies the objective of embedding, e.g. retrieve a document, classify the sentence, etc. - -More information about the model can be found at the [source URL](https://github.com/xlang-ai/instructor-embedding). 
- -| Argument | Type | Default | Description | -|---|---|---|---| -| `name` | `str` | "hkunlp/instructor-base" | The name of the model to use | -| `batch_size` | `int` | `32` | The batch size to use when generating embeddings | -| `device` | `str` | `"cpu"` | The device to use when generating embeddings | -| `show_progress_bar` | `bool` | `True` | Whether to show a progress bar when generating embeddings | -| `normalize_embeddings` | `bool` | `True` | Whether to normalize the embeddings | -| `quantize` | `bool` | `False` | Whether to quantize the model | -| `source_instruction` | `str` | `"represent the docuement for retreival"` | The instruction for the source column | -| `query_instruction` | `str` | `"represent the document for retreiving the most similar documents"` | The instruction for the query | - - - -```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry, InstuctorEmbeddingFunction - -instructor = get_registry().get("instructor").create( - source_instruction="represent the docuement for retreival", - query_instruction="represent the document for retreiving the most similar documents" - ) - -class Schema(LanceModel): - vector: Vector(instructor.ndims()) = instructor.VectorField() - text: str = instructor.SourceField() - -db = lancedb.connect("~/.lancedb") -tbl = db.create_table("test", schema=Schema, mode="overwrite") - -texts = [{"text": "Capitalism has been dominant in the Western world since the end of feudalism, but most feel[who?] that..."}, - {"text": "The disparate impact theory is especially controversial under the Fair Housing Act because the Act..."}, - {"text": "Disparate impact in United States labor law refers to practices in employment, housing, and other areas that.."}] - -tbl.add(texts) -``` - -### Gemini Embeddings -With Google's Gemini, you can represent text (words, sentences, and blocks of text) in a vectorized form, making it easier to compare and contrast embeddings. 
For example, two texts that share a similar subject matter or sentiment should have similar embeddings, which can be identified through mathematical comparison techniques such as cosine similarity. For more on how and why you should use embeddings, refer to the Embeddings guide. -The Gemini Embedding Model API supports various task types: - -| Task Type | Description | -|-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------| -| "`retrieval_query`" | Specifies the given text is a query in a search/retrieval setting. | -| "`retrieval_document`" | Specifies the given text is a document in a search/retrieval setting. Using this task type requires a title but is automatically proided by Embeddings API | -| "`semantic_similarity`" | Specifies the given text will be used for Semantic Textual Similarity (STS). | -| "`classification`" | Specifies that the embeddings will be used for classification. | -| "`clusering`" | Specifies that the embeddings will be used for clustering. | - - -Usage Example: - -```python -import lancedb -import pandas as pd -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry - - -model = get_registry().get("gemini-text").create() - -class TextModel(LanceModel): - text: str = model.SourceField() - vector: Vector(model.ndims()) = model.VectorField() - -df = pd.DataFrame({"text": ["hello world", "goodbye world"]}) -db = lancedb.connect("~/.lancedb") -tbl = db.create_table("test", schema=TextModel, mode="overwrite") - -tbl.add(df) -rs = tbl.search("hello").limit(1).to_pandas() -``` - -### Cohere Embeddings -Using cohere API requires cohere package, which can be installed using `pip install cohere`. Cohere embeddings are used to generate embeddings for text data. The embeddings can be used for various tasks like semantic search, clustering, and classification. 
-You also need to set the `COHERE_API_KEY` environment variable to use the Cohere API. - -Supported models are: -``` - * embed-english-v3.0 - * embed-multilingual-v3.0 - * embed-english-light-v3.0 - * embed-multilingual-light-v3.0 - * embed-english-v2.0 - * embed-english-light-v2.0 - * embed-multilingual-v2.0 -``` - -Supported parameters (to be passed in `create` method) are: - -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| `name` | `str` | `"embed-english-v2.0"` | The model ID of the cohere model to use. Supported base models for Text Embeddings: embed-english-v3.0, embed-multilingual-v3.0, embed-english-light-v3.0, embed-multilingual-light-v3.0, embed-english-v2.0, embed-english-light-v2.0, embed-multilingual-v2.0 | -| `source_input_type` | `str` | `"search_document"` | The type of input data to be used for the source column. | -| `query_input_type` | `str` | `"search_query"` | The type of input data to be used for the query. | - -Cohere supports following input types: - -| Input Type | Description | -|-------------------------|---------------------------------------| -| "`search_document`" | Used for embeddings stored in a vector| -| | database for search use-cases. | -| "`search_query`" | Used for embeddings of search queries | -| | run against a vector DB | -| "`semantic_similarity`" | Specifies the given text will be used | -| | for Semantic Textual Similarity (STS) | -| "`classification`" | Used for embeddings passed through a | -| | text classifier. 
| -| "`clustering`" | Used for the embeddings run through a | -| | clustering algorithm | - -Usage Example: - - ```python - import lancedb - from lancedb.pydantic import LanceModel, Vector - from lancedb.embeddings import EmbeddingFunctionRegistry - - cohere = EmbeddingFunctionRegistry - .get_instance() - .get("cohere") - .create(name="embed-multilingual-v2.0") - - class TextModel(LanceModel): - text: str = cohere.SourceField() - vector: Vector(cohere.ndims()) = cohere.VectorField() - - data = [ { "text": "hello world" }, - { "text": "goodbye world" }] - - db = lancedb.connect("~/.lancedb") - tbl = db.create_table("test", schema=TextModel, mode="overwrite") - - tbl.add(data) - ``` - -### Jina Embeddings -Jina embeddings are used to generate embeddings for text and image data. -You also need to set the `JINA_API_KEY` environment variable to use the Jina API. - -You can find a list of supported models under [https://jina.ai/embeddings/](https://jina.ai/embeddings/) - -Supported parameters (to be passed in `create` method) are: - -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| `name` | `str` | `"jina-clip-v1"` | The model ID of the jina model to use | - -Usage Example: - -```python - import os - import lancedb - from lancedb.pydantic import LanceModel, Vector - from lancedb.embeddings import EmbeddingFunctionRegistry - - os.environ['JINA_API_KEY'] = 'jina_*' - - jina_embed = EmbeddingFunctionRegistry.get_instance().get("jina").create(name="jina-embeddings-v2-base-en") - - - class TextModel(LanceModel): - text: str = jina_embed.SourceField() - vector: Vector(jina_embed.ndims()) = jina_embed.VectorField() - - - data = [{"text": "hello world"}, - {"text": "goodbye world"}] - - db = lancedb.connect("~/.lancedb-2") - tbl = db.create_table("test", schema=TextModel, mode="overwrite") - - tbl.add(data) -``` - -### AWS Bedrock Text Embedding Functions -AWS Bedrock supports multiple base models for generating text embeddings. 
You need to setup the AWS credentials to use this embedding function. -You can do so by using `awscli` and also add your session_token: -```shell -aws configure -aws configure set aws_session_token "" -``` -to ensure that the credentials are set up correctly, you can run the following command: -```shell -aws sts get-caller-identity -``` - -Supported Embedding modelIDs are: -* `amazon.titan-embed-text-v1` -* `cohere.embed-english-v3` -* `cohere.embed-multilingual-v3` - -Supported parameters (to be passed in `create` method) are: - -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| **name** | str | "amazon.titan-embed-text-v1" | The model ID of the bedrock model to use. Supported base models for Text Embeddings: amazon.titan-embed-text-v1, cohere.embed-english-v3, cohere.embed-multilingual-v3 | -| **region** | str | "us-east-1" | Optional name of the AWS Region in which the service should be called (e.g., "us-east-1"). | -| **profile_name** | str | None | Optional name of the AWS profile to use for calling the Bedrock service. If not specified, the default profile will be used. | -| **assumed_role** | str | None | Optional ARN of an AWS IAM role to assume for calling the Bedrock service. If not specified, the current active credentials will be used. | -| **role_session_name** | str | "lancedb-embeddings" | Optional name of the AWS IAM role session to use for calling the Bedrock service. If not specified, a "lancedb-embeddings" name will be used. | -| **runtime** | bool | True | Optional choice of getting different client to perform operations with the Amazon Bedrock service. | -| **max_retries** | int | 7 | Optional number of retries to perform when a request fails. 
| - -Usage Example: - -```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry - -model = get_registry().get("bedrock-text").create() - -class TextModel(LanceModel): - text: str = model.SourceField() - vector: Vector(model.ndims()) = model.VectorField() - -df = pd.DataFrame({"text": ["hello world", "goodbye world"]}) -db = lancedb.connect("tmp_path") -tbl = db.create_table("test", schema=TextModel, mode="overwrite") - -tbl.add(df) -rs = tbl.search("hello").limit(1).to_pandas() -``` - -# IBM watsonx.ai Embeddings - -Generate text embeddings using IBM's watsonx.ai platform. - -## Supported Models - -You can find a list of supported models at [IBM watsonx.ai Documentation](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models-embed.html?context=wx). The currently supported model names are: - -- `ibm/slate-125m-english-rtrvr` -- `ibm/slate-30m-english-rtrvr` -- `sentence-transformers/all-minilm-l12-v2` -- `intfloat/multilingual-e5-large` - -## Parameters - -The following parameters can be passed to the `create` method: - -| Parameter | Type | Default Value | Description | -|------------|----------|----------------------------------|-----------------------------------------------------------| -| name | str | "ibm/slate-125m-english-rtrvr" | The model ID of the watsonx.ai model to use | -| api_key | str | None | Optional IBM Cloud API key (or set `WATSONX_API_KEY`) | -| project_id | str | None | Optional watsonx project ID (or set `WATSONX_PROJECT_ID`) | -| url | str | None | Optional custom URL for the watsonx.ai instance | -| params | dict | None | Optional additional parameters for the embedding model | - -## Usage Example - -First, the watsonx.ai library is an optional dependency, so must be installed seperately: - -``` -pip install ibm-watsonx-ai -``` - -Optionally set environment variables (if not passing credentials to `create` directly): - -```sh -export 
WATSONX_API_KEY="YOUR_WATSONX_API_KEY" -export WATSONX_PROJECT_ID="YOUR_WATSONX_PROJECT_ID" -``` - -```python -import os -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import EmbeddingFunctionRegistry - -watsonx_embed = EmbeddingFunctionRegistry - .get_instance() - .get("watsonx") - .create( - name="ibm/slate-125m-english-rtrvr", - # Uncomment and set these if not using environment variables - # api_key="your_api_key_here", - # project_id="your_project_id_here", - # url="your_watsonx_url_here", - # params={...}, - ) - -class TextModel(LanceModel): - text: str = watsonx_embed.SourceField() - vector: Vector(watsonx_embed.ndims()) = watsonx_embed.VectorField() - -data = [ - {"text": "hello world"}, - {"text": "goodbye world"}, -] - -db = lancedb.connect("~/.lancedb") -tbl = db.create_table("watsonx_test", schema=TextModel, mode="overwrite") - -tbl.add(data) - -rs = tbl.search("hello").limit(1).to_pandas() -print(rs) -``` ## Multi-modal embedding functions Multi-modal embedding functions allow you to query your table using both images and text. -### OpenClip embeddings -We support CLIP model embeddings using the open source alternative, [open-clip](https://github.com/mlfoundations/open_clip) which supports various customizations. It is registered as `open-clip` and supports the following customizations: +**Available Multi-modal Embeddings**: -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| `name` | `str` | `"ViT-B-32"` | The name of the model. | -| `pretrained` | `str` | `"laion2b_s34b_b79k"` | The name of the pretrained model to load. | -| `device` | `str` | `"cpu"` | The device to run the model on. Can be `"cpu"` or `"gpu"`. | -| `batch_size` | `int` | `64` | The number of images to process in a batch. | -| `normalize` | `bool` | `True` | Whether to normalize the input images before feeding them to the model. | - -This embedding function supports ingesting images as both bytes and urls.
You can query them using both test and other images. - -!!! info - LanceDB supports ingesting images directly from accessible links. - -```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry - -db = lancedb.connect(tmp_path) -func = get_registry.get("open-clip").create() - -class Images(LanceModel): - label: str - image_uri: str = func.SourceField() # image uri as the source - image_bytes: bytes = func.SourceField() # image bytes as the source - vector: Vector(func.ndims()) = func.VectorField() # vector column - vec_from_bytes: Vector(func.ndims()) = func.VectorField() # Another vector column - -table = db.create_table("images", schema=Images) -labels = ["cat", "cat", "dog", "dog", "horse", "horse"] -uris = [ - "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg", - "http://farm1.staticflickr.com/134/332220238_da527d8140_z.jpg", - "http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg", - "http://farm5.staticflickr.com/4092/5017326486_1f46057f5f_z.jpg", - "http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg", - "http://farm6.staticflickr.com/5142/5835678453_4f3a4edb45_z.jpg", -] -# get each uri as bytes -image_bytes = [requests.get(uri).content for uri in uris] -table.add( - pd.DataFrame({"label": labels, "image_uri": uris, "image_bytes": image_bytes}) -) -``` -Now we can search using text from both the default vector column and the custom vector column -```python - -# text search -actual = table.search("man's best friend").limit(1).to_pydantic(Images)[0] -print(actual.label) # prints "dog" - -frombytes = ( - table.search("man's best friend", vector_column_name="vec_from_bytes") - .limit(1) - .to_pydantic(Images)[0] -) -print(frombytes.label) - -``` - -Because we're using a multi-modal embedding function, we can also search using images - -```python -# image search -query_image_uri = "http://farm1.staticflickr.com/200/467715466_ed4a31801f_z.jpg" -image_bytes = 
requests.get(query_image_uri).content -query_image = Image.open(io.BytesIO(image_bytes)) -actual = table.search(query_image).limit(1).to_pydantic(Images)[0] -print(actual.label == "dog") - -# image search using a custom vector column -other = ( - table.search(query_image, vector_column_name="vec_from_bytes") - .limit(1) - .to_pydantic(Images)[0] -) -print(actual.label) - -``` - -### Imagebind embeddings -We have support for [imagebind](https://github.com/facebookresearch/ImageBind) model embeddings. You can download our version of the packaged model via - `pip install imagebind-packaged==0.1.2`. - -This function is registered as `imagebind` and supports Audio, Video and Text modalities(extending to Thermal,Depth,IMU data): - -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| `name` | `str` | `"imagebind_huge"` | Name of the model. | -| `device` | `str` | `"cpu"` | The device to run the model on. Can be `"cpu"` or `"gpu"`. | -| `normalize` | `bool` | `False` | set to `True` to normalize your inputs before model ingestion. 
| - -Below is an example demonstrating how the API works: - -```python -import lancedb -from lancedb.pydantic import LanceModel, Vector -from lancedb.embeddings import get_registry - -db = lancedb.connect(tmp_path) -func = get_registry.get("imagebind").create() - -class ImageBindModel(LanceModel): - text: str - image_uri: str = func.SourceField() - audio_path: str - vector: Vector(func.ndims()) = func.VectorField() - -# add locally accessible image paths -text_list=["A dog.", "A car", "A bird"] -image_paths=[".assets/dog_image.jpg", ".assets/car_image.jpg", ".assets/bird_image.jpg"] -audio_paths=[".assets/dog_audio.wav", ".assets/car_audio.wav", ".assets/bird_audio.wav"] - -# Load data -inputs = [ - {"text": a, "audio_path": b, "image_uri": c} - for a, b, c in zip(text_list, audio_paths, image_paths) -] - -#create table and add data -table = db.create_table("img_bind", schema=ImageBindModel) -table.add(inputs) -``` - -Now, we can search using any modality: - -#### image search -```python -query_image = "./assets/dog_image2.jpg" #download an image and enter that path here -actual = table.search(query_image).limit(1).to_pydantic(ImageBindModel)[0] -print(actual.text == "dog") -``` -#### audio search - -```python -query_audio = "./assets/car_audio2.wav" #download an audio clip and enter path here -actual = table.search(query_audio).limit(1).to_pydantic(ImageBindModel)[0] -print(actual.text == "car") -``` -#### Text search -You can add any input query and fetch the result as follows: -```python -query = "an animal which flies and tweets" -actual = table.search(query).limit(1).to_pydantic(ImageBindModel)[0] -print(actual.text == "bird") -``` - -If you have any questions about the embeddings API, supported models, or see a relevant model missing, please raise an issue [on GitHub](https://github.com/lancedb/lancedb/issues). 
- -### Jina Embeddings -Jina embeddings can also be used to embed both text and image data, only some of the models support image data and you can check the list -under [https://jina.ai/embeddings/](https://jina.ai/embeddings/) - -Supported parameters (to be passed in `create` method) are: - -| Parameter | Type | Default Value | Description | -|---|---|---|---| -| `name` | `str` | `"jina-clip-v1"` | The model ID of the jina model to use | - -Usage Example: - -```python - import os - import requests - import lancedb - from lancedb.pydantic import LanceModel, Vector - from lancedb.embeddings import get_registry - import pandas as pd - - os.environ['JINA_API_KEY'] = 'jina_*' - - db = lancedb.connect("~/.lancedb") - func = get_registry().get("jina").create() - - - class Images(LanceModel): - label: str - image_uri: str = func.SourceField() # image uri as the source - image_bytes: bytes = func.SourceField() # image bytes as the source - vector: Vector(func.ndims()) = func.VectorField() # vector column - vec_from_bytes: Vector(func.ndims()) = func.VectorField() # Another vector column - - - table = db.create_table("images", schema=Images) - labels = ["cat", "cat", "dog", "dog", "horse", "horse"] - uris = [ - "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg", - "http://farm1.staticflickr.com/134/332220238_da527d8140_z.jpg", - "http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg", - "http://farm5.staticflickr.com/4092/5017326486_1f46057f5f_z.jpg", - "http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg", - "http://farm6.staticflickr.com/5142/5835678453_4f3a4edb45_z.jpg", - ] - # get each uri as bytes - image_bytes = [requests.get(uri).content for uri in uris] - table.add( - pd.DataFrame({"label": labels, "image_uri": uris, "image_bytes": image_bytes}) - ) -``` +- [OpenClip Embeddings](available_embedding_models/multimodal_embedding_functions/openclip_embedding.md) +- [Imagebind 
Embeddings](available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md) +- [Jina Embeddings](available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md) \ No newline at end of file diff --git a/docs/test/md_testing.py b/docs/test/md_testing.py index 7f2f3a99..1923829d 100755 --- a/docs/test/md_testing.py +++ b/docs/test/md_testing.py @@ -19,6 +19,8 @@ excluded_globs = [ "../src/hybrid_search/hybrid_search.md", "../src/reranking/*.md", "../src/guides/tuning_retrievers/*.md", + "../src/embeddings/available_embedding_models/text_embedding_functions/*.md", + "../src/embeddings/available_embedding_models/multimodal_embedding_functions/*.md" ] python_prefix = "py"