From 2adb10e6a899799488ffa4538f5e3fcc78add637 Mon Sep 17 00:00:00 2001 From: fzowl <160063452+fzowl@users.noreply.github.com> Date: Sat, 3 Jan 2026 00:14:52 +0100 Subject: [PATCH] feat: voyage-multimodal-3.5 (#2887) voyage-multimodal-3.5 support (text, image and video embeddings) --- .../voyageai_multimodal_embedding.md | 111 +++++++++++++++ python/python/lancedb/embeddings/voyageai.py | 83 ++++++++++-- python/python/tests/test_embeddings_slow.py | 127 ++++++++++++++++++ 3 files changed, 308 insertions(+), 13 deletions(-) create mode 100644 docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/voyageai_multimodal_embedding.md diff --git a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/voyageai_multimodal_embedding.md b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/voyageai_multimodal_embedding.md new file mode 100644 index 00000000..14141a8a --- /dev/null +++ b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/voyageai_multimodal_embedding.md @@ -0,0 +1,111 @@ +# VoyageAI Embeddings : Multimodal + +VoyageAI embeddings can also be used to embed both text and image data, only some of the models support image data and you can check the list +under [https://docs.voyageai.com/docs/multimodal-embeddings](https://docs.voyageai.com/docs/multimodal-embeddings) + +Supported multimodal models: + +- `voyage-multimodal-3` - 1024 dimensions (text + images) +- `voyage-multimodal-3.5` - Flexible dimensions (256, 512, 1024 default, 2048). Supports text, images, and video. + +### Video Support (voyage-multimodal-3.5) + +The `voyage-multimodal-3.5` model supports video input through: +- Video URLs (`.mp4`, `.webm`, `.mov`, `.avi`, `.mkv`, `.m4v`, `.gif`) +- Video file paths + +Constraints: Max 20MB video size. 
+ +Supported parameters (to be passed in `create` method) are: + +| Parameter | Type | Default Value | Description | +|---|---|-------------------------|-------------------------------------------| +| `name` | `str` | `"voyage-multimodal-3"` | The model ID of the VoyageAI model to use | +| `output_dimension` | `int` | `None` | Output dimension for voyage-multimodal-3.5. Valid: 256, 512, 1024, 2048 | + +Usage Example: + +```python +import base64 +import os +from io import BytesIO + +import requests +import lancedb +from lancedb.pydantic import LanceModel, Vector +from lancedb.embeddings import get_registry +import pandas as pd + +os.environ['VOYAGE_API_KEY'] = 'YOUR_VOYAGE_API_KEY' + +db = lancedb.connect(".lancedb") +func = get_registry().get("voyageai").create(name="voyage-multimodal-3") + + +def image_to_base64(image_bytes: bytes): + buffered = BytesIO(image_bytes) + img_str = base64.b64encode(buffered.getvalue()) + return img_str.decode("utf-8") + + +class Images(LanceModel): + label: str + image_uri: str = func.SourceField() # image uri as the source + image_bytes: str = func.SourceField() # image bytes base64 encoded as the source + vector: Vector(func.ndims()) = func.VectorField() # vector column + vec_from_bytes: Vector(func.ndims()) = func.VectorField() # Another vector column + + +if "images" in db.table_names(): + db.drop_table("images") +table = db.create_table("images", schema=Images) +labels = ["cat", "cat", "dog", "dog", "horse", "horse"] +uris = [ + "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg", + "http://farm1.staticflickr.com/134/332220238_da527d8140_z.jpg", + "http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg", + "http://farm5.staticflickr.com/4092/5017326486_1f46057f5f_z.jpg", + "http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg", + "http://farm6.staticflickr.com/5142/5835678453_4f3a4edb45_z.jpg", +] +# get each uri as bytes +images_bytes = [image_to_base64(requests.get(uri).content) for uri in uris] 
+table.add( + pd.DataFrame({"label": labels, "image_uri": uris, "image_bytes": images_bytes}) +) +``` +Now we can search using text from both the default vector column and the custom vector column +```python + +# text search +actual = table.search("man's best friend", "vec_from_bytes").limit(1).to_pydantic(Images)[0] +print(actual.label) # prints "dog" + +frombytes = ( + table.search("man's best friend", vector_column_name="vec_from_bytes") + .limit(1) + .to_pydantic(Images)[0] +) +print(frombytes.label) + +``` + +Because we're using a multi-modal embedding function, we can also search using images + +```python +# image search (requires Pillow: `from PIL import Image`) +query_image_uri = "http://farm1.staticflickr.com/200/467715466_ed4a31801f_z.jpg" +image_bytes = requests.get(query_image_uri).content +query_image = Image.open(BytesIO(image_bytes)) +actual = table.search(query_image, "vec_from_bytes").limit(1).to_pydantic(Images)[0] +print(actual.label == "dog") + +# image search using a custom vector column +other = ( + table.search(query_image, vector_column_name="vec_from_bytes") + .limit(1) + .to_pydantic(Images)[0] +) +print(other.label) + +``` diff --git a/python/python/lancedb/embeddings/voyageai.py b/python/python/lancedb/embeddings/voyageai.py index fef71174..a34756fc 100644 --- a/python/python/lancedb/embeddings/voyageai.py +++ b/python/python/lancedb/embeddings/voyageai.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright The LanceDB Authors import base64 import os -from typing import ClassVar, TYPE_CHECKING, List, Union, Any, Generator +from typing import ClassVar, TYPE_CHECKING, List, Union, Any, Generator, Optional from pathlib import Path from urllib.parse import urlparse @@ -45,11 +45,29 @@ def is_valid_url(text): return False +VIDEO_EXTENSIONS = {".mp4", ".webm", ".mov", ".avi", ".mkv", ".m4v", ".gif"} + + +def is_video_url(url: str) -> bool: + """Check if URL points to a video file based on extension.""" + parsed = urlparse(url) + path = parsed.path.lower() + return 
any(path.endswith(ext) for ext in VIDEO_EXTENSIONS) + + +def is_video_path(path: Path) -> bool: + """Check if file path is a video file based on extension.""" + return path.suffix.lower() in VIDEO_EXTENSIONS + + def transform_input(input_data: Union[str, bytes, Path]): PIL = attempt_import_or_raise("PIL", "pillow") if isinstance(input_data, str): if is_valid_url(input_data): - content = {"type": "image_url", "image_url": input_data} + if is_video_url(input_data): + content = {"type": "video_url", "video_url": input_data} + else: + content = {"type": "image_url", "image_url": input_data} else: content = {"type": "text", "text": input_data} elif isinstance(input_data, PIL.Image.Image): @@ -70,14 +88,24 @@ def transform_input(input_data: Union[str, bytes, Path]): "image_base64": "data:image/jpeg;base64," + img_str, } elif isinstance(input_data, Path): - img = PIL.Image.open(input_data) - buffered = BytesIO() - img.save(buffered, format="JPEG") - img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") - content = { - "type": "image_base64", - "image_base64": "data:image/jpeg;base64," + img_str, - } + if is_video_path(input_data): + # Read video file and encode as base64 + with open(input_data, "rb") as f: + video_bytes = f.read() + video_str = base64.b64encode(video_bytes).decode("utf-8") + content = { + "type": "video_base64", + "video_base64": video_str, + } + else: + img = PIL.Image.open(input_data) + buffered = BytesIO() + img.save(buffered, format="JPEG") + img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") + content = { + "type": "image_base64", + "image_base64": "data:image/jpeg;base64," + img_str, + } else: raise ValueError("Each input should be either str, bytes, Path or Image.") @@ -91,6 +119,8 @@ def sanitize_multimodal_input(inputs: Union[TEXT, IMAGES]) -> List[Any]: PIL = attempt_import_or_raise("PIL", "pillow") if isinstance(inputs, (str, bytes, Path, PIL.Image.Image)): inputs = [inputs] + elif isinstance(inputs, list): + pass # 
Already a list, use as-is elif isinstance(inputs, pa.Array): inputs = inputs.to_pylist() elif isinstance(inputs, pa.ChunkedArray): @@ -143,11 +173,16 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction): * voyage-3 * voyage-3-lite * voyage-multimodal-3 + * voyage-multimodal-3.5 * voyage-finance-2 * voyage-multilingual-2 * voyage-law-2 * voyage-code-2 + output_dimension: int, optional + The output dimension for models that support flexible dimensions. + Currently only voyage-multimodal-3.5 supports this feature. + Valid options: 256, 512, 1024 (default), 2048. Examples -------- @@ -175,7 +210,10 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction): """ name: str + output_dimension: Optional[int] = None client: ClassVar = None + _FLEXIBLE_DIM_MODELS: ClassVar[list] = ["voyage-multimodal-3.5"] + _VALID_DIMENSIONS: ClassVar[list] = [256, 512, 1024, 2048] text_embedding_models: list = [ "voyage-3.5", "voyage-3.5-lite", @@ -186,7 +224,7 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction): "voyage-law-2", "voyage-code-2", ] - multimodal_embedding_models: list = ["voyage-multimodal-3"] + multimodal_embedding_models: list = ["voyage-multimodal-3", "voyage-multimodal-3.5"] contextual_embedding_models: list = ["voyage-context-3"] def _is_multimodal_model(self, model_name: str): @@ -198,6 +236,17 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction): return model_name in self.contextual_embedding_models or "context" in model_name def ndims(self): + # Handle flexible dimension models + if self.name in self._FLEXIBLE_DIM_MODELS: + if self.output_dimension is not None: + if self.output_dimension not in self._VALID_DIMENSIONS: + raise ValueError( + f"Invalid output_dimension {self.output_dimension} " + f"for {self.name}. 
Valid options: {self._VALID_DIMENSIONS}" + ) + return self.output_dimension + return 1024 # default dimension + if self.name == "voyage-3-lite": return 512 elif self.name == "voyage-code-2": @@ -211,12 +260,17 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction): "voyage-finance-2", "voyage-multilingual-2", "voyage-law-2", - "voyage-multimodal-3", ]: return 1024 else: raise ValueError(f"Model {self.name} not supported") + def _get_multimodal_kwargs(self, **kwargs): + """Get kwargs for multimodal embed call, including output_dimension if set.""" + if self.name in self._FLEXIBLE_DIM_MODELS and self.output_dimension is not None: + kwargs["output_dimension"] = self.output_dimension + return kwargs + def compute_query_embeddings( self, query: Union[str, "PIL.Image.Image"], *args, **kwargs ) -> List[np.ndarray]: @@ -234,6 +288,7 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction): """ client = VoyageAIEmbeddingFunction._get_client() if self._is_multimodal_model(self.name): + kwargs = self._get_multimodal_kwargs(**kwargs) result = client.multimodal_embed( inputs=[[query]], model=self.name, input_type="query", **kwargs ) @@ -275,6 +330,7 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction): ) if has_images: # Use non-batched API for images + kwargs = self._get_multimodal_kwargs(**kwargs) result = client.multimodal_embed( inputs=sanitized, model=self.name, input_type="document", **kwargs ) @@ -357,6 +413,7 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction): callable: A function that takes a batch of texts and returns embeddings. 
""" if self._is_multimodal_model(self.name): + multimodal_kwargs = self._get_multimodal_kwargs(**kwargs) def embed_batch(batch: List[str]) -> List[np.array]: batch_inputs = sanitize_multimodal_input(batch) @@ -364,7 +421,7 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction): inputs=batch_inputs, model=self.name, input_type=input_type, - **kwargs, + **multimodal_kwargs, ) return result.embeddings diff --git a/python/python/tests/test_embeddings_slow.py b/python/python/tests/test_embeddings_slow.py index 966ac1c6..e58d2daa 100644 --- a/python/python/tests/test_embeddings_slow.py +++ b/python/python/tests/test_embeddings_slow.py @@ -613,6 +613,133 @@ def test_voyageai_multimodal_embedding_text_function(): assert len(tbl.to_pandas()["vector"][0]) == voyageai.ndims() +@pytest.mark.slow +@pytest.mark.skipif( + os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set" +) +def test_voyageai_multimodal_35_embedding_function(): + """Test voyage-multimodal-3.5 model with text input.""" + voyageai = ( + get_registry() + .get("voyageai") + .create(name="voyage-multimodal-3.5", max_retries=0) + ) + + class TextModel(LanceModel): + text: str = voyageai.SourceField() + vector: Vector(voyageai.ndims()) = voyageai.VectorField() + + df = pd.DataFrame({"text": ["hello world", "goodbye world"]}) + db = lancedb.connect("~/lancedb") + tbl = db.create_table("test_multimodal_35", schema=TextModel, mode="overwrite") + + tbl.add(df) + assert len(tbl.to_pandas()["vector"][0]) == voyageai.ndims() + assert voyageai.ndims() == 1024 + + +@pytest.mark.slow +@pytest.mark.skipif( + os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set" +) +def test_voyageai_multimodal_35_flexible_dimensions(): + """Test voyage-multimodal-3.5 model with custom output dimension.""" + voyageai = ( + get_registry() + .get("voyageai") + .create(name="voyage-multimodal-3.5", output_dimension=512, max_retries=0) + ) + + class TextModel(LanceModel): + text: str = voyageai.SourceField() + 
vector: Vector(voyageai.ndims()) = voyageai.VectorField() + + assert voyageai.ndims() == 512 + + df = pd.DataFrame({"text": ["hello world", "goodbye world"]}) + db = lancedb.connect("~/lancedb") + tbl = db.create_table("test_multimodal_35_dim", schema=TextModel, mode="overwrite") + + tbl.add(df) + assert len(tbl.to_pandas()["vector"][0]) == 512 + + +@pytest.mark.slow +@pytest.mark.skipif( + os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set" +) +def test_voyageai_multimodal_35_image_embedding(): + """Test voyage-multimodal-3.5 model with image input.""" + voyageai = ( + get_registry() + .get("voyageai") + .create(name="voyage-multimodal-3.5", max_retries=0) + ) + + class Images(LanceModel): + label: str + image_uri: str = voyageai.SourceField() + vector: Vector(voyageai.ndims()) = voyageai.VectorField() + + db = lancedb.connect("~/lancedb") + table = db.create_table( + "test_multimodal_35_images", schema=Images, mode="overwrite" + ) + labels = ["cat", "dog"] + uris = [ + "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg", + "http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg", + ] + table.add(pd.DataFrame({"label": labels, "image_uri": uris})) + assert len(table.to_pandas()["vector"][0]) == voyageai.ndims() + assert voyageai.ndims() == 1024 + + +@pytest.mark.slow +@pytest.mark.skipif( + os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set" +) +@pytest.mark.parametrize("dimension", [256, 512, 1024, 2048]) +def test_voyageai_multimodal_35_all_dimensions(dimension): + """Test voyage-multimodal-3.5 model with all valid output dimensions.""" + voyageai = ( + get_registry() + .get("voyageai") + .create(name="voyage-multimodal-3.5", output_dimension=dimension, max_retries=0) + ) + + assert voyageai.ndims() == dimension + + class TextModel(LanceModel): + text: str = voyageai.SourceField() + vector: Vector(voyageai.ndims()) = voyageai.VectorField() + + df = pd.DataFrame({"text": ["hello world"]}) + db = 
lancedb.connect("~/lancedb") + tbl = db.create_table( + f"test_multimodal_35_dim_{dimension}", schema=TextModel, mode="overwrite" + ) + + tbl.add(df) + assert len(tbl.to_pandas()["vector"][0]) == dimension + + +@pytest.mark.slow +@pytest.mark.skipif( + os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set" +) +def test_voyageai_multimodal_35_invalid_dimension(): + """Test voyage-multimodal-3.5 model raises error for invalid output dimension.""" + with pytest.raises(ValueError, match="Invalid output_dimension"): + voyageai = ( + get_registry() + .get("voyageai") + .create(name="voyage-multimodal-3.5", output_dimension=999, max_retries=0) + ) + # ndims() is where the validation happens + voyageai.ndims() + + @pytest.mark.slow @pytest.mark.skipif( importlib.util.find_spec("colpali_engine") is None,