From 2adb10e6a899799488ffa4538f5e3fcc78add637 Mon Sep 17 00:00:00 2001
From: fzowl <160063452+fzowl@users.noreply.github.com>
Date: Sat, 3 Jan 2026 00:14:52 +0100
Subject: [PATCH] feat: voyage-multimodal-3.5 (#2887)

voyage-multimodal-3.5 support (text, image and video embeddings)
---
 .../voyageai_multimodal_embedding.md          | 111 +++++++++++++++
 python/python/lancedb/embeddings/voyageai.py  |  83 ++++++++++--
 python/python/tests/test_embeddings_slow.py   | 127 ++++++++++++++++++
 3 files changed, 308 insertions(+), 13 deletions(-)
 create mode 100644 docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/voyageai_multimodal_embedding.md

diff --git a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/voyageai_multimodal_embedding.md b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/voyageai_multimodal_embedding.md
new file mode 100644
index 00000000..14141a8a
--- /dev/null
+++ b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/voyageai_multimodal_embedding.md
@@ -0,0 +1,111 @@
+# VoyageAI Embeddings : Multimodal
+
+VoyageAI embeddings can also be used to embed both text and image data. Only some of the models support image data; you can check the list
+under [https://docs.voyageai.com/docs/multimodal-embeddings](https://docs.voyageai.com/docs/multimodal-embeddings)
+
+Supported multimodal models:
+
+- `voyage-multimodal-3` - 1024 dimensions (text + images)
+- `voyage-multimodal-3.5` - Flexible dimensions (256, 512, 1024 default, 2048). Supports text, images, and video.
+
+### Video Support (voyage-multimodal-3.5)
+
+The `voyage-multimodal-3.5` model supports video input through:
+- Video URLs (`.mp4`, `.webm`, `.mov`, `.avi`, `.mkv`, `.m4v`, `.gif`)
+- Video file paths, passed as `pathlib.Path` objects (a plain string that is not a URL is embedded as text)
+
+Constraints: Max 20MB video size.
+
+Supported parameters (to be passed to the `create` method) are:
+
+| Parameter | Type | Default Value           | Description                               |
+|---|---|-------------------------|-------------------------------------------|
+| `name` | `str` | `"voyage-multimodal-3"` | The model ID of the VoyageAI model to use |
+| `output_dimension` | `int` | `None` | Output dimension for voyage-multimodal-3.5. Valid: 256, 512, 1024, 2048 |
+
+Usage Example:
+
+```python
+import base64
+import os
+from io import BytesIO
+
+import requests
+import lancedb
+from lancedb.pydantic import LanceModel, Vector
+from lancedb.embeddings import get_registry
+import pandas as pd
+
+os.environ['VOYAGE_API_KEY'] = 'YOUR_VOYAGE_API_KEY'
+
+db = lancedb.connect(".lancedb")
+func = get_registry().get("voyageai").create(name="voyage-multimodal-3")
+
+
+def image_to_base64(image_bytes: bytes):
+    """Return the given raw image bytes as a base64-encoded UTF-8 string."""
+    # b64encode accepts bytes directly, so no intermediate buffer is needed.
+    return base64.b64encode(image_bytes).decode("utf-8")
+
+
+class Images(LanceModel):
+    label: str
+    image_uri: str = func.SourceField()  # image uri as the source
+    image_bytes: str = func.SourceField()  # image bytes base64 encoded as the source
+    vector: Vector(func.ndims()) = func.VectorField()  # vector column
+    vec_from_bytes: Vector(func.ndims()) = func.VectorField()  # Another vector column
+
+
+if "images" in db.table_names():
+    db.drop_table("images")
+table = db.create_table("images", schema=Images)
+labels = ["cat", "cat", "dog", "dog", "horse", "horse"]
+uris = [
+    "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg",
+    "http://farm1.staticflickr.com/134/332220238_da527d8140_z.jpg",
+    "http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg",
+    "http://farm5.staticflickr.com/4092/5017326486_1f46057f5f_z.jpg",
+    "http://farm9.staticflickr.com/8216/8434969557_d37882c42d_z.jpg",
+    "http://farm6.staticflickr.com/5142/5835678453_4f3a4edb45_z.jpg",
+]
+# get each uri as bytes
+images_bytes = [image_to_base64(requests.get(uri).content) for uri in uris]
+table.add(
+    pd.DataFrame({"label": labels, "image_uri": uris, "image_bytes": images_bytes})
+)
+```
+Now we can search using text from both the default vector column and the custom vector column:
+```python
+
+# text search (default vector column first, then the custom one)
+actual = table.search("man's best friend", vector_column_name="vector").limit(1).to_pydantic(Images)[0]
+print(actual.label)  # prints "dog"
+
+frombytes = (
+    table.search("man's best friend", vector_column_name="vec_from_bytes")
+    .limit(1)
+    .to_pydantic(Images)[0]
+)
+print(frombytes.label)
+
+```
+
+Because we're using a multi-modal embedding function, we can also search using images
+
+```python
+from PIL import Image
+
+# image search using the default vector column
+query_image_uri = "http://farm1.staticflickr.com/200/467715466_ed4a31801f_z.jpg"
+query_image = Image.open(BytesIO(requests.get(query_image_uri).content))
+actual = table.search(query_image, vector_column_name="vector").limit(1).to_pydantic(Images)[0]
+print(actual.label == "dog")
+
+# image search using a custom vector column
+other = (
+    table.search(query_image, vector_column_name="vec_from_bytes")
+    .limit(1)
+    .to_pydantic(Images)[0]
+)
+print(other.label)
+```
diff --git a/python/python/lancedb/embeddings/voyageai.py b/python/python/lancedb/embeddings/voyageai.py
index fef71174..a34756fc 100644
--- a/python/python/lancedb/embeddings/voyageai.py
+++ b/python/python/lancedb/embeddings/voyageai.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright The LanceDB Authors
 import base64
 import os
-from typing import ClassVar, TYPE_CHECKING, List, Union, Any, Generator
+from typing import ClassVar, TYPE_CHECKING, List, Union, Any, Generator, Optional
 
 from pathlib import Path
 from urllib.parse import urlparse
@@ -45,11 +45,29 @@ def is_valid_url(text):
         return False
 
 
+# Lowercase file suffixes treated as video by the URL/path checks below.
+VIDEO_EXTENSIONS = {".mp4", ".webm", ".mov", ".avi", ".mkv", ".m4v", ".gif"}
+
+
+def is_video_url(url: str) -> bool:
+    """Return True when the URL's path ends with a known video extension."""
+    # Only the path component is checked; query strings/fragments are ignored.
+    return urlparse(url).path.lower().endswith(tuple(VIDEO_EXTENSIONS))
+
+
+def is_video_path(path: Path) -> bool:
+    """Return True if the path's suffix, lowercased, is in VIDEO_EXTENSIONS."""
+    return path.suffix.lower() in VIDEO_EXTENSIONS
+
+
 def transform_input(input_data: Union[str, bytes, Path]):
     PIL = attempt_import_or_raise("PIL", "pillow")
     if isinstance(input_data, str):
         if is_valid_url(input_data):
-            content = {"type": "image_url", "image_url": input_data}
+            if is_video_url(input_data):
+                content = {"type": "video_url", "video_url": input_data}
+            else:
+                content = {"type": "image_url", "image_url": input_data}
         else:
             content = {"type": "text", "text": input_data}
     elif isinstance(input_data, PIL.Image.Image):
@@ -70,14 +88,24 @@ def transform_input(input_data: Union[str, bytes, Path]):
             "image_base64": "data:image/jpeg;base64," + img_str,
         }
     elif isinstance(input_data, Path):
-        img = PIL.Image.open(input_data)
-        buffered = BytesIO()
-        img.save(buffered, format="JPEG")
-        img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
-        content = {
-            "type": "image_base64",
-            "image_base64": "data:image/jpeg;base64," + img_str,
-        }
+        if is_video_path(input_data):
+            # Read video file and encode as base64
+            with open(input_data, "rb") as f:
+                video_bytes = f.read()
+            video_str = base64.b64encode(video_bytes).decode("utf-8")
+            content = {
+                "type": "video_base64",
+                "video_base64": video_str,
+            }
+        else:
+            img = PIL.Image.open(input_data)
+            buffered = BytesIO()
+            img.save(buffered, format="JPEG")
+            img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+            content = {
+                "type": "image_base64",
+                "image_base64": "data:image/jpeg;base64," + img_str,
+            }
     else:
         raise ValueError("Each input should be either str, bytes, Path or Image.")
 
@@ -91,6 +119,8 @@ def sanitize_multimodal_input(inputs: Union[TEXT, IMAGES]) -> List[Any]:
     PIL = attempt_import_or_raise("PIL", "pillow")
     if isinstance(inputs, (str, bytes, Path, PIL.Image.Image)):
         inputs = [inputs]
+    elif isinstance(inputs, list):
+        pass  # Already a list, use as-is
     elif isinstance(inputs, pa.Array):
         inputs = inputs.to_pylist()
     elif isinstance(inputs, pa.ChunkedArray):
@@ -143,11 +173,16 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
             * voyage-3
             * voyage-3-lite
             * voyage-multimodal-3
+            * voyage-multimodal-3.5
             * voyage-finance-2
             * voyage-multilingual-2
             * voyage-law-2
             * voyage-code-2
 
+    output_dimension: int, optional
+        The output dimension for models that support flexible dimensions.
+        Currently only voyage-multimodal-3.5 supports this feature.
+        Valid options: 256, 512, 1024 (default), 2048.
 
     Examples
     --------
@@ -175,7 +210,10 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
     """
 
     name: str
+    output_dimension: Optional[int] = None
     client: ClassVar = None
+    _FLEXIBLE_DIM_MODELS: ClassVar[list] = ["voyage-multimodal-3.5"]
+    _VALID_DIMENSIONS: ClassVar[list] = [256, 512, 1024, 2048]
     text_embedding_models: list = [
         "voyage-3.5",
         "voyage-3.5-lite",
@@ -186,7 +224,7 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
         "voyage-law-2",
         "voyage-code-2",
     ]
-    multimodal_embedding_models: list = ["voyage-multimodal-3"]
+    multimodal_embedding_models: list = ["voyage-multimodal-3", "voyage-multimodal-3.5"]
     contextual_embedding_models: list = ["voyage-context-3"]
 
     def _is_multimodal_model(self, model_name: str):
@@ -198,6 +236,17 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
         return model_name in self.contextual_embedding_models or "context" in model_name
 
     def ndims(self):
+        # Handle flexible dimension models
+        if self.name in self._FLEXIBLE_DIM_MODELS:
+            if self.output_dimension is not None:
+                if self.output_dimension not in self._VALID_DIMENSIONS:
+                    raise ValueError(
+                        f"Invalid output_dimension {self.output_dimension} "
+                        f"for {self.name}. Valid options: {self._VALID_DIMENSIONS}"
+                    )
+                return self.output_dimension
+            return 1024  # default dimension
+
         if self.name == "voyage-3-lite":
             return 512
         elif self.name == "voyage-code-2":
@@ -211,12 +260,17 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
             "voyage-finance-2",
             "voyage-multilingual-2",
             "voyage-law-2",
-            "voyage-multimodal-3",
         ]:
             return 1024
         else:
             raise ValueError(f"Model {self.name} not supported")
 
+    def _get_multimodal_kwargs(self, **kwargs):
+        """Return embed-call kwargs, adding output_dimension when it applies."""
+        if self.output_dimension is None or self.name not in self._FLEXIBLE_DIM_MODELS:
+            return kwargs
+        return {**kwargs, "output_dimension": self.output_dimension}
+
     def compute_query_embeddings(
         self, query: Union[str, "PIL.Image.Image"], *args, **kwargs
     ) -> List[np.ndarray]:
@@ -234,6 +288,7 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
         """
         client = VoyageAIEmbeddingFunction._get_client()
         if self._is_multimodal_model(self.name):
+            kwargs = self._get_multimodal_kwargs(**kwargs)
             result = client.multimodal_embed(
                 inputs=[[query]], model=self.name, input_type="query", **kwargs
             )
@@ -275,6 +330,7 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
             )
             if has_images:
                 # Use non-batched API for images
+                kwargs = self._get_multimodal_kwargs(**kwargs)
                 result = client.multimodal_embed(
                     inputs=sanitized, model=self.name, input_type="document", **kwargs
                 )
@@ -357,6 +413,7 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
             callable: A function that takes a batch of texts and returns embeddings.
         """
         if self._is_multimodal_model(self.name):
+            multimodal_kwargs = self._get_multimodal_kwargs(**kwargs)
 
             def embed_batch(batch: List[str]) -> List[np.array]:
                 batch_inputs = sanitize_multimodal_input(batch)
@@ -364,7 +421,7 @@ class VoyageAIEmbeddingFunction(EmbeddingFunction):
                     inputs=batch_inputs,
                     model=self.name,
                     input_type=input_type,
-                    **kwargs,
+                    **multimodal_kwargs,
                 )
                 return result.embeddings
 
diff --git a/python/python/tests/test_embeddings_slow.py b/python/python/tests/test_embeddings_slow.py
index 966ac1c6..e58d2daa 100644
--- a/python/python/tests/test_embeddings_slow.py
+++ b/python/python/tests/test_embeddings_slow.py
@@ -613,6 +613,133 @@ def test_voyageai_multimodal_embedding_text_function():
     assert len(tbl.to_pandas()["vector"][0]) == voyageai.ndims()
 
 
+@pytest.mark.slow
+@pytest.mark.skipif(
+    os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set"
+)
+def test_voyageai_multimodal_35_embedding_function():
+    """voyage-multimodal-3.5 embeds plain text at its default 1024 dimensions."""
+    factory = get_registry().get("voyageai")
+    embedder = factory.create(name="voyage-multimodal-3.5", max_retries=0)
+
+    class TextModel(LanceModel):
+        text: str = embedder.SourceField()
+        vector: Vector(embedder.ndims()) = embedder.VectorField()
+
+    rows = pd.DataFrame({"text": ["hello world", "goodbye world"]})
+    store = lancedb.connect("~/lancedb")
+    table = store.create_table("test_multimodal_35", schema=TextModel, mode="overwrite")
+
+    table.add(rows)
+    # The stored vector width must match ndims(); with no output_dimension
+    # configured, ndims() falls back to 1024 for this model.
+    first_vector = table.to_pandas()["vector"][0]
+    assert len(first_vector) == embedder.ndims()
+    assert embedder.ndims() == 1024
+
+
+@pytest.mark.slow
+@pytest.mark.skipif(
+    os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set"
+)
+def test_voyageai_multimodal_35_flexible_dimensions():
+    """voyage-multimodal-3.5 honours a non-default output_dimension of 512."""
+    factory = get_registry().get("voyageai")
+    embedder = factory.create(
+        name="voyage-multimodal-3.5", output_dimension=512, max_retries=0
+    )
+
+    class TextModel(LanceModel):
+        text: str = embedder.SourceField()
+        vector: Vector(embedder.ndims()) = embedder.VectorField()
+
+    # ndims() must report the configured size rather than the 1024 default.
+    assert embedder.ndims() == 512
+
+    rows = pd.DataFrame({"text": ["hello world", "goodbye world"]})
+    table = lancedb.connect("~/lancedb").create_table(
+        "test_multimodal_35_dim", schema=TextModel, mode="overwrite"
+    )
+    table.add(rows)
+    assert len(table.to_pandas()["vector"][0]) == 512
+
+
+@pytest.mark.slow
+@pytest.mark.skipif(
+    os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set"
+)
+def test_voyageai_multimodal_35_image_embedding():
+    """voyage-multimodal-3.5 embeds image URLs at its default 1024 dimensions."""
+    factory = get_registry().get("voyageai")
+    embedder = factory.create(name="voyage-multimodal-3.5", max_retries=0)
+
+    class Images(LanceModel):
+        label: str
+        image_uri: str = embedder.SourceField()
+        vector: Vector(embedder.ndims()) = embedder.VectorField()
+
+    # One (label, URL) pair per row; the URL column is the embedding source.
+    samples = [
+        ("cat", "http://farm1.staticflickr.com/53/167798175_7c7845bbbd_z.jpg"),
+        ("dog", "http://farm9.staticflickr.com/8387/8602747737_2e5c2a45d4_z.jpg"),
+    ]
+    rows = pd.DataFrame(samples, columns=["label", "image_uri"])
+
+    store = lancedb.connect("~/lancedb")
+    table = store.create_table(
+        "test_multimodal_35_images", schema=Images, mode="overwrite"
+    )
+    table.add(rows)
+    first_vector = table.to_pandas()["vector"][0]
+    assert len(first_vector) == embedder.ndims()
+    assert embedder.ndims() == 1024
+
+
+@pytest.mark.slow
+@pytest.mark.skipif(
+    os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set"
+)
+@pytest.mark.parametrize("dimension", [256, 512, 1024, 2048])
+def test_voyageai_multimodal_35_all_dimensions(dimension):
+    """Each supported output_dimension is reported by ndims() and stored as-is."""
+    factory = get_registry().get("voyageai")
+    embedder = factory.create(
+        name="voyage-multimodal-3.5", output_dimension=dimension, max_retries=0
+    )
+
+    # Checked before the table is created or any rows are embedded.
+    assert embedder.ndims() == dimension
+
+    class TextModel(LanceModel):
+        text: str = embedder.SourceField()
+        vector: Vector(embedder.ndims()) = embedder.VectorField()
+
+    table_name = f"test_multimodal_35_dim_{dimension}"
+    rows = pd.DataFrame({"text": ["hello world"]})
+    store = lancedb.connect("~/lancedb")
+    table = store.create_table(table_name, schema=TextModel, mode="overwrite")
+
+    table.add(rows)
+    first_vector = table.to_pandas()["vector"][0]
+    assert len(first_vector) == dimension
+
+
+@pytest.mark.slow
+@pytest.mark.skipif(
+    os.environ.get("VOYAGE_API_KEY") is None, reason="VOYAGE_API_KEY not set"
+)
+def test_voyageai_multimodal_35_invalid_dimension():
+    """An unsupported output_dimension for voyage-multimodal-3.5 raises ValueError."""
+    # Construction stays inside the raises block, so an error from either step counts.
+    with pytest.raises(ValueError, match="Invalid output_dimension"):
+        factory = get_registry().get("voyageai")
+        embedder = factory.create(
+            name="voyage-multimodal-3.5", output_dimension=999, max_retries=0
+        )
+        # ndims() is where the validation happens
+        embedder.ndims()
+
+
 @pytest.mark.slow
 @pytest.mark.skipif(
     importlib.util.find_spec("colpali_engine") is None,