fix: handle empty/wrong-length vectors returned by embedding functions (#3192)

## Summary

- When an embedding function returns an empty list (e.g. `[]`) for an input row — as can happen when a model produces no output for a blank string — `_append_vector_columns` crashed with `ArrowInvalid: Length of item not correct: expected N but got array of size 0` because PyArrow cannot fit a zero-length value into a fixed-size list element.
- The fix adds a validation step in `gen()`, inside `_append_vector_columns`, that replaces any vector whose length does not match the expected `ndims` (including empty lists and `None`) with `None` before `pa.array()` is called.
- `None` is a valid null in a PyArrow fixed-size list array, so the bad entry flows into `_handle_bad_vectors` and is handled according to the caller-supplied `on_bad_vectors` policy (`error` / `drop` / `fill` / `null`) instead of causing an unconditional crash.
- `_handle_bad_vector_column` now also counts null vectors as wrong-dimension entries: `pc.list_value_length` returns null for a null list entry, so the length comparison alone would yield null rather than true. The check is therefore combined with `pc.is_null` via `pc.or_kleene`.

## Test plan

- [ ] Added `test_embedding_with_empty_output_vectors` in `python/python/tests/test_embeddings.py` that uses an embedding function returning `[]` for empty-string inputs, calls `table.add(..., on_bad_vectors="drop")`, and asserts that there is no crash and that bad rows are correctly dropped.
- [ ] Existing `test_embedding_with_bad_results` continues to pass (NaN vectors are still handled correctly).
- [ ] Verified manually that `pa.array([[1.,2.,3.,4.], []], type=pa.list_(pa.float32(), 4))` raises `ArrowInvalid` without the fix, and succeeds with `None` in place of `[]`.

Fixes #1672

---------

Co-authored-by: Will Jones <willjones127@gmail.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-07-03 11:00:40 +00:00 · 2026-07-02 13:31:16 -07:00
parent 37466a0390
commit e6661a7285
2 changed files with 63 additions and 1 deletion
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -651,6 +651,16 @@ def _append_vector_columns(
                    col_data = func.compute_source_embeddings_with_retry(
                        batch[conf.source_column]
                    )
+                    # Replace vectors with wrong length (including empty lists
+                    # returned for inputs like empty strings) with None so that
+                    # _handle_bad_vectors can process them according to the
+                    # on_bad_vectors policy instead of crashing when PyArrow
+                    # tries to cast them into a fixed-size list array.
+                    expected_ndims = conf.function.ndims()
+                    col_data = [
+                        v if v is not None and len(v) == expected_ndims else None
+                        for v in col_data
+                    ]
                    if no_vector_column:
                        batch = batch.append_column(
                            schema.field(vector_column),
@@ -4020,7 +4030,16 @@ def _handle_bad_vector_column(
        dim = _infer_vector_dim(vec_arr)
        if dim is None:
            return data
-    has_wrong_dim = pc.not_equal(pc.list_value_length(vec_arr), dim)
+
+    is_null = pc.is_null(vec_arr)
+    # pc.list_value_length returns null for null list entries, so
+    # pc.not_equal(null, dim) also returns null. Use or_kleene so that
+    # True OR null = True (Kleene three-valued logic), ensuring null vectors
+    # are counted as wrong-dim.
+    has_wrong_dim = pc.or_kleene(
+        is_null,
+        pc.not_equal(pc.list_value_length(vec_arr), dim),
+    )

    has_bad_vectors = pc.any(has_nan).as_py() or pc.any(has_wrong_dim).as_py()

--- a/python/python/tests/test_embeddings.py
+++ b/python/python/tests/test_embeddings.py
@@ -242,6 +242,49 @@ def test_embedding_with_bad_results(tmp_path):
    assert tbl["vector"].null_count == 1


+def test_embedding_with_empty_output_vectors(tmp_path):
+    """Regression test for issue #1672.
+
+    When an embedding function returns an empty list (e.g. for empty-string
+    inputs), _append_vector_columns used to crash because PyArrow cannot cast
+    [] into a fixed-size list element.  The fix replaces wrong-length vectors
+    with None before building the Arrow array so that _handle_bad_vectors can
+    process them normally.
+    """
+
+    @register("empty-vec-embedding")
+    class EmptyVecEmbeddingFunction(TextEmbeddingFunction):
+        def ndims(self):
+            return 128
+
+        def generate_embeddings(self, texts: Union[List[str], np.ndarray]) -> list:
+            # Simulate a model that returns an empty list for blank inputs
+            return [
+                [] if text.strip() == "" else np.random.randn(self.ndims()).tolist()
+                for text in texts
+            ]
+
+    db = lancedb.connect(tmp_path)
+    registry = EmbeddingFunctionRegistry.get_instance()
+    model = registry.get("empty-vec-embedding").create()
+
+    class Schema(LanceModel):
+        text: str = model.SourceField()
+        vector: Vector(model.ndims()) = model.VectorField()
+
+    table = db.create_table("test_empty_vec", schema=Schema, mode="overwrite")
+
+    # Should not crash; the row with the empty string should be dropped
+    table.add(
+        [{"text": "hello world"}, {"text": ""}, {"text": "foo"}],
+        on_bad_vectors="drop",
+    )
+
+    assert len(table) == 2
+    texts = table.to_arrow()["text"].to_pylist()
+    assert "" not in texts
+
+
 def test_with_existing_vectors(tmp_path):
    @register("mock-embedding")
    class MockEmbeddingFunction(TextEmbeddingFunction):