From e6661a72856eea93444e4200f6c3ea94e075bab2 Mon Sep 17 00:00:00 2001 From: Eric B Date: Thu, 2 Jul 2026 13:31:16 -0700 Subject: [PATCH] fix: handle empty/wrong-length vectors returned by embedding functions (#3192) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - When an embedding function returns an empty list (e.g. `[]`) for an input row — as can happen when a model produces no output for a blank string — `_append_vector_columns` crashed with `ArrowInvalid: Length of item not correct: expected N but got array of size 0` because PyArrow cannot fit a zero-length value into a fixed-size list element. - The fix adds a validation step in `gen()`, inside `_append_vector_columns`, that replaces any vector whose length does not match the expected `ndims` (including empty lists and `None`) with `None` before `pa.array()` is called. - `None` is a valid null in a PyArrow fixed-size list array, so the bad entry flows into `_handle_bad_vectors` and is handled according to the caller-supplied `on_bad_vectors` policy (`error` / `drop` / `fill` / `null`) instead of causing an unconditional crash. - `_handle_bad_vector_column` now also counts null entries as wrong-dimension: `pc.list_value_length` returns null for a null list entry, so the length comparison alone would yield null rather than true; the check is therefore combined with `pc.is_null` via `pc.or_kleene`. ## Test plan - [ ] Added `test_embedding_with_empty_output_vectors` in `python/python/tests/test_embeddings.py` that uses an embedding function returning `[]` for empty-string inputs, calls `table.add(..., on_bad_vectors="drop")`, and asserts no crash and that bad rows are correctly dropped. - [ ] Existing `test_embedding_with_bad_results` continues to pass (NaN vectors still handled correctly). - [ ] Verified manually that `pa.array([[1.,2.,3.,4.], []], type=pa.list_(pa.float32(), 4))` raises `ArrowInvalid` without the fix, and succeeds with `None` in place of `[]`. 
Fixes #1672 --------- Co-authored-by: Will Jones Co-authored-by: Claude Sonnet 4.6 --- python/python/lancedb/table.py | 21 ++++++++++++- python/python/tests/test_embeddings.py | 43 ++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 28ee37fbf..f4d70077c 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -651,6 +651,16 @@ def _append_vector_columns( col_data = func.compute_source_embeddings_with_retry( batch[conf.source_column] ) + # Replace vectors with wrong length (including empty lists + # returned for inputs like empty strings) with None so that + # _handle_bad_vectors can process them according to the + # on_bad_vectors policy instead of crashing when PyArrow + # tries to cast them into a fixed-size list array. + expected_ndims = conf.function.ndims() + col_data = [ + v if v is not None and len(v) == expected_ndims else None + for v in col_data + ] if no_vector_column: batch = batch.append_column( schema.field(vector_column), @@ -4020,7 +4030,16 @@ def _handle_bad_vector_column( dim = _infer_vector_dim(vec_arr) if dim is None: return data - has_wrong_dim = pc.not_equal(pc.list_value_length(vec_arr), dim) + + is_null = pc.is_null(vec_arr) + # pc.list_value_length returns null for null list entries, so + # pc.not_equal(null, dim) also returns null. Use or_kleene so that + # True OR null = True (Kleene three-valued logic), ensuring null vectors + # are counted as wrong-dim. 
+ has_wrong_dim = pc.or_kleene( + is_null, + pc.not_equal(pc.list_value_length(vec_arr), dim), + ) has_bad_vectors = pc.any(has_nan).as_py() or pc.any(has_wrong_dim).as_py() diff --git a/python/python/tests/test_embeddings.py b/python/python/tests/test_embeddings.py index 43f7b44ac..c0cdf1c0d 100644 --- a/python/python/tests/test_embeddings.py +++ b/python/python/tests/test_embeddings.py @@ -242,6 +242,49 @@ def test_embedding_with_bad_results(tmp_path): assert tbl["vector"].null_count == 1 +def test_embedding_with_empty_output_vectors(tmp_path): + """Regression test for issue #1672. + + When an embedding function returns an empty list (e.g. for empty-string + inputs), _append_vector_columns used to crash because PyArrow cannot cast + [] into a fixed-size list element. The fix replaces wrong-length vectors + with None before building the Arrow array so that _handle_bad_vectors can + process them normally. + """ + + @register("empty-vec-embedding") + class EmptyVecEmbeddingFunction(TextEmbeddingFunction): + def ndims(self): + return 128 + + def generate_embeddings(self, texts: Union[List[str], np.ndarray]) -> list: + # Simulate a model that returns an empty list for blank inputs + return [ + [] if text.strip() == "" else np.random.randn(self.ndims()).tolist() + for text in texts + ] + + db = lancedb.connect(tmp_path) + registry = EmbeddingFunctionRegistry.get_instance() + model = registry.get("empty-vec-embedding").create() + + class Schema(LanceModel): + text: str = model.SourceField() + vector: Vector(model.ndims()) = model.VectorField() + + table = db.create_table("test_empty_vec", schema=Schema, mode="overwrite") + + # Should not crash; the row with the empty string should be dropped + table.add( + [{"text": "hello world"}, {"text": ""}, {"text": "foo"}], + on_bad_vectors="drop", + ) + + assert len(table) == 2 + texts = table.to_arrow()["text"].to_pylist() + assert "" not in texts + + def test_with_existing_vectors(tmp_path): @register("mock-embedding") class 
MockEmbeddingFunction(TextEmbeddingFunction):