fix: handle empty/wrong-length vectors returned by embedding functions (#3192)

## Summary

- When an embedding function returns an empty list (e.g. `[]`) for an
input row — as can happen when a model produces no output for a blank
string — `_append_vector_columns` crashed with `ArrowInvalid: Length of
item not correct: expected N but got array of size 0` because PyArrow
cannot fit a zero-length value into a fixed-size list element.
- The fix adds a validation step in `gen()`, inside
`_append_vector_columns`, that normalizes every returned vector: any entry
that is `None` or whose length does not match the expected `ndims` (for
example, an empty list) becomes `None` before `pa.array()` is called.
- `None` is a valid null in a PyArrow fixed-size list array, so the bad
entry flows into `_handle_bad_vectors` and is handled according to the
caller-supplied `on_bad_vectors` policy (`error` / `drop` / `fill` /
`null`) instead of causing an unconditional crash.

## Test plan

- [ ] Added `test_embedding_with_empty_output_vectors` in
`python/python/tests/test_embeddings.py` that uses an embedding function
returning `[]` for empty-string inputs, calls `table.add(...,
on_bad_vectors="drop")`, and asserts no crash and that bad rows are
correctly dropped.
- [ ] Existing `test_embedding_with_bad_results` continues to pass (NaN
vectors still handled correctly).
- [ ] Verified manually that `pa.array([[1.,2.,3.,4.], []],
type=pa.list_(pa.float32(), 4))` raises `ArrowInvalid` without the fix,
and succeeds with `None` in place of `[]`.

Fixes #1672

---------

Co-authored-by: Will Jones <willjones127@gmail.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Eric B
2026-07-02 13:31:16 -07:00
committed by GitHub
parent 37466a0390
commit e6661a7285
2 changed files with 63 additions and 1 deletions

View File

@@ -651,6 +651,16 @@ def _append_vector_columns(
col_data = func.compute_source_embeddings_with_retry(
batch[conf.source_column]
)
# Replace vectors with wrong length (including empty lists
# returned for inputs like empty strings) with None so that
# _handle_bad_vectors can process them according to the
# on_bad_vectors policy instead of crashing when PyArrow
# tries to cast them into a fixed-size list array.
expected_ndims = conf.function.ndims()
col_data = [
v if v is not None and len(v) == expected_ndims else None
for v in col_data
]
if no_vector_column:
batch = batch.append_column(
schema.field(vector_column),
@@ -4020,7 +4030,16 @@ def _handle_bad_vector_column(
dim = _infer_vector_dim(vec_arr)
if dim is None:
return data
has_wrong_dim = pc.not_equal(pc.list_value_length(vec_arr), dim)
is_null = pc.is_null(vec_arr)
# pc.list_value_length returns null for null list entries, so
# pc.not_equal(null, dim) also returns null. Use or_kleene so that
# True OR null = True (Kleene three-valued logic), ensuring null vectors
# are counted as wrong-dim.
has_wrong_dim = pc.or_kleene(
is_null,
pc.not_equal(pc.list_value_length(vec_arr), dim),
)
has_bad_vectors = pc.any(has_nan).as_py() or pc.any(has_wrong_dim).as_py()

View File

@@ -242,6 +242,49 @@ def test_embedding_with_bad_results(tmp_path):
assert tbl["vector"].null_count == 1
def test_embedding_with_empty_output_vectors(tmp_path):
    """Regression test for issue #1672: empty embedding outputs must not crash.

    An embedding function may hand back ``[]`` for some inputs (here: blank
    strings). Such a value cannot be stored in a fixed-size list element, so
    wrong-length vectors are expected to be turned into ``None`` before the
    Arrow array is built, letting the ``on_bad_vectors`` policy decide what
    happens to the row. With ``on_bad_vectors="drop"`` the blank-text row
    must simply disappear from the table.
    """

    @register("empty-vec-embedding")
    class EmptyVecEmbeddingFunction(TextEmbeddingFunction):
        def ndims(self):
            return 128

        def generate_embeddings(self, texts: Union[List[str], np.ndarray]) -> list:
            # Mimic a model that yields no output at all for blank inputs
            # and a random full-width vector for everything else.
            vectors = []
            for text in texts:
                if text.strip() == "":
                    vectors.append([])
                else:
                    vectors.append(np.random.randn(self.ndims()).tolist())
            return vectors

    embedder = (
        EmbeddingFunctionRegistry.get_instance().get("empty-vec-embedding").create()
    )

    class Schema(LanceModel):
        text: str = embedder.SourceField()
        vector: Vector(embedder.ndims()) = embedder.VectorField()

    connection = lancedb.connect(tmp_path)
    tbl = connection.create_table("test_empty_vec", schema=Schema, mode="overwrite")

    # The middle row has blank text, so its embedding comes back empty.
    rows = [{"text": "hello world"}, {"text": ""}, {"text": "foo"}]

    # Must not raise; the blank-text row is expected to be dropped.
    tbl.add(rows, on_bad_vectors="drop")

    assert len(tbl) == 2
    stored_texts = tbl.to_arrow()["text"].to_pylist()
    assert "" not in stored_texts
def test_with_existing_vectors(tmp_path):
@register("mock-embedding")
class MockEmbeddingFunction(TextEmbeddingFunction):