mirror of
https://github.com/lancedb/lancedb.git
synced 2026-07-03 11:00:40 +00:00
fix: handle empty/wrong-length vectors returned by embedding functions (#3192)
## Summary

- When an embedding function returns an empty list (e.g. `[]`) for an input row — as can happen when a model produces no output for a blank string — `_append_vector_columns` crashed with `ArrowInvalid: Length of item not correct: expected N but got array of size 0` because PyArrow cannot fit a zero-length value into a fixed-size list element.
- The fix adds a validation step in `gen()`, inside `_append_vector_columns`, that replaces any vector whose length does not match the expected `ndims` (including empty lists and `None`) with `None` before `pa.array()` is called.
- `None` is a valid null in a PyArrow fixed-size list array, so the bad entry flows into `_handle_bad_vectors` and is handled according to the caller-supplied `on_bad_vectors` policy (`error` / `drop` / `fill` / `null`) instead of causing an unconditional crash.

## Test plan

- [ ] Added `test_embedding_with_empty_output_vectors` in `python/python/tests/test_embeddings.py` that uses an embedding function returning `[]` for empty-string inputs, calls `table.add(..., on_bad_vectors="drop")`, and asserts no crash and that bad rows are correctly dropped.
- [ ] Existing `test_embedding_with_bad_results` continues to pass (NaN vectors still handled correctly).
- [ ] Verified manually that `pa.array([[1.,2.,3.,4.], []], type=pa.list_(pa.float32(), 4))` raises `ArrowInvalid` without the fix, and succeeds with `None` in place of `[]`.

Fixes #1672

---------

Co-authored-by: Will Jones <willjones127@gmail.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -651,6 +651,16 @@ def _append_vector_columns(
|
||||
col_data = func.compute_source_embeddings_with_retry(
|
||||
batch[conf.source_column]
|
||||
)
|
||||
# Replace vectors with wrong length (including empty lists
|
||||
# returned for inputs like empty strings) with None so that
|
||||
# _handle_bad_vectors can process them according to the
|
||||
# on_bad_vectors policy instead of crashing when PyArrow
|
||||
# tries to cast them into a fixed-size list array.
|
||||
expected_ndims = conf.function.ndims()
|
||||
col_data = [
|
||||
v if v is not None and len(v) == expected_ndims else None
|
||||
for v in col_data
|
||||
]
|
||||
if no_vector_column:
|
||||
batch = batch.append_column(
|
||||
schema.field(vector_column),
|
||||
@@ -4020,7 +4030,16 @@ def _handle_bad_vector_column(
|
||||
dim = _infer_vector_dim(vec_arr)
|
||||
if dim is None:
|
||||
return data
|
||||
has_wrong_dim = pc.not_equal(pc.list_value_length(vec_arr), dim)
|
||||
|
||||
is_null = pc.is_null(vec_arr)
|
||||
# pc.list_value_length returns null for null list entries, so
|
||||
# pc.not_equal(null, dim) also returns null. Use or_kleene so that
|
||||
# True OR null = True (Kleene three-valued logic), ensuring null vectors
|
||||
# are counted as wrong-dim.
|
||||
has_wrong_dim = pc.or_kleene(
|
||||
is_null,
|
||||
pc.not_equal(pc.list_value_length(vec_arr), dim),
|
||||
)
|
||||
|
||||
has_bad_vectors = pc.any(has_nan).as_py() or pc.any(has_wrong_dim).as_py()
|
||||
|
||||
|
||||
@@ -242,6 +242,49 @@ def test_embedding_with_bad_results(tmp_path):
|
||||
assert tbl["vector"].null_count == 1
|
||||
|
||||
|
||||
def test_embedding_with_empty_output_vectors(tmp_path):
    """Check that empty embedding outputs are dropped instead of crashing.

    Regression test for issue #1672. An embedding function may hand back an
    empty list for some rows (here: blank strings). PyArrow cannot cast []
    into a fixed-size list element, so _append_vector_columns used to fail.
    Wrong-length vectors are now swapped for None before the Arrow array is
    built, which lets _handle_bad_vectors apply the requested policy.
    """

    @register("empty-vec-embedding")
    class EmptyVecEmbeddingFunction(TextEmbeddingFunction):
        def ndims(self):
            return 128

        def generate_embeddings(self, texts: Union[List[str], np.ndarray]) -> list:
            # Mimic a model that produces no output for blank inputs and a
            # random full-length vector for everything else.
            embeddings = []
            for text in texts:
                if text.strip() == "":
                    embeddings.append([])
                else:
                    embeddings.append(np.random.randn(self.ndims()).tolist())
            return embeddings

    conn = lancedb.connect(tmp_path)
    fn_registry = EmbeddingFunctionRegistry.get_instance()
    embedder = fn_registry.get("empty-vec-embedding").create()

    class Schema(LanceModel):
        text: str = embedder.SourceField()
        vector: Vector(embedder.ndims()) = embedder.VectorField()

    tbl = conn.create_table("test_empty_vec", schema=Schema, mode="overwrite")

    # Three rows go in; the blank one yields an empty vector and, under the
    # "drop" policy, must be discarded rather than raising.
    rows = [{"text": "hello world"}, {"text": ""}, {"text": "foo"}]
    tbl.add(rows, on_bad_vectors="drop")

    assert len(tbl) == 2
    stored_texts = tbl.to_arrow()["text"].to_pylist()
    assert "" not in stored_texts
|
||||
|
||||
|
||||
def test_with_existing_vectors(tmp_path):
|
||||
@register("mock-embedding")
|
||||
class MockEmbeddingFunction(TextEmbeddingFunction):
|
||||
|
||||
Reference in New Issue
Block a user