diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py
index 28ee37fbf..f4d70077c 100644
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -651,6 +651,16 @@ def _append_vector_columns(
                     col_data = func.compute_source_embeddings_with_retry(
                         batch[conf.source_column]
                     )
+                    # Replace vectors with wrong length (including empty lists
+                    # returned for inputs like empty strings) with None so that
+                    # _handle_bad_vectors can process them according to the
+                    # on_bad_vectors policy instead of crashing when PyArrow
+                    # tries to cast them into a fixed-size list array.
+                    expected_ndims = conf.function.ndims()
+                    col_data = [
+                        v if v is not None and len(v) == expected_ndims else None
+                        for v in col_data
+                    ]
                     if no_vector_column:
                         batch = batch.append_column(
                             schema.field(vector_column),
@@ -4020,7 +4030,16 @@ def _handle_bad_vector_column(
     dim = _infer_vector_dim(vec_arr)
     if dim is None:
         return data
-    has_wrong_dim = pc.not_equal(pc.list_value_length(vec_arr), dim)
+
+    is_null = pc.is_null(vec_arr)
+    # pc.list_value_length returns null for null list entries, so
+    # pc.not_equal(null, dim) also returns null. Use or_kleene so that
+    # True OR null = True (Kleene three-valued logic), ensuring null vectors
+    # are counted as wrong-dim.
+    has_wrong_dim = pc.or_kleene(
+        is_null,
+        pc.not_equal(pc.list_value_length(vec_arr), dim),
+    )
 
     has_bad_vectors = pc.any(has_nan).as_py() or pc.any(has_wrong_dim).as_py()
 
diff --git a/python/python/tests/test_embeddings.py b/python/python/tests/test_embeddings.py
index 43f7b44ac..c0cdf1c0d 100644
--- a/python/python/tests/test_embeddings.py
+++ b/python/python/tests/test_embeddings.py
@@ -242,6 +242,49 @@ def test_embedding_with_bad_results(tmp_path):
     assert tbl["vector"].null_count == 1
 
 
+def test_embedding_with_empty_output_vectors(tmp_path):
+    """Regression test for issue #1672.
+
+    When an embedding function returns an empty list (e.g. for empty-string
+    inputs), _append_vector_columns used to crash because PyArrow cannot cast
+    [] into a fixed-size list element. The fix replaces wrong-length vectors
+    with None before building the Arrow array so that _handle_bad_vectors can
+    process them normally.
+    """
+
+    @register("empty-vec-embedding")
+    class EmptyVecEmbeddingFunction(TextEmbeddingFunction):
+        def ndims(self):
+            return 128
+
+        def generate_embeddings(self, texts: Union[List[str], np.ndarray]) -> list:
+            # Simulate a model that returns an empty list for blank inputs
+            return [
+                [] if text.strip() == "" else np.random.randn(self.ndims()).tolist()
+                for text in texts
+            ]
+
+    db = lancedb.connect(tmp_path)
+    registry = EmbeddingFunctionRegistry.get_instance()
+    model = registry.get("empty-vec-embedding").create()
+
+    class Schema(LanceModel):
+        text: str = model.SourceField()
+        vector: Vector(model.ndims()) = model.VectorField()
+
+    table = db.create_table("test_empty_vec", schema=Schema, mode="overwrite")
+
+    # Should not crash; the row with the empty string should be dropped
+    table.add(
+        [{"text": "hello world"}, {"text": ""}, {"text": "foo"}],
+        on_bad_vectors="drop",
+    )
+
+    assert len(table) == 2
+    texts = table.to_arrow()["text"].to_pylist()
+    assert "" not in texts
+
+
 def test_with_existing_vectors(tmp_path):
     @register("mock-embedding")
     class MockEmbeddingFunction(TextEmbeddingFunction):