fix: table.add(mode='overwrite') infers vector column types (#3184)

Fixes #3183

## Summary

When `table.add(mode='overwrite')` is called, PyArrow infers input data
types (e.g. `list<double>`) which differ from the original table schema
(e.g. `fixed_size_list<float32>`). Previously, overwrite mode bypassed
`cast_to_table_schema()` entirely, so the inferred types replaced the
original schema, breaking vector search.

This fix builds a merged target schema for overwrite: columns present in
the existing table schema keep their original types, while columns
unique to the input pass through as-is. This way
`cast_to_table_schema()` is applied unconditionally, preserving vector
column types without blocking schema evolution.

## Changes

- `rust/lancedb/src/table/add_data.rs`: For overwrite mode, construct a
target schema by matching input columns against the existing table
schema, then cast. Non-overwrite (append) path is unchanged.
- Added `test_add_overwrite_preserves_vector_type` test that creates a
table with `fixed_size_list<float32>`, overwrites with `list<double>`
input, and asserts the original type is preserved.

## Test Plan

- `cargo test --features remote -p lancedb -- test_add_overwrite` — all
4 overwrite tests pass
- Full suite: 454 passed, 2 failed (pre-existing `remote::retry` flakes
unrelated to this change)

---------

Signed-off-by: majiayu000 <1835304752@qq.com>
This commit is contained in:
lif
2026-03-31 01:57:33 +08:00
committed by GitHub
parent 1d1cafb59c
commit 4c44587af0
2 changed files with 37 additions and 1 deletions

View File

@@ -3857,7 +3857,13 @@ class AsyncTable:
# _santitize_data is an old code path, but we will use it until the
# new code path is ready.
if on_bad_vectors != "error" or (
if mode == "overwrite":
# For overwrite, apply the same preprocessing as create_table
# so vector columns are inferred as FixedSizeList.
data, _ = sanitize_create_table(
data, None, on_bad_vectors=on_bad_vectors, fill_value=fill_value
)
elif on_bad_vectors != "error" or (
schema.metadata is not None and b"embedding_functions" in schema.metadata
):
data = _sanitize_data(

View File

@@ -527,6 +527,36 @@ async def test_add_async(mem_db_async: AsyncConnection):
assert await table.count_rows() == 3
def test_add_overwrite_infers_vector_schema(mem_db: DBConnection):
"""Overwrite should infer vector columns the same way create_table does.
Regression test for https://github.com/lancedb/lancedb/issues/3183
"""
table = mem_db.create_table(
"test_overwrite_vec",
data=[
{"vector": [1.0, 2.0, 3.0, 4.0], "item": "foo"},
{"vector": [5.0, 6.0, 7.0, 8.0], "item": "bar"},
],
)
# create_table infers vector as fixed_size_list<float32, 4>
original_type = table.schema.field("vector").type
assert pa.types.is_fixed_size_list(original_type)
# overwrite with plain Python lists (PyArrow infers list<double>)
table.add(
[
{"vector": [10.0, 20.0, 30.0, 40.0], "item": "baz"},
],
mode="overwrite",
)
# overwrite should infer vector column the same way as create_table
new_type = table.schema.field("vector").type
assert pa.types.is_fixed_size_list(new_type), (
f"Expected fixed_size_list after overwrite, got {new_type}"
)
def test_add_progress_callback(mem_db: DBConnection):
table = mem_db.create_table(
"test",