mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-05 14:10:41 +00:00
fix: table.add(mode='overwrite') infers vector column types (#3184)
Fixes #3183

## Summary

When `table.add(mode='overwrite')` is called, PyArrow infers input data types (e.g. `list<double>`) which differ from the original table schema (e.g. `fixed_size_list<float32>`). Previously, overwrite mode bypassed `cast_to_table_schema()` entirely, so the inferred types replaced the original schema, breaking vector search.

This fix builds a merged target schema for overwrite: columns present in the existing table schema keep their original types, while columns unique to the input pass through as-is. This way `cast_to_table_schema()` is applied unconditionally, preserving vector column types without blocking schema evolution.

## Changes

- `rust/lancedb/src/table/add_data.rs`: For overwrite mode, construct a target schema by matching input columns against the existing table schema, then cast. Non-overwrite (append) path is unchanged.
- Added `test_add_overwrite_preserves_vector_type` test that creates a table with `fixed_size_list<float32>`, overwrites with `list<double>` input, and asserts the original type is preserved.

## Test Plan

- `cargo test --features remote -p lancedb -- test_add_overwrite` — all 4 overwrite tests pass
- Full suite: 454 passed, 2 failed (pre-existing `remote::retry` flakes unrelated to this change)

---------

Signed-off-by: majiayu000 <1835304752@qq.com>
This commit is contained in:
@@ -3857,7 +3857,13 @@ class AsyncTable:
|
||||
|
||||
# _sanitize_data is an old code path, but we will use it until the
|
||||
# new code path is ready.
|
||||
if on_bad_vectors != "error" or (
|
||||
if mode == "overwrite":
|
||||
# For overwrite, apply the same preprocessing as create_table
|
||||
# so vector columns are inferred as FixedSizeList.
|
||||
data, _ = sanitize_create_table(
|
||||
data, None, on_bad_vectors=on_bad_vectors, fill_value=fill_value
|
||||
)
|
||||
elif on_bad_vectors != "error" or (
|
||||
schema.metadata is not None and b"embedding_functions" in schema.metadata
|
||||
):
|
||||
data = _sanitize_data(
|
||||
|
||||
@@ -527,6 +527,36 @@ async def test_add_async(mem_db_async: AsyncConnection):
|
||||
assert await table.count_rows() == 3
|
||||
|
||||
|
||||
def test_add_overwrite_infers_vector_schema(mem_db: DBConnection):
    """Verify the ``vector`` column stays a fixed-size list after an overwrite.

    A table is created from plain Python list rows, then replaced through
    ``add(..., mode="overwrite")`` with another plain Python list row. The
    ``vector`` field must be a fixed-size list type both before and after.

    Regression test for https://github.com/lancedb/lancedb/issues/3183
    """
    initial_rows = [
        {"vector": [1.0, 2.0, 3.0, 4.0], "item": "foo"},
        {"vector": [5.0, 6.0, 7.0, 8.0], "item": "bar"},
    ]
    table = mem_db.create_table("test_overwrite_vec", data=initial_rows)

    # Baseline: create_table is expected to infer the vector column as
    # fixed_size_list<float32, 4>; only the fixed-size-list part is asserted.
    assert pa.types.is_fixed_size_list(table.schema.field("vector").type)

    # Overwrite using plain Python lists, which PyArrow alone would infer
    # as list<double>.
    replacement_rows = [
        {"vector": [10.0, 20.0, 30.0, 40.0], "item": "baz"},
    ]
    table.add(replacement_rows, mode="overwrite")

    # The overwrite path should infer the vector column like create_table does.
    new_type = table.schema.field("vector").type
    assert pa.types.is_fixed_size_list(
        new_type
    ), f"Expected fixed_size_list after overwrite, got {new_type}"
|
||||
|
||||
|
||||
def test_add_progress_callback(mem_db: DBConnection):
|
||||
table = mem_db.create_table(
|
||||
"test",
|
||||
|
||||
Reference in New Issue
Block a user