fix: discover nested vector columns by default (#3423)

LanceDB default vector column discovery only considered top-level
fields, so tables with a single nested vector leaf still required users
to pass an explicit field path. This updates Rust and Python discovery
to recurse into struct fields, return canonical field paths, and
preserve actionable errors when no default or multiple defaults exist.

The explicit nested path flow for index creation and search remains
supported across Rust, Python, and Node, with regression coverage for
single nested vector leaves, multiple candidate leaves, and schemas
without vector leaves.

Closes #3405.
This commit is contained in:
Xuanwo
2026-05-21 19:02:41 +08:00
committed by GitHub
parent 55ae6197c1
commit d5dc4c0f06
6 changed files with 415 additions and 34 deletions

View File

@@ -1934,6 +1934,10 @@ def test_create_index_nested_field_paths(mem_db: DBConnection):
assert len(vector_results) == 1
assert vector_results[0]["metadata"]["user_id"] == 0
default_vector_results = table.search([0.0, 1.0]).limit(1).to_list()
assert len(default_vector_results) == 1
assert default_vector_results[0]["metadata"]["user_id"] == 0
filtered_results = table.search().where("metadata.user_id = 42").limit(1).to_list()
assert len(filtered_results) == 1
assert filtered_results[0]["metadata"]["user_id"] == 42
@@ -2013,6 +2017,74 @@ def test_search_with_schema_inf_multiple_vector(mem_db: DBConnection):
table.search(q).limit(1).to_arrow()
def test_search_infers_single_nested_vector(mem_db: DBConnection):
    """Expect default search to pick up a lone nested vector leaf.

    The schema has exactly one vector-shaped leaf, ``image.embedding``, placed
    inside a struct. The query names no vector column, and the row whose
    embedding equals the query (``id == 0``) is expected back.
    """
    schema = pa.schema(
        [
            pa.field("id", pa.int32()),
            pa.field(
                "image",
                pa.struct([pa.field("embedding", pa.list_(pa.float32(), 2))]),
            ),
        ]
    )
    data = pa.Table.from_pylist(
        [
            {"id": 0, "image": {"embedding": [0.0, 1.0]}},
            {"id": 1, "image": {"embedding": [10.0, 11.0]}},
        ],
        schema=schema,
    )
    table = mem_db.create_table("nested_vector_default_search", data=data)

    result = table.search([0.0, 1.0]).limit(1).to_list()

    # Check the row count first so an empty result fails as an assertion
    # rather than as an IndexError on the lookup below.
    assert len(result) == 1
    assert result[0]["id"] == 0
def test_search_nested_vector_multiple_candidates(mem_db: DBConnection):
    """Expect default search to fail when two nested vector leaves exist.

    Both ``image.embedding`` and ``text.embedding`` are vector-shaped, so a
    search that names no vector column must raise ``ValueError`` with a
    message that mentions both paths, in that order.
    """
    schema = pa.schema(
        [
            pa.field(
                "image",
                pa.struct([pa.field("embedding", pa.list_(pa.float32(), 2))]),
            ),
            pa.field(
                "text",
                pa.struct([pa.field("embedding", pa.list_(pa.float32(), 2))]),
            ),
        ]
    )
    data = pa.Table.from_pylist(
        [
            {
                "image": {"embedding": [0.0, 1.0]},
                "text": {"embedding": [2.0, 3.0]},
            }
        ],
        schema=schema,
    )
    table = mem_db.create_table("nested_vector_multiple_candidates", data=data)

    # The dots are escaped so the pattern demands the literal dotted paths;
    # an unescaped "." would accept any single character as the separator.
    with pytest.raises(ValueError, match=r"image\.embedding.*text\.embedding"):
        table.search([0.0, 1.0]).limit(1).to_arrow()
def test_search_nested_vector_no_candidates(mem_db: DBConnection):
    """Expect default search to fail when the schema has no vector leaf.

    The table holds only an int32 ``id`` and a struct with a string ``label``,
    so a search that names no vector column must raise ``ValueError`` whose
    message matches "no vector column".
    """
    metadata_type = pa.struct([pa.field("label", pa.string())])
    fields = [
        pa.field("id", pa.int32()),
        pa.field("metadata", metadata_type),
    ]
    rows = [{"id": 0, "metadata": {"label": "cat"}}]

    tbl = mem_db.create_table(
        "nested_vector_no_candidates",
        data=pa.Table.from_pylist(rows, schema=pa.schema(fields)),
    )

    query = [0.0, 1.0]
    # The whole query chain stays inside the context manager, matching the
    # original scope in which the error is allowed to surface.
    with pytest.raises(ValueError, match="no vector column"):
        tbl.search(query).limit(1).to_arrow()
def test_compact_cleanup(tmp_db: DBConnection):
pytest.importorskip("lance")
table = tmp_db.create_table(