fix: discover nested vector columns by default (#3423)

LanceDB default vector column discovery only considered top-level fields, so tables with a single nested vector leaf still required users to pass an explicit field path. This updates Rust and Python discovery to recurse into struct fields, return canonical field paths, and preserve actionable errors when no default or multiple defaults exist. The explicit nested path flow for index creation and search remains supported across Rust, Python, and Node, with regression coverage for single nested vector leaves, multiple candidate leaves, and schemas without vector leaves. Closes #3405.
2026-05-26 08:20:39 +00:00 · 2026-05-21 19:02:41 +08:00
parent 55ae6197c1
commit d5dc4c0f06
6 changed files with 415 additions and 34 deletions
--- a/python/python/tests/test_table.py
+++ b/python/python/tests/test_table.py
@@ -1934,6 +1934,10 @@ def test_create_index_nested_field_paths(mem_db: DBConnection):
    assert len(vector_results) == 1
    assert vector_results[0]["metadata"]["user_id"] == 0

+    default_vector_results = table.search([0.0, 1.0]).limit(1).to_list()
+    assert len(default_vector_results) == 1
+    assert default_vector_results[0]["metadata"]["user_id"] == 0
+
    filtered_results = table.search().where("metadata.user_id = 42").limit(1).to_list()
    assert len(filtered_results) == 1
    assert filtered_results[0]["metadata"]["user_id"] == 42
@@ -2013,6 +2017,74 @@ def test_search_with_schema_inf_multiple_vector(mem_db: DBConnection):
        table.search(q).limit(1).to_arrow()


+def test_search_infers_single_nested_vector(mem_db: DBConnection):
+    schema = pa.schema(  # the only vector leaf is nested: image.embedding
+        [
+            pa.field("id", pa.int32()),
+            pa.field(
+                "image",
+                pa.struct([pa.field("embedding", pa.list_(pa.float32(), 2))]),
+            ),
+        ]
+    )
+    data = pa.Table.from_pylist(
+        [
+            {"id": 0, "image": {"embedding": [0.0, 1.0]}},
+            {"id": 1, "image": {"embedding": [10.0, 11.0]}},
+        ],
+        schema=schema,
+    )
+    table = mem_db.create_table("nested_vector_default_search", data=data)
+
+    result = table.search([0.0, 1.0]).limit(1).to_list()  # no column passed
+    assert result[0]["id"] == 0  # row 0 stores exactly the query vector
+
+
+def test_search_nested_vector_multiple_candidates(mem_db: DBConnection):
+    schema = pa.schema(  # two nested candidates: image.* and text.*
+        [
+            pa.field(
+                "image",
+                pa.struct([pa.field("embedding", pa.list_(pa.float32(), 2))]),
+            ),
+            pa.field(
+                "text",
+                pa.struct([pa.field("embedding", pa.list_(pa.float32(), 2))]),
+            ),
+        ]
+    )
+    data = pa.Table.from_pylist(
+        [
+            {
+                "image": {"embedding": [0.0, 1.0]},
+                "text": {"embedding": [2.0, 3.0]},
+            }
+        ],
+        schema=schema,
+    )
+    table = mem_db.create_table("nested_vector_multiple_candidates", data=data)
+    # NOTE(review): "." in match is a regex wildcard; escape for exactness.
+    with pytest.raises(ValueError, match="image.embedding.*text.embedding"):
+        table.search([0.0, 1.0]).limit(1).to_arrow()  # error names both paths
+
+
+def test_search_nested_vector_no_candidates(mem_db: DBConnection):
+    schema = pa.schema(  # int32 plus struct-of-string: no list-typed leaf
+        [
+            pa.field("id", pa.int32()),
+            pa.field("metadata", pa.struct([pa.field("label", pa.string())])),
+        ]
+    )
+    data = pa.Table.from_pylist(
+        [{"id": 0, "metadata": {"label": "cat"}}],
+        schema=schema,
+    )
+    table = mem_db.create_table("nested_vector_no_candidates", data=data)
+
+    with pytest.raises(ValueError, match="no vector column"):
+        table.search([0.0, 1.0]).limit(1).to_arrow()  # nothing to infer
+
+
 def test_compact_cleanup(tmp_db: DBConnection):
    pytest.importorskip("lance")
    table = tmp_db.create_table(