fix: support nested field paths in native index creation (#3408)

Native index creation was resolving requested columns through top-level
Arrow schema lookup before handing the request to Lance, which rejected
nested paths and could collapse a nested field to its leaf name. This PR
resolves index targets with Lance field-path semantics, passes the
canonical path through to Lance, and reports indexed columns from field
ids as canonical full paths.

This also removes the Python native FTS guard that rejected dotted paths
so scalar, vector, and FTS index creation share the same nested-field
contract. Related to #3402.
This commit is contained in:
Xuanwo
2026-05-20 11:15:15 +08:00
committed by GitHub
parent 049b0c8f09
commit 5bfde47a8e
4 changed files with 308 additions and 18 deletions

View File

@@ -2542,11 +2542,6 @@ class LanceTable(Table):
"at a time. To search over multiple text fields, create a "
"separate FTS index for each field."
)
if "." in field_names:
raise ValueError(
"Native FTS indexes can only be created on top-level fields. "
f"Received nested field path: {field_names!r}."
)
if tokenizer_name is None:
tokenizer_configs = {

View File

@@ -563,8 +563,19 @@ def test_create_index_multiple_columns(tmp_path, table):
def test_nested_schema(tmp_path, table):
    """Create a native FTS index on a nested field and query through it.

    The index is requested with the dotted path ``nested.text``. The test
    checks that the index is listed under that same full path and that a
    full-text search restricted to the nested column returns matching rows.
    """
    # Dotted paths are accepted by native FTS index creation, so this call is
    # expected to succeed rather than raise a "top-level fields" ValueError.
    table.create_fts_index("nested.text")

    # Exactly one index exists, and it reports the full nested path rather
    # than only the leaf name ("text").
    indices = table.list_indices()
    assert len(indices) == 1
    assert indices[0].index_type == "FTS"
    assert indices[0].columns == ["nested.text"]

    # Searching through the nested column must return at least one hit, and
    # every hit must contain the query term in the nested text value.
    results = (
        table.search("puppy", query_type="fts", fts_columns="nested.text")
        .limit(5)
        .to_list()
    )
    assert len(results) > 0
    assert all("puppy" in row["nested"]["text"] for row in results)
def test_search_index_with_filter(table):

View File

@@ -1890,6 +1890,55 @@ def test_create_scalar_index(mem_db: DBConnection):
assert scalar_index.name == "custom_y_index"
def test_create_index_nested_field_paths(mem_db: DBConnection):
    """Scalar and vector indices can be created on dotted nested paths.

    Builds a table with two struct columns, indexes one child of each
    (``metadata.user_id`` as a scalar index, ``image.embedding`` as a vector
    index), and verifies that:

    * both indices are listed with their full dotted column paths,
    * a vector search on the nested embedding column returns the nearest row,
    * a filter on the nested scalar column returns the matching row.
    """
    # Schema: an int32 nested under "metadata" and a fixed-size (2-element)
    # float32 list nested under "image".
    metadata_field = pa.field(
        "metadata", pa.struct([pa.field("user_id", pa.int32())])
    )
    image_field = pa.field(
        "image",
        pa.struct([pa.field("embedding", pa.list_(pa.float32(), 2))]),
    )
    nested_schema = pa.schema([metadata_field, image_field])

    # 256 rows; row i carries user_id == i and embedding [i, i + 1].
    # NOTE(review): the row count is presumably chosen to satisfy index
    # training requirements -- confirm before lowering it.
    rows = []
    for i in range(256):
        rows.append(
            {
                "metadata": {"user_id": i},
                "image": {"embedding": [float(i), float(i + 1)]},
            }
        )
    data = pa.Table.from_pylist(rows, schema=nested_schema)
    table = mem_db.create_table("nested_index_paths", data=data)

    # Create one index of each kind, addressing the struct children by path.
    table.create_scalar_index("metadata.user_id", name="metadata_user_id_idx")
    table.create_index(
        vector_column_name="image.embedding",
        num_partitions=1,
        num_sub_vectors=1,
        name="image_embedding_idx",
    )

    # Sort by name so the comparison does not depend on listing order.
    indices = sorted(table.list_indices(), key=lambda idx: idx.name)
    observed = [(idx.name, idx.index_type, idx.columns) for idx in indices]
    expected = [
        ("image_embedding_idx", "IvfPq", ["image.embedding"]),
        ("metadata_user_id_idx", "BTree", ["metadata.user_id"]),
    ]
    assert observed == expected

    # The query vector equals row 0's embedding, so row 0 is the top hit.
    vector_query = table.search(
        [0.0, 1.0], vector_column_name="image.embedding"
    )
    vector_results = vector_query.limit(1).to_list()
    assert len(vector_results) == 1
    assert vector_results[0]["metadata"]["user_id"] == 0

    # Filtering on the nested scalar column must find the single match.
    filtered_query = table.search().where("metadata.user_id = 42")
    filtered_results = filtered_query.limit(1).to_list()
    assert len(filtered_results) == 1
    assert filtered_results[0]["metadata"]["user_id"] == 42
def test_empty_query(mem_db: DBConnection):
table = mem_db.create_table(
"my_table",