fix: support LargeList label list indexes (#3529)

## Summary

This PR extends nested-field regression coverage across Rust
local/remote, Python sync/async, and Node so canonical escaped paths
stay consistent across scalar, vector, and FTS index lifecycle behavior.

It also aligns LanceDB's LabelList type gate with Lance by accepting
`LargeList<primitive>` columns while keeping `List<Struct<...>>`
unsupported until Lance defines stable membership semantics for struct
labels.

Part of #3406.
This commit is contained in:
Xuanwo
2026-06-10 23:53:56 +08:00
committed by GitHub
parent 9c12fb6437
commit 566b67a634
6 changed files with 638 additions and 20 deletions

View File

@@ -113,8 +113,14 @@ async def test_create_nested_scalar_index_lists_canonical_paths(db_async):
pa.field("user.id", pa.int32()),
]
)
mixed_case_metadata_type = pa.struct([pa.field("userId", pa.int32())])
escaped_metadata_type = pa.struct([pa.field("user-id", pa.int32())])
literal_type = pa.struct([pa.field("a.b", pa.int32())])
data = pa.Table.from_arrays(
[
pa.array([1, 2, 3], type=pa.int32()),
pa.array([1, 2, 3], type=pa.int32()),
pa.array([1, 2, 3], type=pa.int32()),
pa.array([1, 2, 3], type=pa.int32()),
pa.array(
[
@@ -124,25 +130,67 @@ async def test_create_nested_scalar_index_lists_canonical_paths(db_async):
],
type=metadata_type,
),
pa.array(
[{"userId": 10}, {"userId": 20}, {"userId": 30}],
type=mixed_case_metadata_type,
),
pa.array(
[{"user-id": 10}, {"user-id": 20}, {"user-id": 30}],
type=escaped_metadata_type,
),
pa.array(
[{"a.b": 10}, {"a.b": 20}, {"a.b": 30}],
type=literal_type,
),
],
names=[
"rowId",
"row-id",
"userId",
"user_id",
"metadata",
"MetaData",
"meta-data",
"literal",
],
names=["user_id", "metadata"],
)
table = await db_async.create_table("nested_scalar_index", data)
await table.create_index("user_id", config=BTree(), name="top_user_id_idx")
await table.create_index("rowId", config=BTree(), name="row_id_idx")
await table.create_index("`row-id`", config=BTree(), name="row_dash_id_idx")
await table.create_index("userId", config=BTree(), name="top_user_id_idx")
await table.create_index("user_id", config=BTree(), name="top_snake_user_id_idx")
await table.create_index(
"metadata.user_id", config=BTree(), name="nested_user_id_idx"
)
await table.create_index(
"metadata.`user.id`", config=BTree(), name="escaped_user_id_idx"
)
await table.create_index(
"MetaData.userId", config=BTree(), name="mixed_case_metadata_user_id_idx"
)
await table.create_index(
"`meta-data`.`user-id`", config=BTree(), name="escaped_names_idx"
)
await table.create_index("literal.`a.b`", config=BTree(), name="literal_dot_idx")
columns_by_name = {
index.name: index.columns for index in await table.list_indices()
}
assert columns_by_name["top_user_id_idx"] == ["user_id"]
assert columns_by_name["row_id_idx"] == ["rowId"]
assert columns_by_name["row_dash_id_idx"] == ["`row-id`"]
assert columns_by_name["top_user_id_idx"] == ["userId"]
assert columns_by_name["top_snake_user_id_idx"] == ["user_id"]
assert columns_by_name["nested_user_id_idx"] == ["metadata.user_id"]
assert columns_by_name["escaped_user_id_idx"] == ["metadata.`user.id`"]
assert columns_by_name["mixed_case_metadata_user_id_idx"] == ["MetaData.userId"]
assert columns_by_name["escaped_names_idx"] == ["`meta-data`.`user-id`"]
assert columns_by_name["literal_dot_idx"] == ["literal.`a.b`"]
for index_name in columns_by_name:
stats = await table.index_stats(index_name)
assert stats is not None
assert stats.num_indexed_rows == 3
@pytest.mark.asyncio
@@ -189,6 +237,51 @@ async def test_create_label_list_index(some_table: AsyncTable):
await some_table.create_index("tags", config=LabelList())
indices = await some_table.list_indices()
assert str(indices) == '[Index(LabelList, columns=["tags"], name="tags_idx")]'
plan = await some_table.query().where("array_has(tags, 'tag0')").explain_plan()
assert "ScalarIndexQuery" in plan
@pytest.mark.asyncio
async def test_create_large_list_label_list_index(db_async):
    """A LabelList index can be built on a ``LargeList<string>`` column.

    After the index is created, it must be listed under the default name
    ``tags_idx`` and the plan for an ``array_has`` filter on the column must
    contain a ``ScalarIndexQuery`` node.
    """
    # Sixteen rows; each row carries an alternating tag plus a common one.
    tag_rows = [[f"tag{i % 2}", "shared"] for i in range(16)]
    large_list_schema = pa.schema(
        [pa.field("tags", pa.large_list(pa.string()))]
    )
    data = pa.Table.from_pydict({"tags": tag_rows}, schema=large_list_schema)

    table = await db_async.create_table("large_list_label_list_index", data)
    await table.create_index("tags", config=LabelList())

    listed = await table.list_indices()
    expected_listing = '[Index(LabelList, columns=["tags"], name="tags_idx")]'
    assert str(listed) == expected_listing

    filtered_query = table.query().where("array_has(tags, 'shared')")
    plan = await filtered_query.explain_plan()
    assert "ScalarIndexQuery" in plan
@pytest.mark.asyncio
async def test_create_label_list_index_rejects_list_struct(db_async):
    """Creating a LabelList index on a ``List<Struct<...>>`` column must fail.

    The raised error is expected to match
    ``"LabelList index cannot be created"``.
    """
    # Struct element type: a string tag plus a nested metadata struct.
    nested_metadata_type = pa.struct([pa.field("userId", pa.string())])
    item_type = pa.struct(
        [
            pa.field("tag", pa.string()),
            pa.field("metadata", nested_metadata_type),
        ]
    )
    list_struct_schema = pa.schema([pa.field("items", pa.list_(item_type))])

    # A single row is enough to materialise the column.
    single_row = {
        "items": [
            {"tag": "tag0", "metadata": {"userId": "user0"}},
            {"tag": "shared", "metadata": {"userId": "user1"}},
        ]
    }
    data = pa.Table.from_pylist([single_row], schema=list_struct_schema)

    table = await db_async.create_table("list_struct_label_list_index", data)

    with pytest.raises(Exception, match="LabelList index cannot be created"):
        await table.create_index("items", config=LabelList())
@pytest.mark.asyncio

View File

@@ -2399,18 +2399,32 @@ def test_create_scalar_index(mem_db: DBConnection):
def test_create_index_nested_field_paths(mem_db: DBConnection):
schema = pa.schema(
[
pa.field("rowId", pa.int32()),
pa.field("row-id", pa.int32()),
pa.field("userId", pa.int32()),
pa.field("metadata", pa.struct([pa.field("user_id", pa.int32())])),
pa.field("MetaData", pa.struct([pa.field("userId", pa.int32())])),
pa.field(
"image",
pa.struct([pa.field("embedding", pa.list_(pa.float32(), 2))]),
),
pa.field("payload", pa.struct([pa.field("text", pa.string())])),
pa.field("meta-data", pa.struct([pa.field("user-id", pa.int32())])),
pa.field("literal", pa.struct([pa.field("a.b", pa.int32())])),
]
)
data = pa.Table.from_pylist(
[
{
"rowId": i,
"row-id": i,
"userId": i,
"metadata": {"user_id": i},
"MetaData": {"userId": i},
"image": {"embedding": [float(i), float(i + 1)]},
"payload": {"text": f"document {i}"},
"meta-data": {"user-id": i},
"literal": {"a.b": i},
}
for i in range(256)
],
@@ -2418,19 +2432,37 @@ def test_create_index_nested_field_paths(mem_db: DBConnection):
)
table = mem_db.create_table("nested_index_paths", data=data)
table.create_scalar_index("rowId", name="row_id_idx")
table.create_scalar_index("`row-id`", name="row_dash_id_idx")
table.create_scalar_index("userId", name="top_user_id_idx")
table.create_scalar_index("metadata.user_id", name="metadata_user_id_idx")
table.create_scalar_index("MetaData.userId", name="mixed_case_metadata_user_id_idx")
table.create_scalar_index("`meta-data`.`user-id`", name="escaped_names_idx")
table.create_scalar_index("literal.`a.b`", name="literal_dot_idx")
table.create_index(
vector_column_name="image.embedding",
num_partitions=1,
num_sub_vectors=1,
name="image_embedding_idx",
)
table.create_fts_index("payload.text", with_position=False, name="payload_text_idx")
indices = sorted(table.list_indices(), key=lambda idx: idx.name)
assert [(idx.name, idx.index_type, idx.columns) for idx in indices] == [
("escaped_names_idx", "BTree", ["`meta-data`.`user-id`"]),
("image_embedding_idx", "IvfPq", ["image.embedding"]),
("literal_dot_idx", "BTree", ["literal.`a.b`"]),
("metadata_user_id_idx", "BTree", ["metadata.user_id"]),
("mixed_case_metadata_user_id_idx", "BTree", ["MetaData.userId"]),
("payload_text_idx", "FTS", ["payload.text"]),
("row_dash_id_idx", "BTree", ["`row-id`"]),
("row_id_idx", "BTree", ["rowId"]),
("top_user_id_idx", "BTree", ["userId"]),
]
for index in indices:
stats = table.index_stats(index.name)
assert stats is not None
assert stats.num_indexed_rows == 256
vector_results = (
table.search([0.0, 1.0], vector_column_name="image.embedding")
@@ -2448,6 +2480,14 @@ def test_create_index_nested_field_paths(mem_db: DBConnection):
assert len(filtered_results) == 1
assert filtered_results[0]["metadata"]["user_id"] == 42
escaped_results = table.search().where("`row-id` = 43").limit(1).to_list()
assert len(escaped_results) == 1
assert escaped_results[0]["row-id"] == 43
fts_results = table.search("document 44", query_type="fts").limit(1).to_list()
assert len(fts_results) == 1
assert fts_results[0]["payload"]["text"] == "document 44"
def test_empty_query(mem_db: DBConnection):
table = mem_db.create_table(