mirror of
https://github.com/lancedb/lancedb.git
synced 2026-07-03 02:50:41 +00:00
fix: support LargeList label list indexes (#3529)
## Summary

This PR extends nested-field regression coverage across Rust local/remote, Python sync/async, and Node so canonical escaped paths stay consistent across scalar, vector, and FTS index lifecycle behavior. It also aligns LanceDB's LabelList type gate with Lance by accepting `LargeList<primitive>` columns while keeping `List<Struct<...>>` unsupported until Lance defines stable membership semantics for struct labels.

Part of #3406.
This commit is contained in:
@@ -113,8 +113,14 @@ async def test_create_nested_scalar_index_lists_canonical_paths(db_async):
|
||||
pa.field("user.id", pa.int32()),
|
||||
]
|
||||
)
|
||||
mixed_case_metadata_type = pa.struct([pa.field("userId", pa.int32())])
|
||||
escaped_metadata_type = pa.struct([pa.field("user-id", pa.int32())])
|
||||
literal_type = pa.struct([pa.field("a.b", pa.int32())])
|
||||
data = pa.Table.from_arrays(
|
||||
[
|
||||
pa.array([1, 2, 3], type=pa.int32()),
|
||||
pa.array([1, 2, 3], type=pa.int32()),
|
||||
pa.array([1, 2, 3], type=pa.int32()),
|
||||
pa.array([1, 2, 3], type=pa.int32()),
|
||||
pa.array(
|
||||
[
|
||||
@@ -124,25 +130,67 @@ async def test_create_nested_scalar_index_lists_canonical_paths(db_async):
|
||||
],
|
||||
type=metadata_type,
|
||||
),
|
||||
pa.array(
|
||||
[{"userId": 10}, {"userId": 20}, {"userId": 30}],
|
||||
type=mixed_case_metadata_type,
|
||||
),
|
||||
pa.array(
|
||||
[{"user-id": 10}, {"user-id": 20}, {"user-id": 30}],
|
||||
type=escaped_metadata_type,
|
||||
),
|
||||
pa.array(
|
||||
[{"a.b": 10}, {"a.b": 20}, {"a.b": 30}],
|
||||
type=literal_type,
|
||||
),
|
||||
],
|
||||
names=[
|
||||
"rowId",
|
||||
"row-id",
|
||||
"userId",
|
||||
"user_id",
|
||||
"metadata",
|
||||
"MetaData",
|
||||
"meta-data",
|
||||
"literal",
|
||||
],
|
||||
names=["user_id", "metadata"],
|
||||
)
|
||||
table = await db_async.create_table("nested_scalar_index", data)
|
||||
|
||||
await table.create_index("user_id", config=BTree(), name="top_user_id_idx")
|
||||
await table.create_index("rowId", config=BTree(), name="row_id_idx")
|
||||
await table.create_index("`row-id`", config=BTree(), name="row_dash_id_idx")
|
||||
await table.create_index("userId", config=BTree(), name="top_user_id_idx")
|
||||
await table.create_index("user_id", config=BTree(), name="top_snake_user_id_idx")
|
||||
await table.create_index(
|
||||
"metadata.user_id", config=BTree(), name="nested_user_id_idx"
|
||||
)
|
||||
await table.create_index(
|
||||
"metadata.`user.id`", config=BTree(), name="escaped_user_id_idx"
|
||||
)
|
||||
await table.create_index(
|
||||
"MetaData.userId", config=BTree(), name="mixed_case_metadata_user_id_idx"
|
||||
)
|
||||
await table.create_index(
|
||||
"`meta-data`.`user-id`", config=BTree(), name="escaped_names_idx"
|
||||
)
|
||||
await table.create_index("literal.`a.b`", config=BTree(), name="literal_dot_idx")
|
||||
|
||||
columns_by_name = {
|
||||
index.name: index.columns for index in await table.list_indices()
|
||||
}
|
||||
assert columns_by_name["top_user_id_idx"] == ["user_id"]
|
||||
assert columns_by_name["row_id_idx"] == ["rowId"]
|
||||
assert columns_by_name["row_dash_id_idx"] == ["`row-id`"]
|
||||
assert columns_by_name["top_user_id_idx"] == ["userId"]
|
||||
assert columns_by_name["top_snake_user_id_idx"] == ["user_id"]
|
||||
assert columns_by_name["nested_user_id_idx"] == ["metadata.user_id"]
|
||||
assert columns_by_name["escaped_user_id_idx"] == ["metadata.`user.id`"]
|
||||
assert columns_by_name["mixed_case_metadata_user_id_idx"] == ["MetaData.userId"]
|
||||
assert columns_by_name["escaped_names_idx"] == ["`meta-data`.`user-id`"]
|
||||
assert columns_by_name["literal_dot_idx"] == ["literal.`a.b`"]
|
||||
|
||||
for index_name in columns_by_name:
|
||||
stats = await table.index_stats(index_name)
|
||||
assert stats is not None
|
||||
assert stats.num_indexed_rows == 3
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -189,6 +237,51 @@ async def test_create_label_list_index(some_table: AsyncTable):
|
||||
await some_table.create_index("tags", config=LabelList())
|
||||
indices = await some_table.list_indices()
|
||||
assert str(indices) == '[Index(LabelList, columns=["tags"], name="tags_idx")]'
|
||||
plan = await some_table.query().where("array_has(tags, 'tag0')").explain_plan()
|
||||
assert "ScalarIndexQuery" in plan
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_create_large_list_label_list_index(db_async):
    """Build a LabelList index on a ``large_list<string>`` column.

    Creates a 16-row table whose single ``tags`` column is typed as
    ``pa.large_list(pa.string())``, indexes it with ``LabelList()``, and then
    checks two things: the string form of the listed indices, and that the
    explained plan of an ``array_has`` filter mentions ``ScalarIndexQuery``.
    """
    # Every row carries an alternating tag ("tag0"/"tag1") plus "shared".
    tag_rows = [[f"tag{i % 2}", "shared"] for i in range(16)]
    tags_field = pa.field("tags", pa.large_list(pa.string()))
    payload = pa.Table.from_pydict(
        {"tags": tag_rows},
        schema=pa.schema([tags_field]),
    )
    tbl = await db_async.create_table("large_list_label_list_index", payload)

    await tbl.create_index("tags", config=LabelList())

    listed = await tbl.list_indices()
    expected_listing = '[Index(LabelList, columns=["tags"], name="tags_idx")]'
    assert str(listed) == expected_listing

    # The membership filter's plan is expected to reference the scalar index.
    membership_query = tbl.query().where("array_has(tags, 'shared')")
    explained = await membership_query.explain_plan()
    assert "ScalarIndexQuery" in explained
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_create_label_list_index_rejects_list_struct(db_async):
    """Creating a LabelList index on a ``list<struct<...>>`` column must fail.

    Builds a one-row table whose ``items`` column is a list of structs (each
    with a string ``tag`` and a nested ``metadata`` struct), then asserts that
    ``create_index(..., config=LabelList())`` raises with a message matching
    ``"LabelList index cannot be created"``.
    """
    nested_metadata = pa.struct([pa.field("userId", pa.string())])
    struct_item = pa.struct(
        [
            pa.field("tag", pa.string()),
            pa.field("metadata", nested_metadata),
        ]
    )
    list_struct_schema = pa.schema([pa.field("items", pa.list_(struct_item))])

    only_row = {
        "items": [
            {"tag": "tag0", "metadata": {"userId": "user0"}},
            {"tag": "shared", "metadata": {"userId": "user1"}},
        ]
    }
    payload = pa.Table.from_pylist([only_row], schema=list_struct_schema)
    tbl = await db_async.create_table("list_struct_label_list_index", payload)

    # Only the index creation sits inside the raises-context, so table setup
    # failures are not mistaken for the expected rejection.
    with pytest.raises(Exception, match="LabelList index cannot be created"):
        await tbl.create_index("items", config=LabelList())
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@@ -2399,18 +2399,32 @@ def test_create_scalar_index(mem_db: DBConnection):
|
||||
def test_create_index_nested_field_paths(mem_db: DBConnection):
|
||||
schema = pa.schema(
|
||||
[
|
||||
pa.field("rowId", pa.int32()),
|
||||
pa.field("row-id", pa.int32()),
|
||||
pa.field("userId", pa.int32()),
|
||||
pa.field("metadata", pa.struct([pa.field("user_id", pa.int32())])),
|
||||
pa.field("MetaData", pa.struct([pa.field("userId", pa.int32())])),
|
||||
pa.field(
|
||||
"image",
|
||||
pa.struct([pa.field("embedding", pa.list_(pa.float32(), 2))]),
|
||||
),
|
||||
pa.field("payload", pa.struct([pa.field("text", pa.string())])),
|
||||
pa.field("meta-data", pa.struct([pa.field("user-id", pa.int32())])),
|
||||
pa.field("literal", pa.struct([pa.field("a.b", pa.int32())])),
|
||||
]
|
||||
)
|
||||
data = pa.Table.from_pylist(
|
||||
[
|
||||
{
|
||||
"rowId": i,
|
||||
"row-id": i,
|
||||
"userId": i,
|
||||
"metadata": {"user_id": i},
|
||||
"MetaData": {"userId": i},
|
||||
"image": {"embedding": [float(i), float(i + 1)]},
|
||||
"payload": {"text": f"document {i}"},
|
||||
"meta-data": {"user-id": i},
|
||||
"literal": {"a.b": i},
|
||||
}
|
||||
for i in range(256)
|
||||
],
|
||||
@@ -2418,19 +2432,37 @@ def test_create_index_nested_field_paths(mem_db: DBConnection):
|
||||
)
|
||||
table = mem_db.create_table("nested_index_paths", data=data)
|
||||
|
||||
table.create_scalar_index("rowId", name="row_id_idx")
|
||||
table.create_scalar_index("`row-id`", name="row_dash_id_idx")
|
||||
table.create_scalar_index("userId", name="top_user_id_idx")
|
||||
table.create_scalar_index("metadata.user_id", name="metadata_user_id_idx")
|
||||
table.create_scalar_index("MetaData.userId", name="mixed_case_metadata_user_id_idx")
|
||||
table.create_scalar_index("`meta-data`.`user-id`", name="escaped_names_idx")
|
||||
table.create_scalar_index("literal.`a.b`", name="literal_dot_idx")
|
||||
table.create_index(
|
||||
vector_column_name="image.embedding",
|
||||
num_partitions=1,
|
||||
num_sub_vectors=1,
|
||||
name="image_embedding_idx",
|
||||
)
|
||||
table.create_fts_index("payload.text", with_position=False, name="payload_text_idx")
|
||||
|
||||
indices = sorted(table.list_indices(), key=lambda idx: idx.name)
|
||||
assert [(idx.name, idx.index_type, idx.columns) for idx in indices] == [
|
||||
("escaped_names_idx", "BTree", ["`meta-data`.`user-id`"]),
|
||||
("image_embedding_idx", "IvfPq", ["image.embedding"]),
|
||||
("literal_dot_idx", "BTree", ["literal.`a.b`"]),
|
||||
("metadata_user_id_idx", "BTree", ["metadata.user_id"]),
|
||||
("mixed_case_metadata_user_id_idx", "BTree", ["MetaData.userId"]),
|
||||
("payload_text_idx", "FTS", ["payload.text"]),
|
||||
("row_dash_id_idx", "BTree", ["`row-id`"]),
|
||||
("row_id_idx", "BTree", ["rowId"]),
|
||||
("top_user_id_idx", "BTree", ["userId"]),
|
||||
]
|
||||
for index in indices:
|
||||
stats = table.index_stats(index.name)
|
||||
assert stats is not None
|
||||
assert stats.num_indexed_rows == 256
|
||||
|
||||
vector_results = (
|
||||
table.search([0.0, 1.0], vector_column_name="image.embedding")
|
||||
@@ -2448,6 +2480,14 @@ def test_create_index_nested_field_paths(mem_db: DBConnection):
|
||||
assert len(filtered_results) == 1
|
||||
assert filtered_results[0]["metadata"]["user_id"] == 42
|
||||
|
||||
escaped_results = table.search().where("`row-id` = 43").limit(1).to_list()
|
||||
assert len(escaped_results) == 1
|
||||
assert escaped_results[0]["row-id"] == 43
|
||||
|
||||
fts_results = table.search("document 44", query_type="fts").limit(1).to_list()
|
||||
assert len(fts_results) == 1
|
||||
assert fts_results[0]["payload"]["text"] == "document 44"
|
||||
|
||||
|
||||
def test_empty_query(mem_db: DBConnection):
|
||||
table = mem_db.create_table(
|
||||
|
||||
Reference in New Issue
Block a user