From 04480c274aad3e8a436ab4bbb07cb3f46eddd53b Mon Sep 17 00:00:00 2001 From: nuthalapativarun Date: Thu, 11 Jun 2026 08:06:04 -0700 Subject: [PATCH] test(python): add nested field regression matrix tests (#3518) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Closes #3406 Add a regression matrix in `python/python/tests/test_nested_fields.py` that exercises the full nested field index lifecycle for both the sync and async Python table APIs. The tests will fail if any implementation regresses to leaf-only field names in `list_indices`, `index_stats`, search, or filter results. ## Test scenarios covered **Index types:** BTree scalar, IvfPq vector, FTS **Field-name edge cases (per acceptance criteria):** - `rowId` — camelCase top-level field - `` `row-id` `` — hyphenated top-level field (escaped) - `parent.`\``leaf.name`\`` ` — struct leaf whose name contains a literal dot - `MetaData.userId` — mixed-case nested path - `` `meta-data`.`user-id` `` — hyphenated struct with hyphenated leaf **Lifecycle operations per index type:** - `create_index` / `create_scalar_index` / `create_fts_index` - `list_indices` → verify canonical full dotted path (not leaf name) - `index_stats` → verify row count and index type - Filtered scan (`WHERE nested.field = value`) - Vector search via nested embedding column - FTS search via nested text column - `add` (append) then re-check index listing - `optimize` then re-check index listing **Both sync and async APIs** are covered in parallel test classes. ## Notes Lance forbids top-level field names that contain a literal `.`, so the `` `a.b` `` acceptance-criterion variant is exercised as a *struct leaf* field (`parent.`\``leaf.name`\``) rather than a top-level column. --- python/python/tests/test_nested_fields.py | 686 ++++++++++++++++++++++ 1 file changed, 686 insertions(+) create mode 100644 python/python/tests/test_nested_fields.py diff --git a/python/python/tests/test_nested_fields.py b/python/python/tests/test_nested_fields.py new file mode 100644 index 000000000..0b264e999 --- /dev/null +++ b/python/python/tests/test_nested_fields.py @@ -0,0 +1,686 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The LanceDB Authors + +"""Regression matrix for nested field support across LanceDB Python APIs. + +Covers the lifecycle described in lancedb/lancedb#3406: + - Nested scalar, vector, and FTS index creation with full dotted paths + - list_indices / index_stats return canonical full paths (not leaf names) + - search, filter, append, optimize behaviour + - Field-name edge cases: mixed case, literal-dot field names, same-name leaves + - Both sync and async Python table APIs + +The matrix uses the following field-name variants from the acceptance criteria: + - rowId (camelCase top-level) + - `row-id` (hyphenated top-level, escaped) + - parent.`leaf.name` (struct leaf whose name contains a literal dot) + - MetaData.userId (mixed-case nested path) + - `meta-data`.`user-id` (hyphenated struct with hyphenated leaf) + +Note: Lance forbids top-level field names that contain a '.', so the literal-dot +edge case is exercised via a struct leaf field (parent.`leaf.name`) instead. +""" + +from datetime import timedelta + +import pyarrow as pa +import pytest +import pytest_asyncio + +import lancedb +from lancedb.db import AsyncConnection, DBConnection +from lancedb.index import BTree, FTS, IvfPq + + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +DIM = 8 +# IvfPq requires at least num_partitions * 256 rows by default; keeping rows +# small means we must drop num_sub_vectors and num_partitions very low. +NROWS = 256 + + +def _vec(row: int) -> list: + return [float((row * DIM + i) % 256) for i in range(DIM)] + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def sync_db(tmp_path) -> DBConnection: + return lancedb.connect(tmp_path) + + +@pytest_asyncio.fixture +async def async_db(tmp_path) -> AsyncConnection: + return await lancedb.connect_async( + tmp_path, read_consistency_interval=timedelta(seconds=0) + ) + + +# --------------------------------------------------------------------------- +# Schema / data builders +# --------------------------------------------------------------------------- + + +def _nested_scalar_schema() -> pa.Schema: + """Schema with nested scalar fields covering the acceptance-criteria names. + + Top-level columns: + - rowId int32 (camelCase top-level) + - row-id int32 (hyphenated top-level name) + - MetaData struct{userId int32} (mixed-case nested path) + - meta-data struct{user-id int32} (hyphenated struct + hyphenated leaf) + + Lance disallows top-level field names that contain '.' (e.g. a field + literally named 'a.b'), so that edge case is tested separately using + _literal_dot_schema() below. + """ + return pa.schema( + [ + pa.field("rowId", pa.int32()), + pa.field("row-id", pa.int32()), + pa.field( + "MetaData", + pa.struct([pa.field("userId", pa.int32())]), + ), + pa.field( + "meta-data", + pa.struct([pa.field("user-id", pa.int32())]), + ), + ] + ) + + +def _nested_scalar_data(nrows: int = NROWS) -> pa.Table: + schema = _nested_scalar_schema() + return pa.table( + { + "rowId": pa.array(list(range(nrows)), pa.int32()), + "row-id": pa.array(list(range(nrows)), pa.int32()), + "MetaData": pa.array( + [{"userId": i} for i in range(nrows)], + type=pa.struct([pa.field("userId", pa.int32())]), + ), + "meta-data": pa.array( + [{"user-id": i} for i in range(nrows)], + type=pa.struct([pa.field("user-id", pa.int32())]), + ), + }, + schema=schema, + ) + + +def _literal_dot_schema() -> pa.Schema: + """Schema where a struct *leaf* field is named with a literal dot. + + The path used in the index API is ``parent.`leaf.name` ``. + """ + return pa.schema( + [ + pa.field("id", pa.int32()), + pa.field( + "parent", + pa.struct([pa.field("leaf.name", pa.int32())]), + ), + ] + ) + + +def _literal_dot_data(nrows: int = NROWS) -> pa.Table: + parent_type = pa.struct([pa.field("leaf.name", pa.int32())]) + return pa.table( + { + "id": pa.array(list(range(nrows)), pa.int32()), + "parent": pa.array( + [{"leaf.name": i} for i in range(nrows)], + type=parent_type, + ), + }, + schema=_literal_dot_schema(), + ) + + +def _same_leaf_schema() -> pa.Schema: + return pa.schema( + [ + pa.field("StructA", pa.struct([pa.field("userId", pa.int32())])), + pa.field("StructB", pa.struct([pa.field("userId", pa.int32())])), + ] + ) + + +def _same_leaf_data(nrows: int = NROWS) -> pa.Table: + t = pa.struct([pa.field("userId", pa.int32())]) + return pa.table( + { + "StructA": pa.array([{"userId": i} for i in range(nrows)], type=t), + "StructB": pa.array([{"userId": i * 10} for i in range(nrows)], type=t), + }, + schema=_same_leaf_schema(), + ) + + +def _nested_vector_schema() -> pa.Schema: + return pa.schema( + [ + pa.field("id", pa.int32()), + pa.field( + "image", + pa.struct([pa.field("embedding", pa.list_(pa.float32(), DIM))]), + ), + pa.field( + "MetaData", + pa.struct([pa.field("userId", pa.int32())]), + ), + ] + ) + + +def _nested_vector_data(nrows: int = NROWS) -> pa.Table: + embedding_type = pa.list_(pa.float32(), DIM) + image_type = pa.struct([pa.field("embedding", embedding_type)]) + meta_type = pa.struct([pa.field("userId", pa.int32())]) + return pa.table( + { + "id": pa.array(list(range(nrows)), pa.int32()), + "image": pa.array( + [{"embedding": _vec(i)} for i in range(nrows)], + type=image_type, + ), + "MetaData": pa.array( + [{"userId": i} for i in range(nrows)], + type=meta_type, + ), + }, + schema=_nested_vector_schema(), + ) + + +def _nested_fts_schema() -> pa.Schema: + return pa.schema( + [ + pa.field("id", pa.int32()), + pa.field( + "payload", + pa.struct([pa.field("text", pa.utf8())]), + ), + pa.field( + "MetaData", + pa.struct([pa.field("userId", pa.int32())]), + ), + ] + ) + + +def _nested_fts_data(nrows: int = NROWS) -> pa.Table: + words = ["alpha", "bravo", "charlie", "delta", "echo"] + payload_type = pa.struct([pa.field("text", pa.utf8())]) + meta_type = pa.struct([pa.field("userId", pa.int32())]) + return pa.table( + { + "id": pa.array(list(range(nrows)), pa.int32()), + "payload": pa.array( + [{"text": words[i % len(words)]} for i in range(nrows)], + type=payload_type, + ), + "MetaData": pa.array( + [{"userId": i} for i in range(nrows)], + type=meta_type, + ), + }, + schema=_nested_fts_schema(), + ) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _columns_by_name_sync(tbl) -> dict: + return {idx.name: idx.columns for idx in tbl.list_indices()} + + +async def _columns_by_name_async(tbl) -> dict: + return {idx.name: idx.columns for idx in await tbl.list_indices()} + + +# =========================================================================== +# SYNC TESTS +# =========================================================================== +# +# The sync LanceTable API uses: +# - create_scalar_index(column, ...) for scalar (BTree/Bitmap/LabelList) indices +# - create_fts_index(column, ...) for full-text-search indices +# - create_index(...) for vector indices (older positional API) +# =========================================================================== + + +class TestNestedScalarIndexSync: + """Sync regression matrix for nested scalar (BTree) indices.""" + + def test_top_level_camelcase_field(self, sync_db): + """list_indices must return the full camelCase field name.""" + tbl = sync_db.create_table("t", _nested_scalar_data()) + tbl.create_scalar_index("rowId", index_type="BTREE", name="rowid_idx") + col_map = _columns_by_name_sync(tbl) + assert col_map["rowid_idx"] == ["rowId"], ( + "list_indices must return 'rowId', not a truncated leaf name" + ) + + def test_top_level_hyphenated_field_escaped(self, sync_db): + """Top-level field 'row-id' (hyphenated) accessed via escaped path.""" + tbl = sync_db.create_table("t", _nested_scalar_data()) + tbl.create_scalar_index("`row-id`", index_type="BTREE", name="rowid_hyph_idx") + col_map = _columns_by_name_sync(tbl) + assert col_map["rowid_hyph_idx"] == ["`row-id`"], ( + "list_indices must return escaped path '`row-id`'" + ) + + def test_struct_leaf_literal_dot_field_escaped(self, sync_db): + """Struct leaf with a literal-dot name: parent.`leaf.name`. + + The index listing must use the full escaped path, not just the leaf. + """ + tbl = sync_db.create_table("t", _literal_dot_data()) + tbl.create_scalar_index( + "parent.`leaf.name`", index_type="BTREE", name="leaf_dot_idx" + ) + col_map = _columns_by_name_sync(tbl) + assert col_map["leaf_dot_idx"] == ["parent.`leaf.name`"], ( + "list_indices must return 'parent.`leaf.name`', not just '`leaf.name`'" + ) + + def test_nested_mixed_case_path(self, sync_db): + """Nested path MetaData.userId (mixed case) must appear as full path.""" + tbl = sync_db.create_table("t", _nested_scalar_data()) + tbl.create_scalar_index( + "MetaData.userId", index_type="BTREE", name="metadata_userid_idx" + ) + col_map = _columns_by_name_sync(tbl) + assert col_map["metadata_userid_idx"] == ["MetaData.userId"], ( + "list_indices must return 'MetaData.userId', not leaf 'userId'" + ) + + def test_nested_hyphenated_path_escaped(self, sync_db): + """`meta-data`.`user-id` path with both parts escaped.""" + tbl = sync_db.create_table("t", _nested_scalar_data()) + tbl.create_scalar_index( + "`meta-data`.`user-id`", index_type="BTREE", name="metauid_idx" + ) + col_map = _columns_by_name_sync(tbl) + assert col_map["metauid_idx"] == ["`meta-data`.`user-id`"], ( + "list_indices must return '`meta-data`.`user-id`', not 'user-id'" + ) + + def test_filter_on_nested_mixed_case(self, sync_db): + """WHERE filter on a nested dotted path works after index creation.""" + tbl = sync_db.create_table("t", _nested_scalar_data()) + tbl.create_scalar_index( + "MetaData.userId", index_type="BTREE", name="metadata_userid_idx" + ) + rows = tbl.search().where("MetaData.userId = 5").to_list() + assert len(rows) == 1 + assert rows[0]["MetaData"]["userId"] == 5 + + def test_append_and_list_indices_stable(self, sync_db): + """After appending rows the index listing must remain unchanged.""" + tbl = sync_db.create_table("t", _nested_scalar_data()) + tbl.create_scalar_index( + "MetaData.userId", index_type="BTREE", name="meta_uid_idx" + ) + tbl.add(_nested_scalar_data(nrows=4)) + col_map = _columns_by_name_sync(tbl) + assert col_map["meta_uid_idx"] == ["MetaData.userId"] + + def test_optimize_and_list_indices_stable(self, tmp_path): + """After optimize the index listing must still show full paths.""" + db = lancedb.connect(tmp_path / "opt_db") + tbl = db.create_table("t", _nested_scalar_data()) + tbl.create_scalar_index( + "MetaData.userId", index_type="BTREE", name="meta_uid_idx" + ) + tbl.add(_nested_scalar_data(nrows=4)) + tbl.optimize() + col_map = _columns_by_name_sync(tbl) + assert col_map["meta_uid_idx"] == ["MetaData.userId"] + + def test_same_name_leaves_are_distinct(self, sync_db): + """Two structs sharing a leaf name must produce distinct index paths.""" + tbl = sync_db.create_table("same_leaf", _same_leaf_data()) + tbl.create_scalar_index( + "StructA.userId", index_type="BTREE", name="a_userid_idx" + ) + tbl.create_scalar_index( + "StructB.userId", index_type="BTREE", name="b_userid_idx" + ) + col_map = _columns_by_name_sync(tbl) + assert col_map["a_userid_idx"] == ["StructA.userId"] + assert col_map["b_userid_idx"] == ["StructB.userId"] + + def test_index_stats_canonical_path(self, sync_db): + """index_stats round-trip: create on nested field, verify row count.""" + tbl = sync_db.create_table("t", _nested_scalar_data()) + tbl.create_scalar_index( + "MetaData.userId", index_type="BTREE", name="meta_uid_idx" + ) + stats = tbl.index_stats("meta_uid_idx") + assert stats is not None + assert stats.index_type == "BTREE" + assert stats.num_indexed_rows == NROWS + + +class TestNestedVectorIndexSync: + """Sync regression matrix for nested vector (IvfPq) indices.""" + + def test_nested_vector_index_full_path(self, sync_db): + """Listing after vector index creation must use the full dotted path.""" + tbl = sync_db.create_table("vt", _nested_vector_data()) + tbl.create_index( + num_partitions=2, + num_sub_vectors=2, + vector_column_name="image.embedding", + name="image_emb_idx", + ) + col_map = _columns_by_name_sync(tbl) + assert col_map["image_emb_idx"] == ["image.embedding"], ( + "list_indices must return 'image.embedding', not leaf 'embedding'" + ) + + def test_nested_vector_search(self, sync_db): + """Vector search on nested embedding field must return results.""" + tbl = sync_db.create_table("vt", _nested_vector_data()) + tbl.create_index( + num_partitions=2, + num_sub_vectors=2, + vector_column_name="image.embedding", + name="image_emb_idx", + ) + results = ( + tbl.search(_vec(0), vector_column_name="image.embedding").limit(5).to_list() + ) + assert len(results) > 0 + + def test_nested_vector_index_stats(self, sync_db): + """index_stats for a nested vector index must reflect correct row count.""" + tbl = sync_db.create_table("vt", _nested_vector_data()) + tbl.create_index( + num_partitions=2, + num_sub_vectors=2, + vector_column_name="image.embedding", + name="image_emb_idx", + ) + stats = tbl.index_stats("image_emb_idx") + assert stats is not None + assert stats.num_indexed_rows == NROWS + + def test_nested_vector_append_optimize(self, tmp_path): + """After append and optimize the vector index listing must be stable.""" + db = lancedb.connect(tmp_path / "vec_opt_db") + tbl = db.create_table("vt", _nested_vector_data()) + tbl.create_index( + num_partitions=2, + num_sub_vectors=2, + vector_column_name="image.embedding", + name="image_emb_idx", + ) + tbl.add(_nested_vector_data(nrows=4)) + tbl.optimize() + col_map = _columns_by_name_sync(tbl) + assert col_map["image_emb_idx"] == ["image.embedding"] + + +class TestNestedFTSIndexSync: + """Sync regression matrix for nested FTS indices.""" + + def test_nested_fts_index_full_path(self, sync_db): + """FTS index on payload.text must be listed with the full path.""" + tbl = sync_db.create_table("ft", _nested_fts_data()) + tbl.create_fts_index("payload.text", name="payload_text_idx") + col_map = _columns_by_name_sync(tbl) + assert col_map["payload_text_idx"] == ["payload.text"], ( + "list_indices must return 'payload.text', not leaf 'text'" + ) + + def test_nested_fts_search(self, sync_db): + """FTS search on a nested text field must return correct results.""" + tbl = sync_db.create_table("ft", _nested_fts_data()) + tbl.create_fts_index("payload.text", name="payload_text_idx") + results = ( + tbl.search("alpha", query_type="fts", fts_columns="payload.text") + .limit(10) + .to_list() + ) + assert len(results) > 0 + assert all(row["payload"]["text"] == "alpha" for row in results) + + def test_nested_fts_append_optimize(self, tmp_path): + """After append and optimize the FTS index listing must be stable.""" + db = lancedb.connect(tmp_path / "fts_opt_db") + tbl = db.create_table("ft", _nested_fts_data()) + tbl.create_fts_index("payload.text", name="payload_text_idx") + tbl.add(_nested_fts_data(nrows=4)) + tbl.optimize() + col_map = _columns_by_name_sync(tbl) + assert col_map["payload_text_idx"] == ["payload.text"] + + +# =========================================================================== +# ASYNC TESTS +# =========================================================================== +# +# The async AsyncTable API uses create_index(column, config=...) uniformly +# for scalar, vector, and FTS indices. +# =========================================================================== + + +class TestNestedScalarIndexAsync: + """Async regression matrix for nested scalar (BTree) indices.""" + + @pytest.mark.asyncio + async def test_top_level_camelcase_field(self, async_db): + """list_indices must return the full camelCase field name.""" + tbl = await async_db.create_table("t", _nested_scalar_data()) + await tbl.create_index("rowId", config=BTree(), name="rowid_idx") + col_map = await _columns_by_name_async(tbl) + assert col_map["rowid_idx"] == ["rowId"] + + @pytest.mark.asyncio + async def test_top_level_hyphenated_field_escaped(self, async_db): + """Hyphenated top-level field accessed via escaped path.""" + tbl = await async_db.create_table("t", _nested_scalar_data()) + await tbl.create_index("`row-id`", config=BTree(), name="rowid_hyph_idx") + col_map = await _columns_by_name_async(tbl) + assert col_map["rowid_hyph_idx"] == ["`row-id`"] + + @pytest.mark.asyncio + async def test_struct_leaf_literal_dot_field_escaped(self, async_db): + """Struct leaf with a literal-dot name: parent.`leaf.name`.""" + tbl = await async_db.create_table("t", _literal_dot_data()) + await tbl.create_index( + "parent.`leaf.name`", config=BTree(), name="leaf_dot_idx" + ) + col_map = await _columns_by_name_async(tbl) + assert col_map["leaf_dot_idx"] == ["parent.`leaf.name`"] + + @pytest.mark.asyncio + async def test_nested_mixed_case_path(self, async_db): + """Mixed-case nested path MetaData.userId must appear as full path.""" + tbl = await async_db.create_table("t", _nested_scalar_data()) + await tbl.create_index( + "MetaData.userId", config=BTree(), name="metadata_userid_idx" + ) + col_map = await _columns_by_name_async(tbl) + assert col_map["metadata_userid_idx"] == ["MetaData.userId"] + + @pytest.mark.asyncio + async def test_nested_hyphenated_path_escaped(self, async_db): + """`meta-data`.`user-id` path with both parts escaped.""" + tbl = await async_db.create_table("t", _nested_scalar_data()) + await tbl.create_index( + "`meta-data`.`user-id`", config=BTree(), name="metauid_idx" + ) + col_map = await _columns_by_name_async(tbl) + assert col_map["metauid_idx"] == ["`meta-data`.`user-id`"] + + @pytest.mark.asyncio + async def test_filter_on_nested_mixed_case(self, async_db): + """WHERE filter on a nested dotted path works after index creation.""" + tbl = await async_db.create_table("t", _nested_scalar_data()) + await tbl.create_index( + "MetaData.userId", config=BTree(), name="metadata_userid_idx" + ) + rows = await tbl.query().where("MetaData.userId = 5").to_list() + assert len(rows) == 1 + assert rows[0]["MetaData"]["userId"] == 5 + + @pytest.mark.asyncio + async def test_index_stats_canonical_path(self, async_db): + """index_stats round-trip: create on nested field, verify stats.""" + tbl = await async_db.create_table("t", _nested_scalar_data()) + await tbl.create_index("MetaData.userId", config=BTree(), name="meta_uid_idx") + stats = await tbl.index_stats("meta_uid_idx") + assert stats is not None + assert stats.index_type == "BTREE" + assert stats.num_indexed_rows == NROWS + + @pytest.mark.asyncio + async def test_append_and_list_indices_stable(self, async_db): + """After appending rows the index listing must remain unchanged.""" + tbl = await async_db.create_table("t", _nested_scalar_data()) + await tbl.create_index("MetaData.userId", config=BTree(), name="meta_uid_idx") + await tbl.add(_nested_scalar_data(nrows=4)) + col_map = await _columns_by_name_async(tbl) + assert col_map["meta_uid_idx"] == ["MetaData.userId"] + + @pytest.mark.asyncio + async def test_optimize_and_list_indices_stable(self, tmp_path): + """After optimize the index listing must still show full paths.""" + db = await lancedb.connect_async( + tmp_path / "opt_db", read_consistency_interval=timedelta(seconds=0) + ) + tbl = await db.create_table("t", _nested_scalar_data()) + await tbl.create_index("MetaData.userId", config=BTree(), name="meta_uid_idx") + await tbl.add(_nested_scalar_data(nrows=4)) + await tbl.optimize() + col_map = await _columns_by_name_async(tbl) + assert col_map["meta_uid_idx"] == ["MetaData.userId"] + + @pytest.mark.asyncio + async def test_same_name_leaves_are_distinct(self, async_db): + """Two structs sharing a leaf name must produce distinct index paths.""" + tbl = await async_db.create_table("same_leaf", _same_leaf_data()) + await tbl.create_index("StructA.userId", config=BTree(), name="a_userid_idx") + await tbl.create_index("StructB.userId", config=BTree(), name="b_userid_idx") + col_map = await _columns_by_name_async(tbl) + assert col_map["a_userid_idx"] == ["StructA.userId"] + assert col_map["b_userid_idx"] == ["StructB.userId"] + + +class TestNestedVectorIndexAsync: + """Async regression matrix for nested vector (IvfPq) indices.""" + + @pytest.mark.asyncio + async def test_nested_vector_index_full_path(self, async_db): + """Listing after vector index creation must use the full dotted path.""" + tbl = await async_db.create_table("vt", _nested_vector_data()) + await tbl.create_index( + "image.embedding", + config=IvfPq(num_partitions=2, num_sub_vectors=2), + name="image_emb_idx", + ) + col_map = await _columns_by_name_async(tbl) + assert col_map["image_emb_idx"] == ["image.embedding"] + + @pytest.mark.asyncio + async def test_nested_vector_search(self, async_db): + """Vector search on nested embedding field must return results.""" + tbl = await async_db.create_table("vt", _nested_vector_data()) + await tbl.create_index( + "image.embedding", + config=IvfPq(num_partitions=2, num_sub_vectors=2), + name="image_emb_idx", + ) + results = ( + await tbl.query() + .nearest_to(_vec(0)) + .column("image.embedding") + .limit(5) + .to_list() + ) + assert len(results) > 0 + + @pytest.mark.asyncio + async def test_nested_vector_index_stats(self, async_db): + """index_stats for a nested vector index must reflect correct row count.""" + tbl = await async_db.create_table("vt", _nested_vector_data()) + await tbl.create_index( + "image.embedding", + config=IvfPq(num_partitions=2, num_sub_vectors=2), + name="image_emb_idx", + ) + stats = await tbl.index_stats("image_emb_idx") + assert stats is not None + assert stats.num_indexed_rows == NROWS + + @pytest.mark.asyncio + async def test_nested_vector_append_optimize(self, tmp_path): + """After append and optimize the vector index listing must be stable.""" + db = await lancedb.connect_async( + tmp_path / "vec_opt_db", read_consistency_interval=timedelta(seconds=0) + ) + tbl = await db.create_table("vt", _nested_vector_data()) + await tbl.create_index( + "image.embedding", + config=IvfPq(num_partitions=2, num_sub_vectors=2), + name="image_emb_idx", + ) + await tbl.add(_nested_vector_data(nrows=4)) + await tbl.optimize() + col_map = await _columns_by_name_async(tbl) + assert col_map["image_emb_idx"] == ["image.embedding"] + + +class TestNestedFTSIndexAsync: + """Async regression matrix for nested FTS indices.""" + + @pytest.mark.asyncio + async def test_nested_fts_index_full_path(self, async_db): + """FTS index on payload.text must be listed with the full path.""" + tbl = await async_db.create_table("ft", _nested_fts_data()) + await tbl.create_index("payload.text", config=FTS(), name="payload_text_idx") + col_map = await _columns_by_name_async(tbl) + assert col_map["payload_text_idx"] == ["payload.text"] + + @pytest.mark.asyncio + async def test_nested_fts_search(self, async_db): + """FTS search on a nested text field must return correct results.""" + tbl = await async_db.create_table("ft", _nested_fts_data()) + await tbl.create_index("payload.text", config=FTS(), name="payload_text_idx") + results = ( + await tbl.query() + .nearest_to_text("alpha", columns="payload.text") + .limit(10) + .to_list() + ) + assert len(results) > 0 + assert all(row["payload"]["text"] == "alpha" for row in results) + + @pytest.mark.asyncio + async def test_nested_fts_append_optimize(self, tmp_path): + """After append and optimize the FTS index listing must be stable.""" + db = await lancedb.connect_async( + tmp_path / "fts_opt_db", read_consistency_interval=timedelta(seconds=0) + ) + tbl = await db.create_table("ft", _nested_fts_data()) + await tbl.create_index("payload.text", config=FTS(), name="payload_text_idx") + await tbl.add(_nested_fts_data(nrows=4)) + await tbl.optimize() + col_map = await _columns_by_name_async(tbl) + assert col_map["payload_text_idx"] == ["payload.text"]