mirror of
https://github.com/lancedb/lancedb.git
synced 2026-07-02 18:40:40 +00:00
test(python): add nested field regression matrix tests (#3518)
## Summary

Closes #3406

Add a regression matrix in `python/python/tests/test_nested_fields.py` that exercises the full nested field index lifecycle for both the sync and async Python table APIs. The tests will fail if any implementation regresses to leaf-only field names in `list_indices`, `index_stats`, search, or filter results.

## Test scenarios covered

**Index types:** BTree scalar, IvfPq vector, FTS

**Field-name edge cases (per acceptance criteria):**
- `rowId` — camelCase top-level field
- `` `row-id` `` — hyphenated top-level field (escaped)
- `` parent.`leaf.name` `` — struct leaf whose name contains a literal dot
- `MetaData.userId` — mixed-case nested path
- `` `meta-data`.`user-id` `` — hyphenated struct with hyphenated leaf

**Lifecycle operations per index type:**
- `create_index` / `create_scalar_index` / `create_fts_index`
- `list_indices` → verify canonical full dotted path (not leaf name)
- `index_stats` → verify row count and index type
- Filtered scan (`WHERE nested.field = value`)
- Vector search via nested embedding column
- FTS search via nested text column
- `add` (append) then re-check index listing
- `optimize` then re-check index listing

**Both sync and async APIs** are covered in parallel test classes.

## Notes

Lance forbids top-level field names that contain a literal `.`, so the `` `a.b` `` acceptance-criterion variant is exercised as a *struct leaf* field (`` parent.`leaf.name` ``) rather than a top-level column.
This commit is contained in:
686
python/python/tests/test_nested_fields.py
Normal file
686
python/python/tests/test_nested_fields.py
Normal file
@@ -0,0 +1,686 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
"""Regression matrix for nested field support across LanceDB Python APIs.
|
||||
|
||||
Covers the lifecycle described in lancedb/lancedb#3406:
|
||||
- Nested scalar, vector, and FTS index creation with full dotted paths
|
||||
- list_indices / index_stats return canonical full paths (not leaf names)
|
||||
- search, filter, append, optimize behaviour
|
||||
- Field-name edge cases: mixed case, literal-dot field names, same-name leaves
|
||||
- Both sync and async Python table APIs
|
||||
|
||||
The matrix uses the following field-name variants from the acceptance criteria:
|
||||
- rowId (camelCase top-level)
|
||||
- `row-id` (hyphenated top-level, escaped)
|
||||
- parent.`leaf.name` (struct leaf whose name contains a literal dot)
|
||||
- MetaData.userId (mixed-case nested path)
|
||||
- `meta-data`.`user-id` (hyphenated struct with hyphenated leaf)
|
||||
|
||||
Note: Lance forbids top-level field names that contain a '.', so the literal-dot
|
||||
edge case is exercised via a struct leaf field (parent.`leaf.name`) instead.
|
||||
"""
|
||||
|
||||
from datetime import timedelta
|
||||
|
||||
import pyarrow as pa
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
import lancedb
|
||||
from lancedb.db import AsyncConnection, DBConnection
|
||||
from lancedb.index import BTree, FTS, IvfPq
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Constants
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
DIM = 8
|
||||
# IvfPq requires at least num_partitions * 256 rows by default; keeping rows
|
||||
# small means we must drop num_sub_vectors and num_partitions very low.
|
||||
NROWS = 256
|
||||
|
||||
|
||||
def _vec(row: int) -> list:
|
||||
return [float((row * DIM + i) % 256) for i in range(DIM)]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture
def sync_db(tmp_path) -> DBConnection:
    """Provide a sync LanceDB connection opened on pytest's ``tmp_path``."""
    connection = lancedb.connect(tmp_path)
    return connection
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def async_db(tmp_path) -> AsyncConnection:
    """Provide an async LanceDB connection opened on pytest's ``tmp_path``.

    The connection is opened with a zero ``read_consistency_interval``.
    """
    interval = timedelta(seconds=0)
    connection = await lancedb.connect_async(
        tmp_path, read_consistency_interval=interval
    )
    return connection
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Schema / data builders
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _nested_scalar_schema() -> pa.Schema:
    """Build the schema shared by the nested scalar-index tests.

    Columns:
    - ``rowId``: int32 (camelCase top-level name)
    - ``row-id``: int32 (hyphenated top-level name)
    - ``MetaData``: struct with an int32 ``userId`` leaf (mixed-case path)
    - ``meta-data``: struct with an int32 ``user-id`` leaf (hyphenated path)

    Lance disallows top-level field names that contain '.' (for example a
    field literally named 'a.b'), so that edge case is covered separately by
    ``_literal_dot_schema()``.
    """
    int32 = pa.int32()
    mixed_case_struct = pa.struct([pa.field("userId", int32)])
    hyphenated_struct = pa.struct([pa.field("user-id", int32)])
    fields = [
        pa.field("rowId", int32),
        pa.field("row-id", int32),
        pa.field("MetaData", mixed_case_struct),
        pa.field("meta-data", hyphenated_struct),
    ]
    return pa.schema(fields)
|
||||
|
||||
|
||||
def _nested_scalar_data(nrows: int = NROWS) -> pa.Table:
    """Build ``nrows`` rows matching ``_nested_scalar_schema()``.

    Every column (and every struct leaf) of row ``i`` holds the value ``i``.
    """
    int32 = pa.int32()
    ids = list(range(nrows))
    mixed_case_type = pa.struct([pa.field("userId", int32)])
    hyphenated_type = pa.struct([pa.field("user-id", int32)])
    columns = {
        "rowId": pa.array(ids, int32),
        "row-id": pa.array(ids, int32),
        "MetaData": pa.array(
            [{"userId": i} for i in ids],
            type=mixed_case_type,
        ),
        "meta-data": pa.array(
            [{"user-id": i} for i in ids],
            type=hyphenated_type,
        ),
    }
    return pa.table(columns, schema=_nested_scalar_schema())
|
||||
|
||||
|
||||
def _literal_dot_schema() -> pa.Schema:
    """Build a schema whose struct *leaf* field name contains a literal dot.

    Columns are an int32 ``id`` and a ``parent`` struct with a single int32
    leaf named ``leaf.name``. The index API addresses that leaf as
    ``parent.`leaf.name` ``.
    """
    dotted_leaf = pa.field("leaf.name", pa.int32())
    fields = [
        pa.field("id", pa.int32()),
        pa.field("parent", pa.struct([dotted_leaf])),
    ]
    return pa.schema(fields)
|
||||
|
||||
|
||||
def _literal_dot_data(nrows: int = NROWS) -> pa.Table:
    """Build ``nrows`` rows for ``_literal_dot_schema()``.

    Row ``i`` has ``id == i`` and ``parent["leaf.name"] == i``.
    """
    ids = list(range(nrows))
    struct_type = pa.struct([pa.field("leaf.name", pa.int32())])
    columns = {
        "id": pa.array(ids, pa.int32()),
        "parent": pa.array([{"leaf.name": i} for i in ids], type=struct_type),
    }
    return pa.table(columns, schema=_literal_dot_schema())
|
||||
|
||||
|
||||
def _same_leaf_schema() -> pa.Schema:
    """Build a schema with two structs that share the leaf name ``userId``."""
    fields = [
        pa.field(struct_name, pa.struct([pa.field("userId", pa.int32())]))
        for struct_name in ("StructA", "StructB")
    ]
    return pa.schema(fields)
|
||||
|
||||
|
||||
def _same_leaf_data(nrows: int = NROWS) -> pa.Table:
    """Build ``nrows`` rows for ``_same_leaf_schema()``.

    Row ``i`` stores ``i`` in ``StructA.userId`` and ``i * 10`` in
    ``StructB.userId``.
    """
    struct_type = pa.struct([pa.field("userId", pa.int32())])
    struct_a_rows = [{"userId": i} for i in range(nrows)]
    struct_b_rows = [{"userId": i * 10} for i in range(nrows)]
    columns = {
        "StructA": pa.array(struct_a_rows, type=struct_type),
        "StructB": pa.array(struct_b_rows, type=struct_type),
    }
    return pa.table(columns, schema=_same_leaf_schema())
|
||||
|
||||
|
||||
def _nested_vector_schema() -> pa.Schema:
    """Build the schema for the nested vector-index tests.

    Columns are an int32 ``id``, an ``image`` struct holding a fixed-size
    float32 list ``embedding`` of length ``DIM``, and a ``MetaData`` struct
    holding an int32 ``userId``.
    """
    embedding = pa.field("embedding", pa.list_(pa.float32(), DIM))
    user_id = pa.field("userId", pa.int32())
    fields = [
        pa.field("id", pa.int32()),
        pa.field("image", pa.struct([embedding])),
        pa.field("MetaData", pa.struct([user_id])),
    ]
    return pa.schema(fields)
|
||||
|
||||
|
||||
def _nested_vector_data(nrows: int = NROWS) -> pa.Table:
    """Build ``nrows`` rows for ``_nested_vector_schema()``.

    Row ``i`` has ``id == i``, ``image.embedding == _vec(i)`` and
    ``MetaData.userId == i``.
    """
    ids = list(range(nrows))
    image_struct = pa.struct([pa.field("embedding", pa.list_(pa.float32(), DIM))])
    meta_struct = pa.struct([pa.field("userId", pa.int32())])
    columns = {
        "id": pa.array(ids, pa.int32()),
        "image": pa.array(
            [{"embedding": _vec(i)} for i in ids],
            type=image_struct,
        ),
        "MetaData": pa.array(
            [{"userId": i} for i in ids],
            type=meta_struct,
        ),
    }
    return pa.table(columns, schema=_nested_vector_schema())
|
||||
|
||||
|
||||
def _nested_fts_schema() -> pa.Schema:
    """Build the schema for the nested full-text-search tests.

    Columns are an int32 ``id``, a ``payload`` struct holding a utf8 ``text``
    leaf, and a ``MetaData`` struct holding an int32 ``userId``.
    """
    text_leaf = pa.field("text", pa.utf8())
    user_id = pa.field("userId", pa.int32())
    fields = [
        pa.field("id", pa.int32()),
        pa.field("payload", pa.struct([text_leaf])),
        pa.field("MetaData", pa.struct([user_id])),
    ]
    return pa.schema(fields)
|
||||
|
||||
|
||||
def _nested_fts_data(nrows: int = NROWS) -> pa.Table:
    """Build ``nrows`` rows for ``_nested_fts_schema()``.

    ``payload.text`` cycles through five fixed words in row order; ``id`` and
    ``MetaData.userId`` both equal the row number.
    """
    vocabulary = ["alpha", "bravo", "charlie", "delta", "echo"]
    ids = list(range(nrows))
    text_struct = pa.struct([pa.field("text", pa.utf8())])
    meta_struct = pa.struct([pa.field("userId", pa.int32())])
    columns = {
        "id": pa.array(ids, pa.int32()),
        "payload": pa.array(
            [{"text": vocabulary[i % len(vocabulary)]} for i in ids],
            type=text_struct,
        ),
        "MetaData": pa.array(
            [{"userId": i} for i in ids],
            type=meta_struct,
        ),
    }
    return pa.table(columns, schema=_nested_fts_schema())
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _columns_by_name_sync(tbl) -> dict:
|
||||
return {idx.name: idx.columns for idx in tbl.list_indices()}
|
||||
|
||||
|
||||
async def _columns_by_name_async(tbl) -> dict:
|
||||
return {idx.name: idx.columns for idx in await tbl.list_indices()}
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# SYNC TESTS
|
||||
# ===========================================================================
|
||||
#
|
||||
# The sync LanceTable API uses:
|
||||
# - create_scalar_index(column, ...) for scalar (BTree/Bitmap/LabelList) indices
|
||||
# - create_fts_index(column, ...) for full-text-search indices
|
||||
# - create_index(...) for vector indices (older positional API)
|
||||
# ===========================================================================
|
||||
|
||||
|
||||
class TestNestedScalarIndexSync:
    """BTree scalar-index regression cases for the synchronous table API."""

    def test_top_level_camelcase_field(self, sync_db):
        """A camelCase top-level column is listed under its exact name."""
        table = sync_db.create_table("t", _nested_scalar_data())
        table.create_scalar_index("rowId", index_type="BTREE", name="rowid_idx")
        listed = _columns_by_name_sync(table)["rowid_idx"]
        assert listed == ["rowId"], (
            "list_indices must return 'rowId', not a truncated leaf name"
        )

    def test_top_level_hyphenated_field_escaped(self, sync_db):
        """The hyphenated top-level column 'row-id' is listed in escaped form."""
        table = sync_db.create_table("t", _nested_scalar_data())
        table.create_scalar_index(
            "`row-id`", index_type="BTREE", name="rowid_hyph_idx"
        )
        listed = _columns_by_name_sync(table)["rowid_hyph_idx"]
        assert listed == ["`row-id`"], (
            "list_indices must return escaped path '`row-id`'"
        )

    def test_struct_leaf_literal_dot_field_escaped(self, sync_db):
        """A struct leaf named with a literal dot: parent.`leaf.name`.

        The listing has to carry the whole escaped path rather than the leaf
        on its own.
        """
        table = sync_db.create_table("t", _literal_dot_data())
        table.create_scalar_index(
            "parent.`leaf.name`", index_type="BTREE", name="leaf_dot_idx"
        )
        listed = _columns_by_name_sync(table)["leaf_dot_idx"]
        assert listed == ["parent.`leaf.name`"], (
            "list_indices must return 'parent.`leaf.name`', not just '`leaf.name`'"
        )

    def test_nested_mixed_case_path(self, sync_db):
        """The mixed-case nested path MetaData.userId is listed in full."""
        table = sync_db.create_table("t", _nested_scalar_data())
        table.create_scalar_index(
            "MetaData.userId", index_type="BTREE", name="metadata_userid_idx"
        )
        listed = _columns_by_name_sync(table)["metadata_userid_idx"]
        assert listed == ["MetaData.userId"], (
            "list_indices must return 'MetaData.userId', not leaf 'userId'"
        )

    def test_nested_hyphenated_path_escaped(self, sync_db):
        """The path `meta-data`.`user-id` is listed with both segments escaped."""
        table = sync_db.create_table("t", _nested_scalar_data())
        table.create_scalar_index(
            "`meta-data`.`user-id`", index_type="BTREE", name="metauid_idx"
        )
        listed = _columns_by_name_sync(table)["metauid_idx"]
        assert listed == ["`meta-data`.`user-id`"], (
            "list_indices must return '`meta-data`.`user-id`', not 'user-id'"
        )

    def test_filter_on_nested_mixed_case(self, sync_db):
        """A WHERE filter on the indexed nested path returns the single match."""
        table = sync_db.create_table("t", _nested_scalar_data())
        table.create_scalar_index(
            "MetaData.userId", index_type="BTREE", name="metadata_userid_idx"
        )
        matches = table.search().where("MetaData.userId = 5").to_list()
        assert len(matches) == 1
        assert matches[0]["MetaData"]["userId"] == 5

    def test_append_and_list_indices_stable(self, sync_db):
        """Appending rows leaves the listed index path untouched."""
        table = sync_db.create_table("t", _nested_scalar_data())
        table.create_scalar_index(
            "MetaData.userId", index_type="BTREE", name="meta_uid_idx"
        )
        table.add(_nested_scalar_data(nrows=4))
        listed = _columns_by_name_sync(table)["meta_uid_idx"]
        assert listed == ["MetaData.userId"]

    def test_optimize_and_list_indices_stable(self, tmp_path):
        """The full path is still listed after an append followed by optimize."""
        database = lancedb.connect(tmp_path / "opt_db")
        table = database.create_table("t", _nested_scalar_data())
        table.create_scalar_index(
            "MetaData.userId", index_type="BTREE", name="meta_uid_idx"
        )
        table.add(_nested_scalar_data(nrows=4))
        table.optimize()
        listed = _columns_by_name_sync(table)["meta_uid_idx"]
        assert listed == ["MetaData.userId"]

    def test_same_name_leaves_are_distinct(self, sync_db):
        """Indices on two structs sharing a leaf name keep distinct paths."""
        table = sync_db.create_table("same_leaf", _same_leaf_data())
        expected = {
            "a_userid_idx": "StructA.userId",
            "b_userid_idx": "StructB.userId",
        }
        for index_name, path in expected.items():
            table.create_scalar_index(path, index_type="BTREE", name=index_name)
        listed = _columns_by_name_sync(table)
        for index_name, path in expected.items():
            assert listed[index_name] == [path]

    def test_index_stats_canonical_path(self, sync_db):
        """index_stats on a nested-field index reports BTREE and every row."""
        table = sync_db.create_table("t", _nested_scalar_data())
        table.create_scalar_index(
            "MetaData.userId", index_type="BTREE", name="meta_uid_idx"
        )
        stats = table.index_stats("meta_uid_idx")
        assert stats is not None
        assert stats.index_type == "BTREE"
        assert stats.num_indexed_rows == NROWS
|
||||
|
||||
|
||||
class TestNestedVectorIndexSync:
    """Sync regression matrix for nested vector (IvfPq) indices."""

    @staticmethod
    def _index_embedding(tbl) -> None:
        """Create the ``image_emb_idx`` index on ``image.embedding``.

        Every test in this class starts from the same index, so the call is
        kept in one place. Partition and sub-vector counts stay at 2 because
        the fixture tables are small.
        """
        tbl.create_index(
            num_partitions=2,
            num_sub_vectors=2,
            vector_column_name="image.embedding",
            name="image_emb_idx",
        )

    def test_nested_vector_index_full_path(self, sync_db):
        """Listing after vector index creation must use the full dotted path."""
        tbl = sync_db.create_table("vt", _nested_vector_data())
        self._index_embedding(tbl)
        col_map = _columns_by_name_sync(tbl)
        assert col_map["image_emb_idx"] == ["image.embedding"], (
            "list_indices must return 'image.embedding', not leaf 'embedding'"
        )

    def test_nested_vector_search(self, sync_db):
        """Vector search on nested embedding field must return results."""
        tbl = sync_db.create_table("vt", _nested_vector_data())
        self._index_embedding(tbl)
        results = (
            tbl.search(_vec(0), vector_column_name="image.embedding")
            .limit(5)
            .to_list()
        )
        assert len(results) > 0

    def test_nested_vector_index_stats(self, sync_db):
        """index_stats for a nested vector index reports type and row count."""
        tbl = sync_db.create_table("vt", _nested_vector_data())
        self._index_embedding(tbl)
        stats = tbl.index_stats("image_emb_idx")
        assert stats is not None
        # Mirror the scalar test, which also pins index_type; create_index is
        # called without an explicit index type, and IVF_PQ is expected there.
        assert stats.index_type == "IVF_PQ"
        assert stats.num_indexed_rows == NROWS

    def test_nested_vector_append_optimize(self, tmp_path):
        """After append and optimize the vector index listing must be stable."""
        db = lancedb.connect(tmp_path / "vec_opt_db")
        tbl = db.create_table("vt", _nested_vector_data())
        self._index_embedding(tbl)
        tbl.add(_nested_vector_data(nrows=4))
        tbl.optimize()
        col_map = _columns_by_name_sync(tbl)
        assert col_map["image_emb_idx"] == ["image.embedding"]
|
||||
|
||||
|
||||
class TestNestedFTSIndexSync:
    """Full-text-search index regression cases for the synchronous table API."""

    def test_nested_fts_index_full_path(self, sync_db):
        """An FTS index on payload.text is listed under the full path."""
        table = sync_db.create_table("ft", _nested_fts_data())
        table.create_fts_index("payload.text", name="payload_text_idx")
        listed = _columns_by_name_sync(table)["payload_text_idx"]
        assert listed == ["payload.text"], (
            "list_indices must return 'payload.text', not leaf 'text'"
        )

    def test_nested_fts_search(self, sync_db):
        """Searching the nested text column for 'alpha' returns only 'alpha' rows."""
        table = sync_db.create_table("ft", _nested_fts_data())
        table.create_fts_index("payload.text", name="payload_text_idx")
        query = table.search("alpha", query_type="fts", fts_columns="payload.text")
        hits = query.limit(10).to_list()
        assert len(hits) > 0
        for row in hits:
            assert row["payload"]["text"] == "alpha"

    def test_nested_fts_append_optimize(self, tmp_path):
        """The FTS index keeps its full path after append and optimize."""
        database = lancedb.connect(tmp_path / "fts_opt_db")
        table = database.create_table("ft", _nested_fts_data())
        table.create_fts_index("payload.text", name="payload_text_idx")
        table.add(_nested_fts_data(nrows=4))
        table.optimize()
        listed = _columns_by_name_sync(table)["payload_text_idx"]
        assert listed == ["payload.text"]
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# ASYNC TESTS
|
||||
# ===========================================================================
|
||||
#
|
||||
# The async AsyncTable API uses create_index(column, config=...) uniformly
|
||||
# for scalar, vector, and FTS indices.
|
||||
# ===========================================================================
|
||||
|
||||
|
||||
class TestNestedScalarIndexAsync:
    """BTree scalar-index regression cases for the asynchronous table API."""

    @pytest.mark.asyncio
    async def test_top_level_camelcase_field(self, async_db):
        """A camelCase top-level column is listed under its exact name."""
        table = await async_db.create_table("t", _nested_scalar_data())
        await table.create_index("rowId", config=BTree(), name="rowid_idx")
        listed = await _columns_by_name_async(table)
        assert listed["rowid_idx"] == ["rowId"]

    @pytest.mark.asyncio
    async def test_top_level_hyphenated_field_escaped(self, async_db):
        """The hyphenated top-level column 'row-id' is listed in escaped form."""
        table = await async_db.create_table("t", _nested_scalar_data())
        await table.create_index(
            "`row-id`", config=BTree(), name="rowid_hyph_idx"
        )
        listed = await _columns_by_name_async(table)
        assert listed["rowid_hyph_idx"] == ["`row-id`"]

    @pytest.mark.asyncio
    async def test_struct_leaf_literal_dot_field_escaped(self, async_db):
        """A struct leaf named with a literal dot: parent.`leaf.name`."""
        table = await async_db.create_table("t", _literal_dot_data())
        await table.create_index(
            "parent.`leaf.name`", config=BTree(), name="leaf_dot_idx"
        )
        listed = await _columns_by_name_async(table)
        assert listed["leaf_dot_idx"] == ["parent.`leaf.name`"]

    @pytest.mark.asyncio
    async def test_nested_mixed_case_path(self, async_db):
        """The mixed-case nested path MetaData.userId is listed in full."""
        table = await async_db.create_table("t", _nested_scalar_data())
        await table.create_index(
            "MetaData.userId", config=BTree(), name="metadata_userid_idx"
        )
        listed = await _columns_by_name_async(table)
        assert listed["metadata_userid_idx"] == ["MetaData.userId"]

    @pytest.mark.asyncio
    async def test_nested_hyphenated_path_escaped(self, async_db):
        """The path `meta-data`.`user-id` is listed with both segments escaped."""
        table = await async_db.create_table("t", _nested_scalar_data())
        await table.create_index(
            "`meta-data`.`user-id`", config=BTree(), name="metauid_idx"
        )
        listed = await _columns_by_name_async(table)
        assert listed["metauid_idx"] == ["`meta-data`.`user-id`"]

    @pytest.mark.asyncio
    async def test_filter_on_nested_mixed_case(self, async_db):
        """A WHERE filter on the indexed nested path returns the single match."""
        table = await async_db.create_table("t", _nested_scalar_data())
        await table.create_index(
            "MetaData.userId", config=BTree(), name="metadata_userid_idx"
        )
        matches = await table.query().where("MetaData.userId = 5").to_list()
        assert len(matches) == 1
        assert matches[0]["MetaData"]["userId"] == 5

    @pytest.mark.asyncio
    async def test_index_stats_canonical_path(self, async_db):
        """index_stats on a nested-field index reports BTREE and every row."""
        table = await async_db.create_table("t", _nested_scalar_data())
        await table.create_index(
            "MetaData.userId", config=BTree(), name="meta_uid_idx"
        )
        stats = await table.index_stats("meta_uid_idx")
        assert stats is not None
        assert stats.index_type == "BTREE"
        assert stats.num_indexed_rows == NROWS

    @pytest.mark.asyncio
    async def test_append_and_list_indices_stable(self, async_db):
        """Appending rows leaves the listed index path untouched."""
        table = await async_db.create_table("t", _nested_scalar_data())
        await table.create_index(
            "MetaData.userId", config=BTree(), name="meta_uid_idx"
        )
        await table.add(_nested_scalar_data(nrows=4))
        listed = await _columns_by_name_async(table)
        assert listed["meta_uid_idx"] == ["MetaData.userId"]

    @pytest.mark.asyncio
    async def test_optimize_and_list_indices_stable(self, tmp_path):
        """The full path is still listed after an append followed by optimize."""
        database = await lancedb.connect_async(
            tmp_path / "opt_db", read_consistency_interval=timedelta(seconds=0)
        )
        table = await database.create_table("t", _nested_scalar_data())
        await table.create_index(
            "MetaData.userId", config=BTree(), name="meta_uid_idx"
        )
        await table.add(_nested_scalar_data(nrows=4))
        await table.optimize()
        listed = await _columns_by_name_async(table)
        assert listed["meta_uid_idx"] == ["MetaData.userId"]

    @pytest.mark.asyncio
    async def test_same_name_leaves_are_distinct(self, async_db):
        """Indices on two structs sharing a leaf name keep distinct paths."""
        table = await async_db.create_table("same_leaf", _same_leaf_data())
        expected = {
            "a_userid_idx": "StructA.userId",
            "b_userid_idx": "StructB.userId",
        }
        for index_name, path in expected.items():
            await table.create_index(path, config=BTree(), name=index_name)
        listed = await _columns_by_name_async(table)
        for index_name, path in expected.items():
            assert listed[index_name] == [path]
|
||||
|
||||
|
||||
class TestNestedVectorIndexAsync:
    """Async regression matrix for nested vector (IvfPq) indices."""

    @staticmethod
    async def _index_embedding(tbl) -> None:
        """Create the ``image_emb_idx`` IvfPq index on ``image.embedding``.

        Every test in this class starts from the same index, so the call is
        kept in one place. Partition and sub-vector counts stay at 2 because
        the fixture tables are small.
        """
        await tbl.create_index(
            "image.embedding",
            config=IvfPq(num_partitions=2, num_sub_vectors=2),
            name="image_emb_idx",
        )

    @pytest.mark.asyncio
    async def test_nested_vector_index_full_path(self, async_db):
        """Listing after vector index creation must use the full dotted path."""
        tbl = await async_db.create_table("vt", _nested_vector_data())
        await self._index_embedding(tbl)
        col_map = await _columns_by_name_async(tbl)
        assert col_map["image_emb_idx"] == ["image.embedding"]

    @pytest.mark.asyncio
    async def test_nested_vector_search(self, async_db):
        """Vector search on nested embedding field must return results."""
        tbl = await async_db.create_table("vt", _nested_vector_data())
        await self._index_embedding(tbl)
        results = (
            await tbl.query()
            .nearest_to(_vec(0))
            .column("image.embedding")
            .limit(5)
            .to_list()
        )
        assert len(results) > 0

    @pytest.mark.asyncio
    async def test_nested_vector_index_stats(self, async_db):
        """index_stats for a nested vector index reports type and row count."""
        tbl = await async_db.create_table("vt", _nested_vector_data())
        await self._index_embedding(tbl)
        stats = await tbl.index_stats("image_emb_idx")
        assert stats is not None
        # Mirror the scalar test, which also pins index_type; the index is
        # built from an IvfPq config, so IVF_PQ is the expected type.
        assert stats.index_type == "IVF_PQ"
        assert stats.num_indexed_rows == NROWS

    @pytest.mark.asyncio
    async def test_nested_vector_append_optimize(self, tmp_path):
        """After append and optimize the vector index listing must be stable."""
        db = await lancedb.connect_async(
            tmp_path / "vec_opt_db", read_consistency_interval=timedelta(seconds=0)
        )
        tbl = await db.create_table("vt", _nested_vector_data())
        await self._index_embedding(tbl)
        await tbl.add(_nested_vector_data(nrows=4))
        await tbl.optimize()
        col_map = await _columns_by_name_async(tbl)
        assert col_map["image_emb_idx"] == ["image.embedding"]
|
||||
|
||||
|
||||
class TestNestedFTSIndexAsync:
    """Full-text-search index regression cases for the asynchronous table API."""

    @pytest.mark.asyncio
    async def test_nested_fts_index_full_path(self, async_db):
        """An FTS index on payload.text is listed under the full path."""
        table = await async_db.create_table("ft", _nested_fts_data())
        await table.create_index(
            "payload.text", config=FTS(), name="payload_text_idx"
        )
        listed = await _columns_by_name_async(table)
        assert listed["payload_text_idx"] == ["payload.text"]

    @pytest.mark.asyncio
    async def test_nested_fts_search(self, async_db):
        """Searching the nested text column for 'alpha' returns only 'alpha' rows."""
        table = await async_db.create_table("ft", _nested_fts_data())
        await table.create_index(
            "payload.text", config=FTS(), name="payload_text_idx"
        )
        query = table.query().nearest_to_text("alpha", columns="payload.text")
        hits = await query.limit(10).to_list()
        assert len(hits) > 0
        for row in hits:
            assert row["payload"]["text"] == "alpha"

    @pytest.mark.asyncio
    async def test_nested_fts_append_optimize(self, tmp_path):
        """The FTS index keeps its full path after append and optimize."""
        database = await lancedb.connect_async(
            tmp_path / "fts_opt_db", read_consistency_interval=timedelta(seconds=0)
        )
        table = await database.create_table("ft", _nested_fts_data())
        await table.create_index(
            "payload.text", config=FTS(), name="payload_text_idx"
        )
        await table.add(_nested_fts_data(nrows=4))
        await table.optimize()
        listed = await _columns_by_name_async(table)
        assert listed["payload_text_idx"] == ["payload.text"]
|
||||
Reference in New Issue
Block a user