mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-25 14:29:56 +00:00
<!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit - **New Features** - Added the ability to prewarm (load into memory) table indexes via new methods in Python, Node.js, and Rust APIs, potentially reducing cold-start query latency. - **Bug Fixes** - Ensured prewarming an index does not interfere with subsequent search operations. - **Tests** - Introduced new test cases to verify full-text search index creation, prewarming, and search functionalities in both Python and Node.js. - **Chores** - Updated dependencies for improved compatibility and performance. <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Co-authored-by: Lu Qiu <luqiujob@gmail.com>
220 lines
7.4 KiB
Python
220 lines
7.4 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
|
|
|
from datetime import timedelta
|
|
import random
|
|
|
|
import pyarrow as pa
|
|
import pytest
|
|
import pytest_asyncio
|
|
from lancedb import AsyncConnection, AsyncTable, connect_async
|
|
from lancedb.index import BTree, IvfFlat, IvfPq, Bitmap, LabelList, HnswPq, HnswSq, FTS
|
|
|
|
|
|
@pytest_asyncio.fixture
async def db_async(tmp_path) -> AsyncConnection:
    """Open an async connection located at the ``tmp_path`` fixture directory.

    The connection is opened with a read consistency interval of zero seconds.
    """
    zero_interval = timedelta(seconds=0)
    connection = await connect_async(
        tmp_path,
        read_consistency_interval=zero_interval,
    )
    return connection
|
|
|
|
|
|
def sample_fixed_size_list_array(nrows, dim):
    """Build a float32 fixed-size-list array holding ``nrows`` lists of width ``dim``.

    The flattened values are the consecutive floats 0.0, 1.0, 2.0, ... so the
    result is fully determined by the two arguments.
    """
    total_values = dim * nrows
    flat_values = pa.array(
        [float(position) for position in range(total_values)],
        pa.float32(),
    )
    return pa.FixedSizeListArray.from_arrays(flat_values, dim)
|
|
|
|
|
|
# Width of each entry in the "vector" column built by the `some_table` fixture.
DIM = 8
# Number of rows generated for both the `some_table` and `binary_table` fixtures.
NROWS = 256
|
|
|
|
|
|
@pytest_asyncio.fixture
async def some_table(db_async):
    """Create the ``some_table`` table with ``NROWS`` rows used by the index tests.

    Columns:
      * ``id``     -- the integers ``0 .. NROWS - 1``
      * ``vector`` -- fixed-size float32 lists of width ``DIM``
      * ``fsb``    -- one-byte fixed-size binary values, ``bytes([i])`` for row ``i``
      * ``tags``   -- two labels per row, each drawn at random from tag0..tag8
    """
    # Columns are built in the same order they appear in the table.
    row_ids = list(range(NROWS))
    vectors = sample_fixed_size_list_array(NROWS, DIM)
    fsb_values = pa.array([bytes([i]) for i in range(NROWS)], pa.binary(1))
    tag_lists = [
        [f"tag{random.randint(0, 8)}" for _ in range(2)] for _ in range(NROWS)
    ]

    columns = {
        "id": row_ids,
        "vector": vectors,
        "fsb": fsb_values,
        "tags": tag_lists,
    }
    data = pa.Table.from_pydict(columns)
    return await db_async.create_table("some_table", data)
|
|
|
|
|
|
@pytest_asyncio.fixture
async def binary_table(db_async):
    """Create the ``binary_table`` table: ``NROWS`` rows of 128-wide uint8 vectors.

    Row ``i`` has ``id == i`` and a vector made of the value ``i`` repeated
    128 times.
    """
    schema = pa.schema(
        [
            pa.field("id", pa.int64()),
            pa.field("vector", pa.list_(pa.uint8(), 128)),
        ]
    )

    rows = []
    for i in range(NROWS):
        rows.append({"id": i, "vector": [i] * 128})

    return await db_async.create_table("binary_table", rows, schema=schema)
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_create_scalar_index(some_table: AsyncTable):
    """Cover create, replace, duplicate rejection and drop for an index on ``id``."""
    column = "id"

    # Initial creation without an explicit config.
    await some_table.create_index(column)
    # With replace=True the index can be built again.
    await some_table.create_index(column, replace=True)

    listed = await some_table.list_indices()
    assert str(listed) == '[Index(BTree, columns=["id"], name="id_idx")]'
    assert len(listed) == 1
    only_index = listed[0]
    assert only_index.index_type == "BTree"
    assert only_index.columns == ["id"]

    # With replace=False a second creation must be rejected.
    with pytest.raises(RuntimeError, match="already exists"):
        await some_table.create_index(column, replace=False)

    # The index type may also be given explicitly.
    await some_table.create_index(column, config=BTree())

    await some_table.drop_index("id_idx")
    remaining = await some_table.list_indices()
    assert len(remaining) == 0
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_create_fixed_size_binary_index(some_table: AsyncTable):
    """Create a BTree index on the fixed-size binary ``fsb`` column and list it."""
    btree_config = BTree()
    await some_table.create_index("fsb", config=btree_config)

    listed = await some_table.list_indices()
    assert str(listed) == '[Index(BTree, columns=["fsb"], name="fsb_idx")]'
    assert len(listed) == 1

    only_index = listed[0]
    assert only_index.index_type == "BTree"
    assert only_index.columns == ["fsb"]
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_create_bitmap_index(some_table: AsyncTable):
    """Create a Bitmap index on ``id`` and verify its listing and statistics."""
    await some_table.create_index("id", config=Bitmap())

    # The listing is fetched once and reused for every check below; nothing
    # modifies the table between these assertions.
    indices = await some_table.list_indices()
    assert str(indices) == '[Index(Bitmap, columns=["id"], name="id_idx")]'
    assert len(indices) == 1

    # Look the statistics up by the name reported in the listing.
    index_name = indices[0].name
    stats = await some_table.index_stats(index_name)
    assert stats.index_type == "BITMAP"
    assert stats.distance_type is None
    assert stats.num_indexed_rows == await some_table.count_rows()
    assert stats.num_unindexed_rows == 0
    assert stats.num_indices == 1
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_create_label_list_index(some_table: AsyncTable):
    """Create a LabelList index on ``tags`` and check how it is listed."""
    label_list_config = LabelList()
    await some_table.create_index("tags", config=label_list_config)

    listed = await some_table.list_indices()
    expected_listing = '[Index(LabelList, columns=["tags"], name="tags_idx")]'
    assert str(listed) == expected_listing
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_full_text_search_index(some_table: AsyncTable):
    """Create an FTS index on ``tags``, prewarm it, then search through it."""
    fts_config = FTS(with_position=False)
    await some_table.create_index("tags", config=fts_config)

    listed = await some_table.list_indices()
    assert str(listed) == '[Index(FTS, columns=["tags"], name="tags_idx")]'

    # Prewarm first; the search afterwards must still return rows.
    await some_table.prewarm_index("tags_idx")

    query = await some_table.search("tag0")
    hits = await query.to_arrow()
    assert hits.num_rows > 0
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_create_vector_index(some_table: AsyncTable):
    """Cover create, replace and duplicate rejection for an index on ``vector``.

    The index is finally rebuilt with an explicit ``IvfPq`` config, after which
    its listing entry and statistics are checked.
    """
    column = "vector"

    # Initial creation without an explicit config.
    await some_table.create_index(column)
    # With replace=True the index can be built again.
    await some_table.create_index(column, replace=True)
    # With replace=False a second creation must be rejected.
    with pytest.raises(RuntimeError, match="already exists"):
        await some_table.create_index(column, replace=False)
    # The index type may also be given explicitly.
    ivf_pq_config = IvfPq(num_partitions=100)
    await some_table.create_index(column, config=ivf_pq_config)

    listed = await some_table.list_indices()
    assert len(listed) == 1
    only_index = listed[0]
    assert only_index.index_type == "IvfPq"
    assert only_index.columns == ["vector"]
    assert only_index.name == "vector_idx"

    index_stats = await some_table.index_stats("vector_idx")
    assert index_stats.index_type == "IVF_PQ"
    assert index_stats.distance_type == "l2"
    assert index_stats.num_indexed_rows == await some_table.count_rows()
    assert index_stats.num_unindexed_rows == 0
    assert index_stats.num_indices == 1
    assert index_stats.loss >= 0.0
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_create_4bit_ivfpq_index(some_table: AsyncTable):
    """Build an ``IvfPq(num_bits=4)`` index on ``vector`` and check its stats."""
    column = "vector"

    # Initial creation with the 4-bit config.
    await some_table.create_index(column, config=IvfPq(num_bits=4))
    # With replace=True the index can be built again.
    await some_table.create_index(column, config=IvfPq(num_bits=4), replace=True)
    # With replace=False a second creation must be rejected.
    with pytest.raises(RuntimeError, match="already exists"):
        await some_table.create_index(column, replace=False)

    listed = await some_table.list_indices()
    assert len(listed) == 1
    only_index = listed[0]
    assert only_index.index_type == "IvfPq"
    assert only_index.columns == ["vector"]
    assert only_index.name == "vector_idx"

    index_stats = await some_table.index_stats("vector_idx")
    assert index_stats.index_type == "IVF_PQ"
    assert index_stats.distance_type == "l2"
    assert index_stats.num_indexed_rows == await some_table.count_rows()
    assert index_stats.num_unindexed_rows == 0
    assert index_stats.num_indices == 1
    assert index_stats.loss >= 0.0
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_create_hnswpq_index(some_table: AsyncTable):
    """Create an ``HnswPq`` index on ``vector`` and expect exactly one index."""
    hnsw_pq_config = HnswPq(num_partitions=10)
    await some_table.create_index("vector", config=hnsw_pq_config)

    listed = await some_table.list_indices()
    assert len(listed) == 1
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_create_hnswsq_index(some_table: AsyncTable):
    """Create an ``HnswSq`` index on ``vector`` and expect exactly one index."""
    hnsw_sq_config = HnswSq(num_partitions=10)
    await some_table.create_index("vector", config=hnsw_sq_config)

    listed = await some_table.list_indices()
    assert len(listed) == 1
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_create_index_with_binary_vectors(binary_table: AsyncTable):
    """Build a hamming-distance ``IvfFlat`` index on uint8 vectors and query it.

    After checking the listing entry and statistics, every constant vector
    ``[v] * 128`` for ``v`` in 0..255 is used as a query, and the first result
    is expected to be the row whose ``id`` equals ``v``.
    """
    ivf_flat_config = IvfFlat(distance_type="hamming", num_partitions=10)
    await binary_table.create_index("vector", config=ivf_flat_config)

    listed = await binary_table.list_indices()
    assert len(listed) == 1
    only_index = listed[0]
    assert only_index.index_type == "IvfFlat"
    assert only_index.columns == ["vector"]
    assert only_index.name == "vector_idx"

    index_stats = await binary_table.index_stats("vector_idx")
    assert index_stats.index_type == "IVF_FLAT"
    assert index_stats.distance_type == "hamming"
    assert index_stats.num_indexed_rows == await binary_table.count_rows()
    assert index_stats.num_unindexed_rows == 0
    assert index_stats.num_indices == 1

    # The table holds one constant vector for every value from 0 to 255.
    for value in range(256):
        probe = [value] * 128
        nearest = await binary_table.query().nearest_to(probe).to_arrow()
        top_id = nearest["id"][0].as_py()
        assert top_id == value
|