feat: create bitmap and label list scalar index using python async api (#1529)

* Expose the `Bitmap` and `LabelList` scalar index types via the Rust and
async Python APIs
* Add documentation
This commit is contained in:
Lei Xu
2024-08-11 09:16:11 -07:00
committed by GitHub
parent 613f3063b9
commit b2317c904d
9 changed files with 384 additions and 26 deletions

View File

@@ -8,7 +8,7 @@ from ._lancedb import (
)
class BTree(object):
class BTree:
"""Describes a btree index configuration
A btree index is an index on scalar columns. The index stores a copy of the
@@ -22,7 +22,8 @@ class BTree(object):
sizeof(Scalar) * 4096 bytes to find the correct row ids.
This index is good for scalar columns with mostly distinct values and does best
when the query is highly selective.
when the query is highly selective. It works with numeric, temporal, and string
columns.
The btree index does not currently have any parameters though parameters such as
the block size may be added in the future.
@@ -32,7 +33,44 @@ class BTree(object):
self._inner = LanceDbIndex.btree()
class IvfPq(object):
class Bitmap:
    """Configuration for a `Bitmap` scalar index.

    A `Bitmap` index keeps one bitmap per distinct value found in the column,
    and each of those bitmaps covers every row.

    It is best suited to low-cardinality numeric or string columns, that is,
    columns whose number of unique values is small (fewer than a few thousand).

    Filters that a `Bitmap` index can accelerate:

    - `<`, `<=`, `=`, `>`, `>=`
    - `IN (value1, value2, ...)`
    - `between (value1, value2)`
    - `is null`

    Sizing example: for a table with 1Bi rows and 128 distinct values in the
    indexed column, the bitmap index requires 128 / 8 * 1Bi bytes on disk.
    """

    def __init__(self):
        # The underlying configuration object is built by LanceDbIndex and
        # kept on the private `_inner` attribute.
        bitmap_config = LanceDbIndex.bitmap()
        self._inner = bitmap_config
class LabelList:
    """Configuration for a `LabelList` scalar index.

    A `LabelList` index is a scalar index for `List<T>` columns. It is backed
    by an underlying bitmap index and supports queries that use
    `array_contains_all` and `array_contains_any`.

    Typical columns it works with are `tags`, `categories`, `keywords`, etc.
    """

    def __init__(self):
        # The underlying configuration object is built by LanceDbIndex and
        # kept on the private `_inner` attribute.
        label_list_config = LanceDbIndex.label_list()
        self._inner = label_list_config
class IvfPq:
"""Describes an IVF PQ Index
This index stores a compressed (quantized) copy of every vector. These vectors

View File

@@ -1,15 +1,5 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
from __future__ import annotations
@@ -61,7 +51,7 @@ if TYPE_CHECKING:
from lance.dataset import CleanupStats, ReaderLike
from ._lancedb import Table as LanceDBTable, OptimizeStats
from .db import LanceDBConnection
from .index import BTree, IndexConfig, IvfPq
from .index import BTree, IndexConfig, IvfPq, Bitmap, LabelList
pd = safe_import_pandas()
@@ -2123,7 +2113,7 @@ class AsyncTable:
column: str,
*,
replace: Optional[bool] = None,
config: Optional[Union[IvfPq, BTree]] = None,
config: Optional[Union[IvfPq, BTree, Bitmap, LabelList]] = None,
):
"""Create an index to speed up queries

View File

@@ -1,10 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
from datetime import timedelta
import random
import pyarrow as pa
import pytest
import pytest_asyncio
from lancedb import AsyncConnection, AsyncTable, connect_async
from lancedb.index import BTree, IvfPq
from lancedb.index import BTree, IvfPq, Bitmap, LabelList
@pytest_asyncio.fixture
@@ -25,8 +29,11 @@ NROWS = 256
async def some_table(db_async):
data = pa.Table.from_pydict(
{
"id": list(range(256)),
"id": list(range(NROWS)),
"vector": sample_fixed_size_list_array(NROWS, DIM),
"tags": [
[f"tag{random.randint(0, 8)}" for _ in range(2)] for _ in range(NROWS)
],
}
)
return await db_async.create_table(
@@ -53,6 +60,22 @@ async def test_create_scalar_index(some_table: AsyncTable):
await some_table.create_index("id", config=BTree())
@pytest.mark.asyncio
async def test_create_bitmap_index(some_table: AsyncTable):
    """Check that a Bitmap index can be created on the `id` column."""
    bitmap_config = Bitmap()
    await some_table.create_index("id", config=bitmap_config)
    # TODO: Fix via https://github.com/lancedb/lance/issues/2039
    # The listing check below stays disabled until that issue is resolved.
    # indices = await some_table.list_indices()
    # assert str(indices) == '[Index(Bitmap, columns=["id"])]'
@pytest.mark.asyncio
async def test_create_label_list_index(some_table: AsyncTable):
    """Check that a LabelList index can be created on the `tags` list column."""
    await some_table.create_index("tags", config=LabelList())
    # TODO: Fix via https://github.com/lancedb/lance/issues/2039
    # The listing check below stays disabled until that issue is resolved.
    # The expected column is "tags", the column indexed above (the original
    # pending assertion named "id", which would fail once enabled).
    # indices = await some_table.list_indices()
    # assert str(indices) == '[Index(LabelList, columns=["tags"])]'
@pytest.mark.asyncio
async def test_create_vector_index(some_table: AsyncTable):
# Can create