feat: create bitmap and label list scalar index using python async api (#1529)

* Expose `bitmap` and `LabelList` scalar index type via Rust and Async
Python API
* Add documents
This commit is contained in:
Lei Xu
2024-08-11 09:16:11 -07:00
committed by GitHub
parent 613f3063b9
commit b2317c904d
9 changed files with 384 additions and 26 deletions

View File

@@ -113,6 +113,10 @@ lists the indices that LanceDb supports.
::: lancedb.index.BTree
::: lancedb.index.Bitmap
::: lancedb.index.LabelList
::: lancedb.index.IvfPq
## Querying (Asynchronous)

View File

@@ -8,7 +8,7 @@ from ._lancedb import (
)
class BTree(object):
class BTree:
"""Describes a btree index configuration
A btree index is an index on scalar columns. The index stores a copy of the
@@ -22,7 +22,8 @@ class BTree(object):
sizeof(Scalar) * 4096 bytes to find the correct row ids.
This index is good for scalar columns with mostly distinct values and does best
when the query is highly selective.
when the query is highly selective. It works with numeric, temporal, and string
columns.
The btree index does not currently have any parameters though parameters such as
the block size may be added in the future.
@@ -32,7 +33,44 @@ class BTree(object):
self._inner = LanceDbIndex.btree()
class IvfPq(object):
class Bitmap:
"""Describe a Bitmap index configuration.
A `Bitmap` index stores one bitmap per distinct value in the column; each bitmap
marks the rows in which that value is present.
This index works best for low-cardinality numeric or string columns,
where the number of unique values is small (i.e., fewer than a few thousand).
`Bitmap` index can accelerate the following filters:
- `<`, `<=`, `=`, `>`, `>=`
- `IN (value1, value2, ...)`
- `between (value1, value2)`
- `is null`
For example, a bitmap index on a table with 1Bi rows and 128 distinct values
requires 128 / 8 * 1Bi bytes on disk.
"""
def __init__(self):
self._inner = LanceDbIndex.bitmap()
class LabelList:
"""Describe a LabelList index configuration.
`LabelList` is a scalar index that can be used on `List<T>` columns to
support queries with `array_contains_all` and `array_contains_any`
using an underlying bitmap index.
For example, it works with `tags`, `categories`, `keywords`, etc.
"""
def __init__(self):
self._inner = LanceDbIndex.label_list()
class IvfPq:
"""Describes an IVF PQ Index
This index stores a compressed (quantized) copy of every vector. These vectors

View File

@@ -1,15 +1,5 @@
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
from __future__ import annotations
@@ -61,7 +51,7 @@ if TYPE_CHECKING:
from lance.dataset import CleanupStats, ReaderLike
from ._lancedb import Table as LanceDBTable, OptimizeStats
from .db import LanceDBConnection
from .index import BTree, IndexConfig, IvfPq
from .index import BTree, IndexConfig, IvfPq, Bitmap, LabelList
pd = safe_import_pandas()
@@ -2123,7 +2113,7 @@ class AsyncTable:
column: str,
*,
replace: Optional[bool] = None,
config: Optional[Union[IvfPq, BTree]] = None,
config: Optional[Union[IvfPq, BTree, Bitmap, LabelList]] = None,
):
"""Create an index to speed up queries

View File

@@ -1,10 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
from datetime import timedelta
import random
import pyarrow as pa
import pytest
import pytest_asyncio
from lancedb import AsyncConnection, AsyncTable, connect_async
from lancedb.index import BTree, IvfPq
from lancedb.index import BTree, IvfPq, Bitmap, LabelList
@pytest_asyncio.fixture
@@ -25,8 +29,11 @@ NROWS = 256
async def some_table(db_async):
data = pa.Table.from_pydict(
{
"id": list(range(256)),
"id": list(range(NROWS)),
"vector": sample_fixed_size_list_array(NROWS, DIM),
"tags": [
[f"tag{random.randint(0, 8)}" for _ in range(2)] for _ in range(NROWS)
],
}
)
return await db_async.create_table(
@@ -53,6 +60,22 @@ async def test_create_scalar_index(some_table: AsyncTable):
await some_table.create_index("id", config=BTree())
@pytest.mark.asyncio
async def test_create_bitmap_index(some_table: AsyncTable):
await some_table.create_index("id", config=Bitmap())
# TODO: Fix via https://github.com/lancedb/lance/issues/2039
# indices = await some_table.list_indices()
# assert str(indices) == '[Index(Bitmap, columns=["id"])]'
@pytest.mark.asyncio
async def test_create_label_list_index(some_table: AsyncTable):
await some_table.create_index("tags", config=LabelList())
# TODO: Fix via https://github.com/lancedb/lance/issues/2039
# indices = await some_table.list_indices()
# assert str(indices) == '[Index(LabelList, columns=["tags"])]'
@pytest.mark.asyncio
async def test_create_vector_index(some_table: AsyncTable):
# Can create

View File

@@ -84,6 +84,20 @@ impl Index {
inner: Mutex::new(Some(LanceDbIndex::BTree(BTreeIndexBuilder::default()))),
})
}
// Builds an index configuration holding a default-configured Bitmap index.
#[staticmethod]
pub fn bitmap() -> PyResult<Self> {
    let config = LanceDbIndex::Bitmap(Default::default());
    let inner = Mutex::new(Some(config));
    Ok(Self { inner })
}
// Builds an index configuration holding a default-configured LabelList index.
#[staticmethod]
pub fn label_list() -> PyResult<Self> {
    let config = LanceDbIndex::LabelList(Default::default());
    let inner = Mutex::new(Some(config));
    Ok(Self { inner })
}
}
#[pyclass(get_all)]

View File

@@ -21,19 +21,47 @@ use serde_with::skip_serializing_none;
use crate::{table::TableInternal, Result};
use self::{
scalar::BTreeIndexBuilder,
scalar::{BTreeIndexBuilder, BitmapIndexBuilder, LabelListIndexBuilder},
vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder},
};
pub mod scalar;
pub mod vector;
/// Supported index types.
pub enum Index {
    /// Let LanceDB pick the index type for the column instead of naming one explicitly.
    Auto,
    /// A `BTree` index is a sorted index on scalar columns.
    /// This index is good for scalar columns with mostly distinct values and does best when
    /// the query is highly selective. It can apply to numeric, temporal, and string columns.
    ///
    /// A BTree index is useful to answer queries with
    /// equality (`=`), inequality (`>`, `>=`, `<`, `<=`), and range queries.
    ///
    /// This is the default index type for scalar columns.
    BTree(BTreeIndexBuilder),

    /// A `Bitmap` index stores one bitmap per distinct value in the column; each bitmap
    /// marks the rows in which that value is present.
    ///
    /// This index works best for low-cardinality columns,
    /// where the number of unique values is small (i.e., fewer than a few hundred).
    Bitmap(BitmapIndexBuilder),

    /// A `LabelList` index (see [LabelListIndexBuilder]) is a scalar index that can be used
    /// on `List<T>` columns to support queries with `array_contains_all` and
    /// `array_contains_any` using an underlying bitmap index.
    LabelList(LabelListIndexBuilder),

    /// Full text search index using bm25.
    FTS(FtsIndexBuilder),

    /// IVF index with Product Quantization
    IvfPq(IvfPqIndexBuilder),

    /// IVF-HNSW index with Product Quantization
    IvfHnswPq(IvfHnswPqIndexBuilder),

    /// IVF-HNSW index with Scalar Quantization
    IvfHnswSq(IvfHnswSqIndexBuilder),
}
@@ -74,10 +102,14 @@ impl IndexBuilder {
#[derive(Debug, Clone, PartialEq)]
pub enum IndexType {
    // Vector
    /// IVF index with Product Quantization.
    IvfPq,
    /// IVF-HNSW index with Product Quantization.
    IvfHnswPq,
    /// IVF-HNSW index with Scalar Quantization.
    IvfHnswSq,
    // Scalar
    /// Sorted index on scalar columns.
    BTree,
    /// Bitmap index for low-cardinality scalar columns.
    Bitmap,
    /// Bitmap-backed index on `List<T>` columns.
    LabelList,
}
/// A description of an index currently configured on a column

View File

@@ -29,6 +29,25 @@ pub struct BTreeIndexBuilder {}
impl BTreeIndexBuilder {}
/// Builder for a Bitmap index.
///
/// It is a scalar index that stores a bitmap for each possible value.
/// The bitmap stores a list of row ids where the value is present.
///
/// This index works best for low-cardinality columns
/// (i.e., fewer than 1000 unique values).
///
/// The builder currently has no configurable parameters.
#[derive(Debug, Clone, Default)]
pub struct BitmapIndexBuilder {}
/// Builder for a LabelList index.
///
/// [LabelListIndexBuilder] configures a scalar index that can be used on `List<T>`
/// columns to support queries with `array_contains_all` and `array_contains_any`
/// using an underlying bitmap index.
///
/// The builder currently has no configurable parameters.
#[derive(Debug, Clone, Default)]
pub struct LabelListIndexBuilder {}
/// Builder for a full text search index
///
/// A full text search index is an index on a string column that allows for full text search

View File

@@ -13,7 +13,7 @@
// limitations under the License.
//! [LanceDB](https://github.com/lancedb/lancedb) is an open-source database for vector-search built with persistent storage,
//! which greatly simplifies retrevial, filtering and management of embeddings.
//! which greatly simplifies retrieval, filtering and management of embeddings.
//!
//! The key features of LanceDB include:
//! - Production-scale vector search with no servers to manage.
@@ -133,6 +133,13 @@
//!
//! #### Create vector index (IVF_PQ)
//!
//! LanceDB can automatically create appropriate indices based on the data types
//! of the columns. For example,
//!
//! * If a column has a data type of `FixedSizeList<Float16/Float32>`,
//! LanceDB will create an `IVF-PQ` vector index with default parameters.
//! * Otherwise, it creates a `BTree` index by default.
//!
//! ```no_run
//! # use std::sync::Arc;
//! # use arrow_array::{FixedSizeListArray, types::Float32Type, RecordBatch,
@@ -150,7 +157,10 @@
//! # });
//! ```
//!
//! #### Open table and run search
//!
//! Users can also specify the index type explicitly, see [`Table::create_index`].
//!
//! #### Open table and search
//!
//! ```rust
//! # use std::sync::Arc;

View File

@@ -573,7 +573,8 @@ impl Table {
/// There are a variety of indices available. They are described more in
/// [`crate::index::Index`]. The simplest thing to do is to use `index::Index::Auto` which
/// will attempt to create the most useful index based on the column type and column
/// statistics.
/// statistics. A `BTree` index is created by default for numeric, temporal, and
/// string columns.
///
/// Once an index is created it will remain until the data is overwritten (e.g. an
/// add operation with mode overwrite) or the indexed column is dropped.
@@ -607,10 +608,21 @@ impl Table {
/// .await
/// .unwrap();
/// # let tbl = db.open_table("idx_test").execute().await.unwrap();
/// // Create IVF PQ index on the "vector" column by default.
/// tbl.create_index(&["vector"], Index::Auto)
/// .execute()
/// .await
/// .unwrap();
/// // Create a BTree index on the "id" column.
/// tbl.create_index(&["id"], Index::Auto)
/// .execute()
/// .await
/// .unwrap();
/// // Create a LabelList index on the "tags" column.
/// tbl.create_index(&["tags"], Index::LabelList(Default::default()))
/// .execute()
/// .await
/// .unwrap();
/// # });
/// ```
pub fn create_index(&self, columns: &[impl AsRef<str>], index: Index) -> IndexBuilder {
@@ -1054,6 +1066,20 @@ impl NativeTable {
)
}
/// Reports whether a Bitmap index may be built on a column of type `dtype`.
///
/// Accepted types are `Utf8` strings and anything for which
/// `DataType::is_integer` returns true.
fn supported_bitmap_data_type(dtype: &DataType) -> bool {
    if matches!(dtype, DataType::Utf8) {
        return true;
    }
    dtype.is_integer()
}
/// Reports whether a LabelList index may be built on a column of type `dtype`.
///
/// The column must be a `List` or `FixedSizeList` whose element type is itself
/// accepted by `supported_bitmap_data_type`.
fn supported_label_list_data_type(dtype: &DataType) -> bool {
    let element = match dtype {
        DataType::List(item) => item,
        DataType::FixedSizeList(item, _) => item,
        _ => return false,
    };
    Self::supported_bitmap_data_type(element.data_type())
}
fn supported_fts_data_type(dtype: &DataType) -> bool {
matches!(dtype, DataType::Utf8 | DataType::LargeUtf8)
}
@@ -1519,7 +1545,61 @@ impl NativeTable {
dataset
.create_index(
&[field.name()],
IndexType::Scalar,
IndexType::BTree,
None,
&lance_idx_params,
opts.replace,
)
.await?;
Ok(())
}
async fn create_bitmap_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> {
if !Self::supported_bitmap_data_type(field.data_type()) {
return Err(Error::Schema {
message: format!(
"A Bitmap index cannot be created on the field `{}` which has data type {}",
field.name(),
field.data_type()
),
});
}
let mut dataset = self.dataset.get_mut().await?;
let lance_idx_params = lance_index::scalar::ScalarIndexParams {
force_index_type: Some(lance_index::scalar::ScalarIndexType::Bitmap),
};
dataset
.create_index(
&[field.name()],
IndexType::Bitmap,
None,
&lance_idx_params,
opts.replace,
)
.await?;
Ok(())
}
async fn create_label_list_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> {
if !Self::supported_label_list_data_type(field.data_type()) {
return Err(Error::Schema {
message: format!(
"A LabelList index cannot be created on the field `{}` which has data type {}",
field.name(),
field.data_type()
),
});
}
let mut dataset = self.dataset.get_mut().await?;
let lance_idx_params = lance_index::scalar::ScalarIndexParams {
force_index_type: Some(lance_index::scalar::ScalarIndexType::LabelList),
};
dataset
.create_index(
&[field.name()],
IndexType::LabelList,
None,
&lance_idx_params,
opts.replace,
@@ -1690,6 +1770,8 @@ impl TableInternal for NativeTable {
match opts.index {
Index::Auto => self.create_auto_index(field, opts).await,
Index::BTree(_) => self.create_btree_index(field, opts).await,
Index::Bitmap(_) => self.create_bitmap_index(field, opts).await,
Index::LabelList(_) => self.create_label_list_index(field, opts).await,
Index::FTS(_) => self.create_fts_index(field, opts).await,
Index::IvfPq(ivf_pq) => self.create_ivf_pq_index(ivf_pq, field, opts.replace).await,
Index::IvfHnswPq(ivf_hnsw_pq) => {
@@ -2013,6 +2095,7 @@ mod tests {
use std::time::Duration;
use arrow_array::{
builder::{ListBuilder, StringBuilder},
Array, BooleanArray, Date32Array, FixedSizeListArray, Float32Array, Float64Array,
Int32Array, Int64Array, LargeStringArray, RecordBatch, RecordBatchIterator,
RecordBatchReader, StringArray, TimestampMillisecondArray, TimestampNanosecondArray,
@@ -2022,17 +2105,17 @@ mod tests {
use arrow_schema::{DataType, Field, Schema, TimeUnit};
use futures::TryStreamExt;
use lance::dataset::{Dataset, WriteMode};
use lance::index::DatasetIndexInternalExt;
use lance::io::{ObjectStoreParams, WrappingObjectStore};
use rand::Rng;
use tempfile::tempdir;
use super::*;
use crate::connect;
use crate::connection::ConnectBuilder;
use crate::index::scalar::BTreeIndexBuilder;
use crate::query::{ExecutableQuery, QueryBase};
use super::*;
#[tokio::test]
async fn test_open() {
let tmp_dir = tempdir().unwrap();
@@ -2997,6 +3080,151 @@ mod tests {
);
}
// Creates a Bitmap index on a low-cardinality string column and checks that the
// resulting index is reported for that column and opens as a Bitmap index.
#[tokio::test]
async fn test_create_bitmap_index() {
    let tmp_dir = tempdir().unwrap();
    let uri = tmp_dir.path().to_str().unwrap();

    let conn = ConnectBuilder::new(uri).execute().await.unwrap();

    let schema = Arc::new(Schema::new(vec![
        Field::new("id", DataType::Int32, false),
        Field::new("category", DataType::Utf8, true),
    ]));

    // 100 rows; "category" cycles through 5 distinct values.
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![
            Arc::new(Int32Array::from_iter_values(0..100)),
            Arc::new(StringArray::from_iter_values(
                (0..100).map(|i| format!("category_{}", i % 5)),
            )),
        ],
    )
    .unwrap();

    let table = conn
        .create_table(
            "test_bitmap",
            RecordBatchIterator::new(vec![Ok(batch.clone())], batch.schema()),
        )
        .execute()
        .await
        .unwrap();

    // Create bitmap index on the "category" column
    table
        .create_index(&["category"], Index::Bitmap(Default::default()))
        .execute()
        .await
        .unwrap();

    // Verify the index was created
    let index_configs = table.list_indices().await.unwrap();
    assert_eq!(index_configs.len(), 1);
    let index = index_configs.into_iter().next().unwrap();
    // TODO: Fix via https://github.com/lancedb/lance/issues/2039
    // assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
    assert_eq!(index.columns, vec!["category".to_string()]);

    // For now, just open the index to verify its type
    let lance_dataset = table.as_native().unwrap().dataset.get().await.unwrap();
    let indices = lance_dataset
        .load_indices_by_name(&index.name)
        .await
        .unwrap();
    // Only the first metadata entry returned for this index name is inspected.
    let index_meta = &indices[0];
    let idx = lance_dataset
        .open_scalar_index("category", &index_meta.uuid.to_string())
        .await
        .unwrap();
    assert_eq!(idx.index_type(), IndexType::Bitmap);
}
// Creates a LabelList index on a `List<Utf8>` column, after checking that BTree
// and Bitmap indices are rejected for that column, and verifies the resulting
// index opens as a LabelList index.
#[tokio::test]
async fn test_create_label_list_index() {
    let tmp_dir = tempdir().unwrap();
    let uri = tmp_dir.path().to_str().unwrap();

    let conn = ConnectBuilder::new(uri).execute().await.unwrap();

    let schema = Arc::new(Schema::new(vec![
        Field::new("id", DataType::Int32, false),
        Field::new(
            "tags",
            DataType::List(Field::new("item", DataType::Utf8, true).into()),
            true,
        ),
    ]));

    const TAGS: [&str; 3] = ["cat", "dog", "fish"];

    // Build 40 rows, each holding the three tags. The list is closed after the
    // third value of every group (i % 3 == 2) so that no row is short and no
    // appended value is left outside a list.
    let values_builder = StringBuilder::new();
    let mut builder = ListBuilder::new(values_builder);
    for i in 0..120 {
        builder.values().append_value(TAGS[i % 3].to_string());
        if i % 3 == 2 {
            builder.append(true);
        }
    }
    let tags = Arc::new(builder.finish());

    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int32Array::from_iter_values(0..40)), tags],
    )
    .unwrap();

    let table = conn
        .create_table(
            "test_label_list",
            RecordBatchIterator::new(vec![Ok(batch.clone())], batch.schema()),
        )
        .execute()
        .await
        .unwrap();

    // Can not create btree or bitmap index on list column
    assert!(table
        .create_index(&["tags"], Index::BTree(Default::default()))
        .execute()
        .await
        .is_err());
    assert!(table
        .create_index(&["tags"], Index::Bitmap(Default::default()))
        .execute()
        .await
        .is_err());

    // Create LabelList index on the "tags" column
    table
        .create_index(&["tags"], Index::LabelList(Default::default()))
        .execute()
        .await
        .unwrap();

    // Verify the index was created
    let index_configs = table.list_indices().await.unwrap();
    assert_eq!(index_configs.len(), 1);
    let index = index_configs.into_iter().next().unwrap();
    // TODO: Fix via https://github.com/lancedb/lance/issues/2039
    // assert_eq!(index.index_type, crate::index::IndexType::LabelList);
    assert_eq!(index.columns, vec!["tags".to_string()]);

    // For now, just open the index to verify its type
    let lance_dataset = table.as_native().unwrap().dataset.get().await.unwrap();
    let indices = lance_dataset
        .load_indices_by_name(&index.name)
        .await
        .unwrap();
    // Only the first metadata entry returned for this index name is inspected.
    let index_meta = &indices[0];
    let idx = lance_dataset
        .open_scalar_index("tags", &index_meta.uuid.to_string())
        .await
        .unwrap();
    assert_eq!(idx.index_type(), IndexType::LabelList);
}
#[tokio::test]
async fn test_read_consistency_interval() {
let intervals = vec![