diff --git a/docs/src/python/python.md b/docs/src/python/python.md index b6ffeb18..a9db4060 100644 --- a/docs/src/python/python.md +++ b/docs/src/python/python.md @@ -113,6 +113,10 @@ lists the indices that LanceDb supports. ::: lancedb.index.BTree +::: lancedb.index.Bitmap + +::: lancedb.index.LabelList + ::: lancedb.index.IvfPq ## Querying (Asynchronous) diff --git a/python/python/lancedb/index.py b/python/python/lancedb/index.py index 00d51b35..f9dd7900 100644 --- a/python/python/lancedb/index.py +++ b/python/python/lancedb/index.py @@ -8,7 +8,7 @@ from ._lancedb import ( ) -class BTree(object): +class BTree: """Describes a btree index configuration A btree index is an index on scalar columns. The index stores a copy of the @@ -22,7 +22,8 @@ class BTree(object): sizeof(Scalar) * 4096 bytes to find the correct row ids. This index is good for scalar columns with mostly distinct values and does best - when the query is highly selective. + when the query is highly selective. It works with numeric, temporal, and string + columns. The btree index does not currently have any parameters though parameters such as the block size may be added in the future. @@ -32,7 +33,44 @@ class BTree(object): self._inner = LanceDbIndex.btree() -class IvfPq(object): +class Bitmap: + """Describe a Bitmap index configuration. + + A `Bitmap` index stores a bitmap for each distinct value in the column for + every row. + + This index works best for low-cardinality numeric or string columns, + where the number of unique values is small (i.e., less than a few thousands). + `Bitmap` index can accelerate the following filters: + + - `<`, `<=`, `=`, `>`, `>=` + - `IN (value1, value2, ...)` + - `between (value1, value2)` + - `is null` + + For example, a bitmap index with a table with 1Bi rows, and 128 distinct values, + requires 128 / 8 * 1Bi bytes on disk. + """ + + def __init__(self): + self._inner = LanceDbIndex.bitmap() + + +class LabelList: + """Describe a LabelList index configuration. 
+ + `LabelList` is a scalar index that can be used on `List` columns to + support queries with `array_contains_all` and `array_contains_any` + using an underlying bitmap index. + + For example, it works with `tags`, `categories`, `keywords`, etc. + """ + + def __init__(self): + self._inner = LanceDbIndex.label_list() + + +class IvfPq: """Describes an IVF PQ Index This index stores a compressed (quantized) copy of every vector. These vectors diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 4e60eb7d..0edb22ed 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -1,15 +1,5 @@ -# Copyright 2023 LanceDB Developers -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The LanceDB Authors from __future__ import annotations @@ -61,7 +51,7 @@ if TYPE_CHECKING: from lance.dataset import CleanupStats, ReaderLike from ._lancedb import Table as LanceDBTable, OptimizeStats from .db import LanceDBConnection - from .index import BTree, IndexConfig, IvfPq + from .index import BTree, IndexConfig, IvfPq, Bitmap, LabelList pd = safe_import_pandas() @@ -2123,7 +2113,7 @@ class AsyncTable: column: str, *, replace: Optional[bool] = None, - config: Optional[Union[IvfPq, BTree]] = None, + config: Optional[Union[IvfPq, BTree, Bitmap, LabelList]] = None, ): """Create an index to speed up queries diff --git a/python/python/tests/test_index.py b/python/python/tests/test_index.py index 8e5a4053..283ffd27 100644 --- a/python/python/tests/test_index.py +++ b/python/python/tests/test_index.py @@ -1,10 +1,14 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The LanceDB Authors + from datetime import timedelta +import random import pyarrow as pa import pytest import pytest_asyncio from lancedb import AsyncConnection, AsyncTable, connect_async -from lancedb.index import BTree, IvfPq +from lancedb.index import BTree, IvfPq, Bitmap, LabelList @pytest_asyncio.fixture @@ -25,8 +29,11 @@ NROWS = 256 async def some_table(db_async): data = pa.Table.from_pydict( { - "id": list(range(256)), + "id": list(range(NROWS)), "vector": sample_fixed_size_list_array(NROWS, DIM), + "tags": [ + [f"tag{random.randint(0, 8)}" for _ in range(2)] for _ in range(NROWS) + ], } ) return await db_async.create_table( @@ -53,6 +60,22 @@ async def test_create_scalar_index(some_table: AsyncTable): await some_table.create_index("id", config=BTree()) +@pytest.mark.asyncio +async def test_create_bitmap_index(some_table: AsyncTable): + await some_table.create_index("id", config=Bitmap()) + # TODO: Fix via https://github.com/lancedb/lance/issues/2039 + # indices = await 
some_table.list_indices() + # assert str(indices) == '[Index(Bitmap, columns=["id"])]' + + +@pytest.mark.asyncio +async def test_create_label_list_index(some_table: AsyncTable): + await some_table.create_index("tags", config=LabelList()) + # TODO: Fix via https://github.com/lancedb/lance/issues/2039 + # indices = await some_table.list_indices() + # assert str(indices) == '[Index(LabelList, columns=["tags"])]' + + @pytest.mark.asyncio async def test_create_vector_index(some_table: AsyncTable): # Can create diff --git a/python/src/index.rs b/python/src/index.rs index d10524a0..884b2987 100644 --- a/python/src/index.rs +++ b/python/src/index.rs @@ -84,6 +84,20 @@ impl Index { inner: Mutex::new(Some(LanceDbIndex::BTree(BTreeIndexBuilder::default()))), }) } + + #[staticmethod] + pub fn bitmap() -> PyResult { + Ok(Self { + inner: Mutex::new(Some(LanceDbIndex::Bitmap(Default::default()))), + }) + } + + #[staticmethod] + pub fn label_list() -> PyResult { + Ok(Self { + inner: Mutex::new(Some(LanceDbIndex::LabelList(Default::default()))), + }) + } } #[pyclass(get_all)] diff --git a/rust/lancedb/src/index.rs b/rust/lancedb/src/index.rs index 949af0be..4540b427 100644 --- a/rust/lancedb/src/index.rs +++ b/rust/lancedb/src/index.rs @@ -21,19 +21,47 @@ use serde_with::skip_serializing_none; use crate::{table::TableInternal, Result}; use self::{ - scalar::BTreeIndexBuilder, + scalar::{BTreeIndexBuilder, BitmapIndexBuilder, LabelListIndexBuilder}, vector::{IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder}, }; pub mod scalar; pub mod vector; +/// Supported index types. pub enum Index { Auto, + /// A `BTree` index is a sorted index on scalar columns. + /// This index is good for scalar columns with mostly distinct values and does best when + /// the query is highly selective. It can apply to numeric, temporal, and string columns. + /// + /// BTree index is useful to answer queries with + /// equality (`=`), inequality (`>`, `>=`, `<`, `<=`), and range queries. 
+ /// + /// This is the default index type for scalar columns. BTree(BTreeIndexBuilder), + + /// A `Bitmap` index stores a bitmap for each distinct value in the column for every row. + /// + /// This index works best for low-cardinality columns, + /// where the number of unique values is small (i.e., fewer than a few hundred). + Bitmap(BitmapIndexBuilder), + + /// [LabelListIndexBuilder] is a scalar index that can be used on `List` columns to + /// support queries with `array_contains_all` and `array_contains_any` + /// using an underlying bitmap index. + LabelList(LabelListIndexBuilder), + + /// Full text search index using bm25. FTS(FtsIndexBuilder), + + /// IVF index with Product Quantization IvfPq(IvfPqIndexBuilder), + + /// IVF-HNSW index with Product Quantization IvfHnswPq(IvfHnswPqIndexBuilder), + + /// IVF-HNSW index with Scalar Quantization IvfHnswSq(IvfHnswSqIndexBuilder), } @@ -74,10 +102,14 @@ impl IndexBuilder { #[derive(Debug, Clone, PartialEq)] pub enum IndexType { + // Vector IvfPq, IvfHnswPq, IvfHnswSq, + // Scalar BTree, + Bitmap, + LabelList, } /// A description of an index currently configured on a column diff --git a/rust/lancedb/src/index/scalar.rs b/rust/lancedb/src/index/scalar.rs index 9623efe3..cf1e695e 100644 --- a/rust/lancedb/src/index/scalar.rs +++ b/rust/lancedb/src/index/scalar.rs @@ -29,6 +29,25 @@ pub struct BTreeIndexBuilder {} impl BTreeIndexBuilder {} +/// Builder for a Bitmap index. +/// +/// It is a scalar index that stores a bitmap for each possible value. +/// +/// This index works best for low-cardinality (i.e., fewer than 1000 unique values) columns, +/// where the number of unique values is small. +/// The bitmap stores a list of row ids where the value is present. +#[derive(Debug, Clone, Default)] +pub struct BitmapIndexBuilder {} + +/// Builder for a LabelList index. 
+/// +/// [LabelListIndexBuilder] is a scalar index that can be used on `List` columns to +/// support queries with `array_contains_all` and `array_contains_any` +/// using an underlying bitmap index. +/// +#[derive(Debug, Clone, Default)] +pub struct LabelListIndexBuilder {} + /// Builder for a full text search index /// /// A full text search index is an index on a string column that allows for full text search diff --git a/rust/lancedb/src/lib.rs b/rust/lancedb/src/lib.rs index 2b2d9469..0b7775bd 100644 --- a/rust/lancedb/src/lib.rs +++ b/rust/lancedb/src/lib.rs @@ -13,7 +13,7 @@ // limitations under the License. //! [LanceDB](https://github.com/lancedb/lancedb) is an open-source database for vector-search built with persistent storage, -//! which greatly simplifies retrevial, filtering and management of embeddings. +//! which greatly simplifies retrieval, filtering and management of embeddings. //! //! The key features of LanceDB include: //! - Production-scale vector search with no servers to manage. @@ -133,6 +133,13 @@ //! //! #### Create vector index (IVF_PQ) //! +//! LanceDB can automatically create appropriate indices based on the data types +//! of the columns. For example, +//! +//! * If a column has a data type of `FixedSizeList`, +//! LanceDB will create an `IVF-PQ` vector index with default parameters. +//! * Otherwise, it creates a `BTree` index by default. +//! //! ```no_run //! # use std::sync::Arc; //! # use arrow_array::{FixedSizeListArray, types::Float32Type, RecordBatch, @@ -150,7 +157,10 @@ //! # }); //! ``` //! -//! #### Open table and run search +//! +//! Users can also specify the index type explicitly, see [`Table::create_index`]. +//! +//! #### Open table and search //! //! ```rust //! 
# use std::sync::Arc; diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs index d716643c..12254819 100644 --- a/rust/lancedb/src/table.rs +++ b/rust/lancedb/src/table.rs @@ -573,7 +573,8 @@ impl Table { /// There are a variety of indices available. They are described more in /// [`crate::index::Index`]. The simplest thing to do is to use `index::Index::Auto` which /// will attempt to create the most useful index based on the column type and column - /// statistics. + /// statistics. `BTree` index is created by default for numeric, temporal, and + /// string columns. /// /// Once an index is created it will remain until the data is overwritten (e.g. an /// add operation with mode overwrite) or the indexed column is dropped. @@ -607,10 +608,21 @@ impl Table { /// .await /// .unwrap(); /// # let tbl = db.open_table("idx_test").execute().await.unwrap(); + /// // Create IVF PQ index on the "vector" column by default. /// tbl.create_index(&["vector"], Index::Auto) /// .execute() /// .await /// .unwrap(); + /// // Create a BTree index on the "id" column. + /// tbl.create_index(&["id"], Index::Auto) + /// .execute() + /// .await + /// .unwrap(); + /// // Create a LabelList index on the "tags" column. 
+ /// tbl.create_index(&["tags"], Index::LabelList(Default::default())) + /// .execute() + /// .await + /// .unwrap(); /// # }); /// ``` pub fn create_index(&self, columns: &[impl AsRef], index: Index) -> IndexBuilder { @@ -1054,6 +1066,20 @@ impl NativeTable { ) } + fn supported_bitmap_data_type(dtype: &DataType) -> bool { + dtype.is_integer() || matches!(dtype, DataType::Utf8) + } + + fn supported_label_list_data_type(dtype: &DataType) -> bool { + match dtype { + DataType::List(field) => Self::supported_bitmap_data_type(field.data_type()), + DataType::FixedSizeList(field, _) => { + Self::supported_bitmap_data_type(field.data_type()) + } + _ => false, + } + } + fn supported_fts_data_type(dtype: &DataType) -> bool { matches!(dtype, DataType::Utf8 | DataType::LargeUtf8) } @@ -1519,7 +1545,61 @@ impl NativeTable { dataset .create_index( &[field.name()], - IndexType::Scalar, + IndexType::BTree, + None, + &lance_idx_params, + opts.replace, + ) + .await?; + Ok(()) + } + + async fn create_bitmap_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> { + if !Self::supported_bitmap_data_type(field.data_type()) { + return Err(Error::Schema { + message: format!( + "A Bitmap index cannot be created on the field `{}` which has data type {}", + field.name(), + field.data_type() + ), + }); + } + + let mut dataset = self.dataset.get_mut().await?; + let lance_idx_params = lance_index::scalar::ScalarIndexParams { + force_index_type: Some(lance_index::scalar::ScalarIndexType::Bitmap), + }; + dataset + .create_index( + &[field.name()], + IndexType::Bitmap, + None, + &lance_idx_params, + opts.replace, + ) + .await?; + Ok(()) + } + + async fn create_label_list_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> { + if !Self::supported_label_list_data_type(field.data_type()) { + return Err(Error::Schema { + message: format!( + "A LabelList index cannot be created on the field `{}` which has data type {}", + field.name(), + field.data_type() + ), + }); + } + + let mut 
dataset = self.dataset.get_mut().await?; + let lance_idx_params = lance_index::scalar::ScalarIndexParams { + force_index_type: Some(lance_index::scalar::ScalarIndexType::LabelList), + }; + dataset + .create_index( + &[field.name()], + IndexType::LabelList, None, &lance_idx_params, opts.replace, @@ -1690,6 +1770,8 @@ impl TableInternal for NativeTable { match opts.index { Index::Auto => self.create_auto_index(field, opts).await, Index::BTree(_) => self.create_btree_index(field, opts).await, + Index::Bitmap(_) => self.create_bitmap_index(field, opts).await, + Index::LabelList(_) => self.create_label_list_index(field, opts).await, Index::FTS(_) => self.create_fts_index(field, opts).await, Index::IvfPq(ivf_pq) => self.create_ivf_pq_index(ivf_pq, field, opts.replace).await, Index::IvfHnswPq(ivf_hnsw_pq) => { @@ -2013,6 +2095,7 @@ mod tests { use std::time::Duration; use arrow_array::{ + builder::{ListBuilder, StringBuilder}, Array, BooleanArray, Date32Array, FixedSizeListArray, Float32Array, Float64Array, Int32Array, Int64Array, LargeStringArray, RecordBatch, RecordBatchIterator, RecordBatchReader, StringArray, TimestampMillisecondArray, TimestampNanosecondArray, @@ -2022,17 +2105,17 @@ mod tests { use arrow_schema::{DataType, Field, Schema, TimeUnit}; use futures::TryStreamExt; use lance::dataset::{Dataset, WriteMode}; + use lance::index::DatasetIndexInternalExt; use lance::io::{ObjectStoreParams, WrappingObjectStore}; use rand::Rng; use tempfile::tempdir; + use super::*; use crate::connect; use crate::connection::ConnectBuilder; use crate::index::scalar::BTreeIndexBuilder; use crate::query::{ExecutableQuery, QueryBase}; - use super::*; - #[tokio::test] async fn test_open() { let tmp_dir = tempdir().unwrap(); @@ -2997,6 +3080,151 @@ mod tests { ); } + #[tokio::test] + async fn test_create_bitmap_index() { + let tmp_dir = tempdir().unwrap(); + let uri = tmp_dir.path().to_str().unwrap(); + + let conn = ConnectBuilder::new(uri).execute().await.unwrap(); + + let schema = 
Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("category", DataType::Utf8, true), + ])); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values(0..100)), + Arc::new(StringArray::from_iter_values( + (0..100).map(|i| format!("category_{}", i % 5)), + )), + ], + ) + .unwrap(); + + let table = conn + .create_table( + "test_bitmap", + RecordBatchIterator::new(vec![Ok(batch.clone())], batch.schema()), + ) + .execute() + .await + .unwrap(); + + // Create bitmap index on the "category" column + table + .create_index(&["category"], Index::Bitmap(Default::default())) + .execute() + .await + .unwrap(); + + // Verify the index was created + let index_configs = table.list_indices().await.unwrap(); + assert_eq!(index_configs.len(), 1); + let index = index_configs.into_iter().next().unwrap(); + // TODO: Fix via https://github.com/lancedb/lance/issues/2039 + // assert_eq!(index.index_type, crate::index::IndexType::Bitmap); + assert_eq!(index.columns, vec!["category".to_string()]); + + // For now, just open the index to verify its type + let lance_dataset = table.as_native().unwrap().dataset.get().await.unwrap(); + let indices = lance_dataset + .load_indices_by_name(&index.name) + .await + .unwrap(); + let index_meta = &indices[0]; + let idx = lance_dataset + .open_scalar_index("category", &index_meta.uuid.to_string()) + .await + .unwrap(); + assert_eq!(idx.index_type(), IndexType::Bitmap); + } + + #[tokio::test] + async fn test_create_label_list_index() { + let tmp_dir = tempdir().unwrap(); + let uri = tmp_dir.path().to_str().unwrap(); + + let conn = ConnectBuilder::new(uri).execute().await.unwrap(); + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new( + "tags", + DataType::List(Field::new("item", DataType::Utf8, true).into()), + true, + ), + ])); + + const TAGS: [&str; 3] = ["cat", "dog", "fish"]; + + let values_builder = StringBuilder::new(); + 
let mut builder = ListBuilder::new(values_builder); + for i in 0..120 { + builder.values().append_value(TAGS[i % 3].to_string()); + if i % 3 == 0 { + builder.append(true) + } + } + let tags = Arc::new(builder.finish()); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..40)), tags], + ) + .unwrap(); + + let table = conn + .create_table( + "test_bitmap", + RecordBatchIterator::new(vec![Ok(batch.clone())], batch.schema()), + ) + .execute() + .await + .unwrap(); + + // Cannot create btree or bitmap index on list column + assert!(table + .create_index(&["tags"], Index::BTree(Default::default())) + .execute() + .await + .is_err()); + assert!(table + .create_index(&["tags"], Index::Bitmap(Default::default())) + .execute() + .await + .is_err()); + + // Create LabelList index on the "tags" column + table + .create_index(&["tags"], Index::LabelList(Default::default())) + .execute() + .await + .unwrap(); + + // Verify the index was created + let index_configs = table.list_indices().await.unwrap(); + assert_eq!(index_configs.len(), 1); + let index = index_configs.into_iter().next().unwrap(); + // TODO: Fix via https://github.com/lancedb/lance/issues/2039 + // assert_eq!(index.index_type, crate::index::IndexType::LabelList); + assert_eq!(index.columns, vec!["tags".to_string()]); + + // For now, just open the index to verify its type + let lance_dataset = table.as_native().unwrap().dataset.get().await.unwrap(); + let indices = lance_dataset + .load_indices_by_name(&index.name) + .await + .unwrap(); + let index_meta = &indices[0]; + let idx = lance_dataset + .open_scalar_index("tags", &index_meta.uuid.to_string()) + .await + .unwrap(); + assert_eq!(idx.index_type(), IndexType::LabelList); + } + #[tokio::test] async fn test_read_consistency_interval() { let intervals = vec![