From e07389a36c33e3fbeb2395ff48e306b0086a25b4 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 25 Sep 2025 09:46:42 -0700 Subject: [PATCH] feat: allow bitmap indexes on large-string, binary, large-binary, and boolean (#2678) The underlying `pylance` already supported this; it was just blocked by an over-eager validation function Closes #1981 --- python/python/tests/test_index.py | 20 ++++++++-- rust/lancedb/src/table.rs | 65 ++++++++++++++++++++++++++++++- rust/lancedb/src/utils.rs | 10 ++++- 3 files changed, 89 insertions(+), 6 deletions(-) diff --git a/python/python/tests/test_index.py b/python/python/tests/test_index.py index 781fb710..51e7c378 100644 --- a/python/python/tests/test_index.py +++ b/python/python/tests/test_index.py @@ -35,6 +35,8 @@ async def some_table(db_async): "tags": [ [f"tag{random.randint(0, 8)}" for _ in range(2)] for _ in range(NROWS) ], + "is_active": [random.choice([True, False]) for _ in range(NROWS)], + "data": [random.randbytes(random.randint(0, 128)) for _ in range(NROWS)], } ) return await db_async.create_table( @@ -99,10 +101,17 @@ async def test_create_fixed_size_binary_index(some_table: AsyncTable): @pytest.mark.asyncio async def test_create_bitmap_index(some_table: AsyncTable): await some_table.create_index("id", config=Bitmap()) + await some_table.create_index("is_active", config=Bitmap()) + await some_table.create_index("data", config=Bitmap()) indices = await some_table.list_indices() - assert str(indices) == '[Index(Bitmap, columns=["id"], name="id_idx")]' - indices = await some_table.list_indices() - assert len(indices) == 1 + assert len(indices) == 3 + assert indices[0].index_type == "Bitmap" + assert indices[0].columns == ["id"] + assert indices[1].index_type == "Bitmap" + assert indices[1].columns == ["is_active"] + assert indices[2].index_type == "Bitmap" + assert indices[2].columns == ["data"] + index_name = indices[0].name stats = await some_table.index_stats(index_name) assert stats.index_type == "BITMAP" @@ 
-111,6 +120,11 @@ async def test_create_bitmap_index(some_table: AsyncTable): assert stats.num_unindexed_rows == 0 assert stats.num_indices == 1 + assert ( + "ScalarIndexQuery" + in await some_table.query().where("is_active = TRUE").explain_plan() + ) + @pytest.mark.asyncio async def test_create_label_list_index(some_table: AsyncTable): diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs index d31ee2c3..0a400e23 100644 --- a/rust/lancedb/src/table.rs +++ b/rust/lancedb/src/table.rs @@ -2760,6 +2760,7 @@ mod tests { RecordBatchReader, StringArray, TimestampMillisecondArray, TimestampNanosecondArray, UInt32Array, }; + use arrow_array::{BinaryArray, LargeBinaryArray}; use arrow_data::ArrayDataBuilder; use arrow_schema::{DataType, Field, Schema, TimeUnit}; use futures::TryStreamExt; @@ -3725,6 +3726,10 @@ mod tests { let schema = Arc::new(Schema::new(vec![ Field::new("id", DataType::Int32, false), Field::new("category", DataType::Utf8, true), + Field::new("large_category", DataType::LargeUtf8, true), + Field::new("is_active", DataType::Boolean, true), + Field::new("data", DataType::Binary, true), + Field::new("large_data", DataType::LargeBinary, true), ])); let batch = RecordBatch::try_new( @@ -3734,6 +3739,16 @@ mod tests { Arc::new(StringArray::from_iter_values( (0..100).map(|i| format!("category_{}", i % 5)), )), + Arc::new(LargeStringArray::from_iter_values( + (0..100).map(|i| format!("large_category_{}", i % 5)), + )), + Arc::new(BooleanArray::from_iter((0..100).map(|i| Some(i % 2 == 0)))), + Arc::new(BinaryArray::from_iter_values( + (0_u32..100).map(|i| i.to_le_bytes()), + )), + Arc::new(LargeBinaryArray::from_iter_values( + (0_u32..100).map(|i| i.to_le_bytes()), + )), ], ) .unwrap(); @@ -3754,12 +3769,58 @@ mod tests { .await .unwrap(); + // Create bitmap index on the "is_active" column + table + .create_index(&["is_active"], Index::Bitmap(Default::default())) + .execute() + .await + .unwrap(); + + // Create bitmap index on the "data" column + 
table + .create_index(&["data"], Index::Bitmap(Default::default())) + .execute() + .await + .unwrap(); + + // Create bitmap index on the "large_data" column + table + .create_index(&["large_data"], Index::Bitmap(Default::default())) + .execute() + .await + .unwrap(); + + // Create bitmap index on the "large_category" column + table + .create_index(&["large_category"], Index::Bitmap(Default::default())) + .execute() + .await + .unwrap(); + // Verify the index was created let index_configs = table.list_indices().await.unwrap(); - assert_eq!(index_configs.len(), 1); - let index = index_configs.into_iter().next().unwrap(); + assert_eq!(index_configs.len(), 5); + + let mut configs_iter = index_configs.into_iter(); + let index = configs_iter.next().unwrap(); assert_eq!(index.index_type, crate::index::IndexType::Bitmap); assert_eq!(index.columns, vec!["category".to_string()]); + + let index = configs_iter.next().unwrap(); + assert_eq!(index.index_type, crate::index::IndexType::Bitmap); + assert_eq!(index.columns, vec!["is_active".to_string()]); + + let index = configs_iter.next().unwrap(); + assert_eq!(index.index_type, crate::index::IndexType::Bitmap); + assert_eq!(index.columns, vec!["data".to_string()]); + + let index = configs_iter.next().unwrap(); + assert_eq!(index.index_type, crate::index::IndexType::Bitmap); + assert_eq!(index.columns, vec!["large_data".to_string()]); + + let index = configs_iter.next().unwrap(); + assert_eq!(index.index_type, crate::index::IndexType::Bitmap); + assert_eq!(index.columns, vec!["large_category".to_string()]); } #[tokio::test] diff --git a/rust/lancedb/src/utils.rs b/rust/lancedb/src/utils.rs index 02614958..6ceffa14 100644 --- a/rust/lancedb/src/utils.rs +++ b/rust/lancedb/src/utils.rs @@ -195,7 +195,15 @@ pub fn supported_btree_data_type(dtype: &DataType) -> bool { } pub fn supported_bitmap_data_type(dtype: &DataType) -> bool { - dtype.is_integer() || matches!(dtype, DataType::Utf8) + dtype.is_integer() + || matches!( + dtype, + 
DataType::Utf8 + | DataType::LargeUtf8 + | DataType::Binary + | DataType::LargeBinary + | DataType::Boolean + ) } pub fn supported_label_list_data_type(dtype: &DataType) -> bool {