mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-22 21:09:58 +00:00
feat: allow bitmap indexes on large-string, binary, large-binary, and boolean (#2678)
The underlying `pylance` already supported this; it was just blocked by an over-eager validation function. Closes #1981.
This commit is contained in:
@@ -35,6 +35,8 @@ async def some_table(db_async):
|
||||
"tags": [
|
||||
[f"tag{random.randint(0, 8)}" for _ in range(2)] for _ in range(NROWS)
|
||||
],
|
||||
"is_active": [random.choice([True, False]) for _ in range(NROWS)],
|
||||
"data": [random.randbytes(random.randint(0, 128)) for _ in range(NROWS)],
|
||||
}
|
||||
)
|
||||
return await db_async.create_table(
|
||||
@@ -99,10 +101,17 @@ async def test_create_fixed_size_binary_index(some_table: AsyncTable):
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_bitmap_index(some_table: AsyncTable):
|
||||
await some_table.create_index("id", config=Bitmap())
|
||||
await some_table.create_index("is_active", config=Bitmap())
|
||||
await some_table.create_index("data", config=Bitmap())
|
||||
indices = await some_table.list_indices()
|
||||
assert str(indices) == '[Index(Bitmap, columns=["id"], name="id_idx")]'
|
||||
indices = await some_table.list_indices()
|
||||
assert len(indices) == 1
|
||||
assert len(indices) == 3
|
||||
assert indices[0].index_type == "Bitmap"
|
||||
assert indices[0].columns == ["id"]
|
||||
assert indices[1].index_type == "Bitmap"
|
||||
assert indices[1].columns == ["is_active"]
|
||||
assert indices[2].index_type == "Bitmap"
|
||||
assert indices[2].columns == ["data"]
|
||||
|
||||
index_name = indices[0].name
|
||||
stats = await some_table.index_stats(index_name)
|
||||
assert stats.index_type == "BITMAP"
|
||||
@@ -111,6 +120,11 @@ async def test_create_bitmap_index(some_table: AsyncTable):
|
||||
assert stats.num_unindexed_rows == 0
|
||||
assert stats.num_indices == 1
|
||||
|
||||
assert (
|
||||
"ScalarIndexQuery"
|
||||
in await some_table.query().where("is_active = TRUE").explain_plan()
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_label_list_index(some_table: AsyncTable):
|
||||
|
||||
@@ -2760,6 +2760,7 @@ mod tests {
|
||||
RecordBatchReader, StringArray, TimestampMillisecondArray, TimestampNanosecondArray,
|
||||
UInt32Array,
|
||||
};
|
||||
use arrow_array::{BinaryArray, LargeBinaryArray};
|
||||
use arrow_data::ArrayDataBuilder;
|
||||
use arrow_schema::{DataType, Field, Schema, TimeUnit};
|
||||
use futures::TryStreamExt;
|
||||
@@ -3725,6 +3726,10 @@ mod tests {
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int32, false),
|
||||
Field::new("category", DataType::Utf8, true),
|
||||
Field::new("large_category", DataType::LargeUtf8, true),
|
||||
Field::new("is_active", DataType::Boolean, true),
|
||||
Field::new("data", DataType::Binary, true),
|
||||
Field::new("large_data", DataType::LargeBinary, true),
|
||||
]));
|
||||
|
||||
let batch = RecordBatch::try_new(
|
||||
@@ -3734,6 +3739,16 @@ mod tests {
|
||||
Arc::new(StringArray::from_iter_values(
|
||||
(0..100).map(|i| format!("category_{}", i % 5)),
|
||||
)),
|
||||
Arc::new(LargeStringArray::from_iter_values(
|
||||
(0..100).map(|i| format!("large_category_{}", i % 5)),
|
||||
)),
|
||||
Arc::new(BooleanArray::from_iter((0..100).map(|i| Some(i % 2 == 0)))),
|
||||
Arc::new(BinaryArray::from_iter_values(
|
||||
(0_u32..100).map(|i| i.to_le_bytes()),
|
||||
)),
|
||||
Arc::new(LargeBinaryArray::from_iter_values(
|
||||
(0_u32..100).map(|i| i.to_le_bytes()),
|
||||
)),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
@@ -3754,12 +3769,58 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Create bitmap index on the "is_active" column
|
||||
table
|
||||
.create_index(&["is_active"], Index::Bitmap(Default::default()))
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Create bitmap index on the "data" column
|
||||
table
|
||||
.create_index(&["data"], Index::Bitmap(Default::default()))
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Create bitmap index on the "large_data" column
|
||||
table
|
||||
.create_index(&["large_data"], Index::Bitmap(Default::default()))
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Create bitmap index on the "large_category" column
|
||||
table
|
||||
.create_index(&["large_category"], Index::Bitmap(Default::default()))
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Verify the index was created
|
||||
let index_configs = table.list_indices().await.unwrap();
|
||||
assert_eq!(index_configs.len(), 1);
|
||||
let index = index_configs.into_iter().next().unwrap();
|
||||
assert_eq!(index_configs.len(), 5);
|
||||
|
||||
let mut configs_iter = index_configs.into_iter();
|
||||
let index = configs_iter.next().unwrap();
|
||||
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
|
||||
assert_eq!(index.columns, vec!["category".to_string()]);
|
||||
|
||||
let index = configs_iter.next().unwrap();
|
||||
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
|
||||
assert_eq!(index.columns, vec!["is_active".to_string()]);
|
||||
|
||||
let index = configs_iter.next().unwrap();
|
||||
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
|
||||
assert_eq!(index.columns, vec!["data".to_string()]);
|
||||
|
||||
let index = configs_iter.next().unwrap();
|
||||
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
|
||||
assert_eq!(index.columns, vec!["large_data".to_string()]);
|
||||
|
||||
let index = configs_iter.next().unwrap();
|
||||
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
|
||||
assert_eq!(index.columns, vec!["large_category".to_string()]);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -195,7 +195,15 @@ pub fn supported_btree_data_type(dtype: &DataType) -> bool {
|
||||
}
|
||||
|
||||
pub fn supported_bitmap_data_type(dtype: &DataType) -> bool {
|
||||
dtype.is_integer() || matches!(dtype, DataType::Utf8)
|
||||
dtype.is_integer()
|
||||
|| matches!(
|
||||
dtype,
|
||||
DataType::Utf8
|
||||
| DataType::LargeUtf8
|
||||
| DataType::Binary
|
||||
| DataType::LargeBinary
|
||||
| DataType::Boolean
|
||||
)
|
||||
}
|
||||
|
||||
pub fn supported_label_list_data_type(dtype: &DataType) -> bool {
|
||||
|
||||
Reference in New Issue
Block a user