feat: allow bitmap indexes on large-string, binary, large-binary, and bitmap (#2678)

The underlying `pylance` already supported this, it was just blocked out
by an over-eager validation function

Closes #1981
This commit is contained in:
Weston Pace
2025-09-25 09:46:42 -07:00
committed by GitHub
parent e7e9e80b1d
commit e07389a36c
3 changed files with 89 additions and 6 deletions

View File

@@ -35,6 +35,8 @@ async def some_table(db_async):
"tags": [
[f"tag{random.randint(0, 8)}" for _ in range(2)] for _ in range(NROWS)
],
"is_active": [random.choice([True, False]) for _ in range(NROWS)],
"data": [random.randbytes(random.randint(0, 128)) for _ in range(NROWS)],
}
)
return await db_async.create_table(
@@ -99,10 +101,17 @@ async def test_create_fixed_size_binary_index(some_table: AsyncTable):
@pytest.mark.asyncio
async def test_create_bitmap_index(some_table: AsyncTable):
await some_table.create_index("id", config=Bitmap())
await some_table.create_index("is_active", config=Bitmap())
await some_table.create_index("data", config=Bitmap())
indices = await some_table.list_indices()
assert str(indices) == '[Index(Bitmap, columns=["id"], name="id_idx")]'
indices = await some_table.list_indices()
assert len(indices) == 1
assert len(indices) == 3
assert indices[0].index_type == "Bitmap"
assert indices[0].columns == ["id"]
assert indices[1].index_type == "Bitmap"
assert indices[1].columns == ["is_active"]
assert indices[2].index_type == "Bitmap"
assert indices[2].columns == ["data"]
index_name = indices[0].name
stats = await some_table.index_stats(index_name)
assert stats.index_type == "BITMAP"
@@ -111,6 +120,11 @@ async def test_create_bitmap_index(some_table: AsyncTable):
assert stats.num_unindexed_rows == 0
assert stats.num_indices == 1
assert (
"ScalarIndexQuery"
in await some_table.query().where("is_active = TRUE").explain_plan()
)
@pytest.mark.asyncio
async def test_create_label_list_index(some_table: AsyncTable):

View File

@@ -2760,6 +2760,7 @@ mod tests {
RecordBatchReader, StringArray, TimestampMillisecondArray, TimestampNanosecondArray,
UInt32Array,
};
use arrow_array::{BinaryArray, LargeBinaryArray};
use arrow_data::ArrayDataBuilder;
use arrow_schema::{DataType, Field, Schema, TimeUnit};
use futures::TryStreamExt;
@@ -3725,6 +3726,10 @@ mod tests {
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("category", DataType::Utf8, true),
Field::new("large_category", DataType::LargeUtf8, true),
Field::new("is_active", DataType::Boolean, true),
Field::new("data", DataType::Binary, true),
Field::new("large_data", DataType::LargeBinary, true),
]));
let batch = RecordBatch::try_new(
@@ -3734,6 +3739,16 @@ mod tests {
Arc::new(StringArray::from_iter_values(
(0..100).map(|i| format!("category_{}", i % 5)),
)),
Arc::new(LargeStringArray::from_iter_values(
(0..100).map(|i| format!("large_category_{}", i % 5)),
)),
Arc::new(BooleanArray::from_iter((0..100).map(|i| Some(i % 2 == 0)))),
Arc::new(BinaryArray::from_iter_values(
(0_u32..100).map(|i| i.to_le_bytes()),
)),
Arc::new(LargeBinaryArray::from_iter_values(
(0_u32..100).map(|i| i.to_le_bytes()),
)),
],
)
.unwrap();
@@ -3754,12 +3769,58 @@ mod tests {
.await
.unwrap();
// Create bitmap index on the "is_active" column
table
.create_index(&["is_active"], Index::Bitmap(Default::default()))
.execute()
.await
.unwrap();
// Create bitmap index on the "data" column
table
.create_index(&["data"], Index::Bitmap(Default::default()))
.execute()
.await
.unwrap();
// Create bitmap index on the "large_data" column
table
.create_index(&["large_data"], Index::Bitmap(Default::default()))
.execute()
.await
.unwrap();
// Create bitmap index on the "large_category" column
table
.create_index(&["large_category"], Index::Bitmap(Default::default()))
.execute()
.await
.unwrap();
// Verify the index was created
let index_configs = table.list_indices().await.unwrap();
assert_eq!(index_configs.len(), 1);
let index = index_configs.into_iter().next().unwrap();
assert_eq!(index_configs.len(), 5);
let mut configs_iter = index_configs.into_iter();
let index = configs_iter.next().unwrap();
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
assert_eq!(index.columns, vec!["category".to_string()]);
let index = configs_iter.next().unwrap();
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
assert_eq!(index.columns, vec!["is_active".to_string()]);
let index = configs_iter.next().unwrap();
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
assert_eq!(index.columns, vec!["data".to_string()]);
let index = configs_iter.next().unwrap();
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
assert_eq!(index.columns, vec!["large_data".to_string()]);
let index = configs_iter.next().unwrap();
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
assert_eq!(index.columns, vec!["large_category".to_string()]);
}
#[tokio::test]

View File

@@ -195,7 +195,15 @@ pub fn supported_btree_data_type(dtype: &DataType) -> bool {
}
pub fn supported_bitmap_data_type(dtype: &DataType) -> bool {
dtype.is_integer() || matches!(dtype, DataType::Utf8)
dtype.is_integer()
|| matches!(
dtype,
DataType::Utf8
| DataType::LargeUtf8
| DataType::Binary
| DataType::LargeBinary
| DataType::Boolean
)
}
pub fn supported_label_list_data_type(dtype: &DataType) -> bool {