mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-26 14:49:57 +00:00
feat: allow bitmap indexes on large-string, binary, large-binary, and boolean (#2678)
The underlying `pylance` already supported this; it was just blocked by an over-eager validation function. Closes #1981.
This commit is contained in:
@@ -35,6 +35,8 @@ async def some_table(db_async):
|
|||||||
"tags": [
|
"tags": [
|
||||||
[f"tag{random.randint(0, 8)}" for _ in range(2)] for _ in range(NROWS)
|
[f"tag{random.randint(0, 8)}" for _ in range(2)] for _ in range(NROWS)
|
||||||
],
|
],
|
||||||
|
"is_active": [random.choice([True, False]) for _ in range(NROWS)],
|
||||||
|
"data": [random.randbytes(random.randint(0, 128)) for _ in range(NROWS)],
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
return await db_async.create_table(
|
return await db_async.create_table(
|
||||||
@@ -99,10 +101,17 @@ async def test_create_fixed_size_binary_index(some_table: AsyncTable):
|
|||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_create_bitmap_index(some_table: AsyncTable):
|
async def test_create_bitmap_index(some_table: AsyncTable):
|
||||||
await some_table.create_index("id", config=Bitmap())
|
await some_table.create_index("id", config=Bitmap())
|
||||||
|
await some_table.create_index("is_active", config=Bitmap())
|
||||||
|
await some_table.create_index("data", config=Bitmap())
|
||||||
indices = await some_table.list_indices()
|
indices = await some_table.list_indices()
|
||||||
assert str(indices) == '[Index(Bitmap, columns=["id"], name="id_idx")]'
|
assert len(indices) == 3
|
||||||
indices = await some_table.list_indices()
|
assert indices[0].index_type == "Bitmap"
|
||||||
assert len(indices) == 1
|
assert indices[0].columns == ["id"]
|
||||||
|
assert indices[1].index_type == "Bitmap"
|
||||||
|
assert indices[1].columns == ["is_active"]
|
||||||
|
assert indices[2].index_type == "Bitmap"
|
||||||
|
assert indices[2].columns == ["data"]
|
||||||
|
|
||||||
index_name = indices[0].name
|
index_name = indices[0].name
|
||||||
stats = await some_table.index_stats(index_name)
|
stats = await some_table.index_stats(index_name)
|
||||||
assert stats.index_type == "BITMAP"
|
assert stats.index_type == "BITMAP"
|
||||||
@@ -111,6 +120,11 @@ async def test_create_bitmap_index(some_table: AsyncTable):
|
|||||||
assert stats.num_unindexed_rows == 0
|
assert stats.num_unindexed_rows == 0
|
||||||
assert stats.num_indices == 1
|
assert stats.num_indices == 1
|
||||||
|
|
||||||
|
assert (
|
||||||
|
"ScalarIndexQuery"
|
||||||
|
in await some_table.query().where("is_active = TRUE").explain_plan()
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_create_label_list_index(some_table: AsyncTable):
|
async def test_create_label_list_index(some_table: AsyncTable):
|
||||||
|
|||||||
@@ -2760,6 +2760,7 @@ mod tests {
|
|||||||
RecordBatchReader, StringArray, TimestampMillisecondArray, TimestampNanosecondArray,
|
RecordBatchReader, StringArray, TimestampMillisecondArray, TimestampNanosecondArray,
|
||||||
UInt32Array,
|
UInt32Array,
|
||||||
};
|
};
|
||||||
|
use arrow_array::{BinaryArray, LargeBinaryArray};
|
||||||
use arrow_data::ArrayDataBuilder;
|
use arrow_data::ArrayDataBuilder;
|
||||||
use arrow_schema::{DataType, Field, Schema, TimeUnit};
|
use arrow_schema::{DataType, Field, Schema, TimeUnit};
|
||||||
use futures::TryStreamExt;
|
use futures::TryStreamExt;
|
||||||
@@ -3725,6 +3726,10 @@ mod tests {
|
|||||||
let schema = Arc::new(Schema::new(vec![
|
let schema = Arc::new(Schema::new(vec![
|
||||||
Field::new("id", DataType::Int32, false),
|
Field::new("id", DataType::Int32, false),
|
||||||
Field::new("category", DataType::Utf8, true),
|
Field::new("category", DataType::Utf8, true),
|
||||||
|
Field::new("large_category", DataType::LargeUtf8, true),
|
||||||
|
Field::new("is_active", DataType::Boolean, true),
|
||||||
|
Field::new("data", DataType::Binary, true),
|
||||||
|
Field::new("large_data", DataType::LargeBinary, true),
|
||||||
]));
|
]));
|
||||||
|
|
||||||
let batch = RecordBatch::try_new(
|
let batch = RecordBatch::try_new(
|
||||||
@@ -3734,6 +3739,16 @@ mod tests {
|
|||||||
Arc::new(StringArray::from_iter_values(
|
Arc::new(StringArray::from_iter_values(
|
||||||
(0..100).map(|i| format!("category_{}", i % 5)),
|
(0..100).map(|i| format!("category_{}", i % 5)),
|
||||||
)),
|
)),
|
||||||
|
Arc::new(LargeStringArray::from_iter_values(
|
||||||
|
(0..100).map(|i| format!("large_category_{}", i % 5)),
|
||||||
|
)),
|
||||||
|
Arc::new(BooleanArray::from_iter((0..100).map(|i| Some(i % 2 == 0)))),
|
||||||
|
Arc::new(BinaryArray::from_iter_values(
|
||||||
|
(0_u32..100).map(|i| i.to_le_bytes()),
|
||||||
|
)),
|
||||||
|
Arc::new(LargeBinaryArray::from_iter_values(
|
||||||
|
(0_u32..100).map(|i| i.to_le_bytes()),
|
||||||
|
)),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
@@ -3754,12 +3769,58 @@ mod tests {
|
|||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
|
// Create bitmap index on the "is_active" column
|
||||||
|
table
|
||||||
|
.create_index(&["is_active"], Index::Bitmap(Default::default()))
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Create bitmap index on the "data" column
|
||||||
|
table
|
||||||
|
.create_index(&["data"], Index::Bitmap(Default::default()))
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Create bitmap index on the "large_data" column
|
||||||
|
table
|
||||||
|
.create_index(&["large_data"], Index::Bitmap(Default::default()))
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Create bitmap index on the "large_category" column
|
||||||
|
table
|
||||||
|
.create_index(&["large_category"], Index::Bitmap(Default::default()))
|
||||||
|
.execute()
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
// Verify the index was created
|
// Verify the index was created
|
||||||
let index_configs = table.list_indices().await.unwrap();
|
let index_configs = table.list_indices().await.unwrap();
|
||||||
assert_eq!(index_configs.len(), 1);
|
assert_eq!(index_configs.len(), 5);
|
||||||
let index = index_configs.into_iter().next().unwrap();
|
|
||||||
|
let mut configs_iter = index_configs.into_iter();
|
||||||
|
let index = configs_iter.next().unwrap();
|
||||||
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
|
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
|
||||||
assert_eq!(index.columns, vec!["category".to_string()]);
|
assert_eq!(index.columns, vec!["category".to_string()]);
|
||||||
|
|
||||||
|
let index = configs_iter.next().unwrap();
|
||||||
|
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
|
||||||
|
assert_eq!(index.columns, vec!["is_active".to_string()]);
|
||||||
|
|
||||||
|
let index = configs_iter.next().unwrap();
|
||||||
|
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
|
||||||
|
assert_eq!(index.columns, vec!["data".to_string()]);
|
||||||
|
|
||||||
|
let index = configs_iter.next().unwrap();
|
||||||
|
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
|
||||||
|
assert_eq!(index.columns, vec!["large_data".to_string()]);
|
||||||
|
|
||||||
|
let index = configs_iter.next().unwrap();
|
||||||
|
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
|
||||||
|
assert_eq!(index.columns, vec!["large_category".to_string()]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
|
|||||||
@@ -195,7 +195,15 @@ pub fn supported_btree_data_type(dtype: &DataType) -> bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn supported_bitmap_data_type(dtype: &DataType) -> bool {
|
pub fn supported_bitmap_data_type(dtype: &DataType) -> bool {
|
||||||
dtype.is_integer() || matches!(dtype, DataType::Utf8)
|
dtype.is_integer()
|
||||||
|
|| matches!(
|
||||||
|
dtype,
|
||||||
|
DataType::Utf8
|
||||||
|
| DataType::LargeUtf8
|
||||||
|
| DataType::Binary
|
||||||
|
| DataType::LargeBinary
|
||||||
|
| DataType::Boolean
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn supported_label_list_data_type(dtype: &DataType) -> bool {
|
pub fn supported_label_list_data_type(dtype: &DataType) -> bool {
|
||||||
|
|||||||
Reference in New Issue
Block a user