From e714f7df6c188fed9b05006eb5ae93e4cc3477d9 Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Mon, 3 Mar 2025 17:53:14 +0800 Subject: [PATCH] fix: out of bound during bloom search (#5625) Signed-off-by: Zhenchi --- src/index/src/bloom_filter/applier.rs | 11 +++++++- src/index/src/bloom_filter/creator.rs | 39 ++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/src/index/src/bloom_filter/applier.rs b/src/index/src/bloom_filter/applier.rs index e87a94cd1b..8829f4e0ee 100644 --- a/src/index/src/bloom_filter/applier.rs +++ b/src/index/src/bloom_filter/applier.rs @@ -42,7 +42,16 @@ impl BloomFilterApplier { ) -> Result>> { let rows_per_segment = self.meta.rows_per_segment as usize; let start_seg = search_range.start / rows_per_segment; - let end_seg = search_range.end.div_ceil(rows_per_segment); + let mut end_seg = search_range.end.div_ceil(rows_per_segment); + + if end_seg == self.meta.segment_loc_indices.len() + 1 { + // In a previous version, there was a bug where if the last segment was all null, + // this segment would not be written into the index. This caused the slice + // `self.meta.segment_loc_indices[start_seg..end_seg]` to go out of bounds due to + // the missing segment. Since the `search` function does not search for nulls, + // we can simply ignore the last segment in this buggy scenario. + end_seg -= 1; + } let locs = &self.meta.segment_loc_indices[start_seg..end_seg]; diff --git a/src/index/src/bloom_filter/creator.rs b/src/index/src/bloom_filter/creator.rs index 0b6810a688..66e892e29f 100644 --- a/src/index/src/bloom_filter/creator.rs +++ b/src/index/src/bloom_filter/creator.rs @@ -64,6 +64,9 @@ pub struct BloomFilterCreator { /// Storage for finalized Bloom filters. finalized_bloom_filters: FinalizedBloomFilterStorage, + /// Row count that finalized so far. + finalized_row_count: usize, + /// Global memory usage of the bloom filter creator. global_memory_usage: Arc, } @@ -96,6 +99,7 @@ impl BloomFilterCreator { global_memory_usage, global_memory_usage_threshold, ), + finalized_row_count: 0, } } @@ -136,6 +140,7 @@ impl BloomFilterCreator { if self.accumulated_row_count % self.rows_per_segment == 0 { self.finalize_segment().await?; + self.finalized_row_count = self.accumulated_row_count; } } @@ -161,6 +166,7 @@ impl BloomFilterCreator { if self.accumulated_row_count % self.rows_per_segment == 0 { self.finalize_segment().await?; + self.finalized_row_count = self.accumulated_row_count; } Ok(()) @@ -168,7 +174,7 @@ impl BloomFilterCreator { /// Finalizes any remaining segments and writes the bloom filters and metadata to the provided writer. pub async fn finish(&mut self, mut writer: impl AsyncWrite + Unpin) -> Result<()> { - if !self.cur_seg_distinct_elems.is_empty() { + if self.accumulated_row_count > self.finalized_row_count { self.finalize_segment().await?; } @@ -406,4 +412,35 @@ mod tests { assert!(bf.contains(&b"f")); } } + + #[tokio::test] + async fn test_final_seg_all_null() { + let mut writer = Cursor::new(Vec::new()); + let mut creator = BloomFilterCreator::new( + 2, + Arc::new(MockExternalTempFileProvider::new()), + Arc::new(AtomicUsize::new(0)), + None, + ); + + creator + .push_n_row_elems(4, vec![b"a".to_vec(), b"b".to_vec()]) + .await + .unwrap(); + creator.push_row_elems(Vec::new()).await.unwrap(); + + creator.finish(&mut writer).await.unwrap(); + + let bytes = writer.into_inner(); + let total_size = bytes.len(); + let meta_size_offset = total_size - 4; + let meta_size = u32::from_le_bytes((&bytes[meta_size_offset..]).try_into().unwrap()); + + let meta_bytes = &bytes[total_size - meta_size as usize - 4..total_size - 4]; + let meta = BloomFilterMeta::decode(meta_bytes).unwrap(); + + assert_eq!(meta.rows_per_segment, 2); + assert_eq!(meta.segment_count, 3); + assert_eq!(meta.row_count, 5); + } }