From 6c53f9f37f15b5d5ebb64cb3bae5c3a7d18b2199 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 9 Jan 2024 14:43:37 +0800 Subject: [PATCH] Fix off by one in optional index Fixes #2293 Fixes an off by one error in the metadata resize of the optional index when loading the index. Merge variables with the same meaning but different names --- columnar/src/column/mod.rs | 3 ++ columnar/src/column_index/mod.rs | 2 ++ .../src/column_index/optional_index/mod.rs | 9 +++--- .../src/column_index/optional_index/tests.rs | 31 ++++++++++++++++--- 4 files changed, 36 insertions(+), 9 deletions(-) diff --git a/columnar/src/column/mod.rs b/columnar/src/column/mod.rs index 37db03e1b..5b0d066ec 100644 --- a/columnar/src/column/mod.rs +++ b/columnar/src/column/mod.rs @@ -110,6 +110,9 @@ impl Column { } /// Get the docids of values which are in the provided value range. + /// + /// # Panic + /// Panics if a value in the selected_docid_range range is larger than the number of documents. #[inline] pub fn get_docids_for_value_range( &self, diff --git a/columnar/src/column_index/mod.rs b/columnar/src/column_index/mod.rs index d6711566d..f68addb21 100644 --- a/columnar/src/column_index/mod.rs +++ b/columnar/src/column_index/mod.rs @@ -126,6 +126,8 @@ impl ColumnIndex { } } + /// # Panic + /// Panics if a value in the doc_id range is larger than the number of documents. pub fn docid_range_to_rowids(&self, doc_id: Range) -> Range { match self { ColumnIndex::Empty { .. } => 0..0, diff --git a/columnar/src/column_index/optional_index/mod.rs b/columnar/src/column_index/optional_index/mod.rs index e885ee5bc..53cda64f1 100644 --- a/columnar/src/column_index/optional_index/mod.rs +++ b/columnar/src/column_index/optional_index/mod.rs @@ -21,8 +21,6 @@ const DENSE_BLOCK_THRESHOLD: u32 = const ELEMENTS_PER_BLOCK: u32 = u16::MAX as u32 + 1; -const BLOCK_SIZE: RowId = 1 << 16; - #[derive(Copy, Clone, Debug)] struct BlockMeta { non_null_rows_before_block: u32, @@ -109,8 +107,8 @@ struct RowAddr { #[inline(always)] fn row_addr_from_row_id(row_id: RowId) -> RowAddr { RowAddr { - block_id: (row_id / BLOCK_SIZE) as u16, - in_block_row_id: (row_id % BLOCK_SIZE) as u16, + block_id: (row_id / ELEMENTS_PER_BLOCK) as u16, + in_block_row_id: (row_id % ELEMENTS_PER_BLOCK) as u16, } } @@ -490,8 +488,9 @@ fn deserialize_optional_index_block_metadatas( start_byte_offset += block_variant.num_bytes_in_block(); non_null_rows_before_block += num_non_null_rows; } + let last_block = row_addr_from_row_id(num_rows).block_id; block_metas.resize( - ((num_rows + BLOCK_SIZE - 1) / BLOCK_SIZE) as usize, + last_block as usize + 1, // +1 since last block is an index BlockMeta { non_null_rows_before_block, start_byte_offset, diff --git a/columnar/src/column_index/optional_index/tests.rs b/columnar/src/column_index/optional_index/tests.rs index 2acc5f6e6..bdee249c4 100644 --- a/columnar/src/column_index/optional_index/tests.rs +++ b/columnar/src/column_index/optional_index/tests.rs @@ -3,6 +3,29 @@ use proptest::strategy::Strategy; use proptest::{prop_oneof, proptest}; use super::*; +use crate::{ColumnarReader, ColumnarWriter, DynamicColumnHandle}; + +#[test] +fn test_optional_index_bug_2293() { + test_optional_index_with_num_docs(ELEMENTS_PER_BLOCK - 1); + test_optional_index_with_num_docs(ELEMENTS_PER_BLOCK); + test_optional_index_with_num_docs(ELEMENTS_PER_BLOCK + 1); +} +fn test_optional_index_with_num_docs(num_docs: u32) { + let mut dataframe_writer = ColumnarWriter::default(); + dataframe_writer.record_numerical(100, "score", 80i64); + let mut buffer: Vec = Vec::new(); + dataframe_writer + .serialize(num_docs, None, &mut buffer) + .unwrap(); + let columnar = ColumnarReader::open(buffer).unwrap(); + assert_eq!(columnar.num_columns(), 1); + let cols: Vec = columnar.read_columns("score").unwrap(); + assert_eq!(cols.len(), 1); + + let col = cols[0].open().unwrap(); + col.column_index().docid_range_to_rowids(0..num_docs); +} #[test] fn test_dense_block_threshold() { @@ -35,7 +58,7 @@ proptest! { #[test] fn test_with_random_sets_simple() { - let vals = 10..BLOCK_SIZE * 2; + let vals = 10..ELEMENTS_PER_BLOCK * 2; let mut out: Vec = Vec::new(); serialize_optional_index(&vals, 100, &mut out).unwrap(); let null_index = open_optional_index(OwnedBytes::new(out)).unwrap(); @@ -171,7 +194,7 @@ fn test_optional_index_rank() { test_optional_index_rank_aux(&[0u32, 1u32]); let mut block = Vec::new(); block.push(3u32); - block.extend((0..BLOCK_SIZE).map(|i| i + BLOCK_SIZE + 1)); + block.extend((0..ELEMENTS_PER_BLOCK).map(|i| i + ELEMENTS_PER_BLOCK + 1)); test_optional_index_rank_aux(&block); } @@ -185,8 +208,8 @@ fn test_optional_index_iter_empty_one() { fn test_optional_index_iter_dense_block() { let mut block = Vec::new(); block.push(3u32); - block.extend((0..BLOCK_SIZE).map(|i| i + BLOCK_SIZE + 1)); - test_optional_index_iter_aux(&block, 3 * BLOCK_SIZE); + block.extend((0..ELEMENTS_PER_BLOCK).map(|i| i + ELEMENTS_PER_BLOCK + 1)); + test_optional_index_iter_aux(&block, 3 * ELEMENTS_PER_BLOCK); } #[test]