diff --git a/columnar/Cargo.toml b/columnar/Cargo.toml index 0439268b2..6a4f662d0 100644 --- a/columnar/Cargo.toml +++ b/columnar/Cargo.toml @@ -33,6 +33,29 @@ harness = false name = "bench_access" harness = false +[[bench]] +name = "bench_first_vals" +harness = false + +[[bench]] +name = "bench_values_u64" +harness = false + +[[bench]] +name = "bench_values_u128" +harness = false + +[[bench]] +name = "bench_create_column_values" +harness = false + +[[bench]] +name = "bench_column_values_get" +harness = false + +[[bench]] +name = "bench_optional_index" +harness = false + [features] -unstable = [] zstd-compression = ["sstable/zstd-compression"] diff --git a/columnar/benches/bench_access.rs b/columnar/benches/bench_access.rs index a591d3cbc..397a35af0 100644 --- a/columnar/benches/bench_access.rs +++ b/columnar/benches/bench_access.rs @@ -19,7 +19,7 @@ fn main() { let mut add_card = |card1: Card| { inputs.push(( - format!("{card1}"), + card1.to_string(), generate_columnar_and_open(card1, NUM_DOCS), )); }; @@ -50,6 +50,7 @@ fn bench_group(mut runner: InputGroup) { let mut buffer = vec![None; BLOCK_SIZE]; for i in (0..NUM_DOCS).step_by(BLOCK_SIZE) { // fill docs + #[allow(clippy::needless_range_loop)] for idx in 0..BLOCK_SIZE { docs[idx] = idx as u32 + i; } diff --git a/columnar/benches/bench_column_values_get.rs b/columnar/benches/bench_column_values_get.rs new file mode 100644 index 000000000..d486b0dde --- /dev/null +++ b/columnar/benches/bench_column_values_get.rs @@ -0,0 +1,61 @@ +use std::sync::Arc; + +use binggan::{InputGroup, black_box}; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; +use tantivy_columnar::ColumnValues; +use tantivy_columnar::column_values::{CodecType, serialize_and_load_u64_based_column_values}; + +fn get_data() -> Vec { + let mut rng = StdRng::seed_from_u64(2u64); + let mut data: Vec<_> = (100..55_000_u64) + .map(|num| num + rng.r#gen::() as u64) + .collect(); + data.push(99_000); + data.insert(1000, 2000); + data.insert(2000, 100); + data.insert(3000, 4100); + data.insert(4000, 100); + data.insert(5000, 800); + data +} + +#[inline(never)] +fn value_iter() -> impl Iterator { + 0..20_000 +} + +type Col = Arc>; + +fn main() { + let data = get_data(); + let inputs: Vec<(String, Col)> = vec![ + ( + "bitpacked".to_string(), + serialize_and_load_u64_based_column_values(&data.as_slice(), &[CodecType::Bitpacked]), + ), + ( + "linear".to_string(), + serialize_and_load_u64_based_column_values(&data.as_slice(), &[CodecType::Linear]), + ), + ( + "blockwise_linear".to_string(), + serialize_and_load_u64_based_column_values( + &data.as_slice(), + &[CodecType::BlockwiseLinear], + ), + ), + ]; + + let mut group: InputGroup = InputGroup::new_with_inputs(inputs); + + group.register("fastfield_get", |col: &Col| { + let mut sum = 0u64; + for pos in value_iter() { + sum = sum.wrapping_add(col.get_val(pos as u32)); + } + black_box(sum); + }); + + group.run(); +} diff --git a/columnar/benches/bench_create_column_values.rs b/columnar/benches/bench_create_column_values.rs new file mode 100644 index 000000000..aa04e0661 --- /dev/null +++ b/columnar/benches/bench_create_column_values.rs @@ -0,0 +1,44 @@ +use binggan::{InputGroup, black_box}; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; +use tantivy_columnar::column_values::{CodecType, serialize_u64_based_column_values}; + +fn get_data() -> Vec { + let mut rng = StdRng::seed_from_u64(2u64); + let mut data: Vec<_> = (100..55_000_u64) + .map(|num| num + rng.r#gen::() as u64) + .collect(); + data.push(99_000); + data.insert(1000, 2000); + data.insert(2000, 100); + data.insert(3000, 4100); + data.insert(4000, 100); + data.insert(5000, 800); + data +} + +fn main() { + let data = get_data(); + let mut group: InputGroup<(CodecType, Vec)> = InputGroup::new_with_inputs(vec![ + ( + "bitpacked codec".to_string(), + (CodecType::Bitpacked, data.clone()), + ), + ( + "linear codec".to_string(), + (CodecType::Linear, data.clone()), + ), + ( + "blockwise linear codec".to_string(), + (CodecType::BlockwiseLinear, data.clone()), + ), + ]); + + group.register("serialize column_values", |data| { + let mut buffer = Vec::new(); + serialize_u64_based_column_values(&data.1.as_slice(), &[data.0], &mut buffer).unwrap(); + black_box(buffer.len()); + }); + + group.run(); +} diff --git a/columnar/benches/bench_first_vals.rs b/columnar/benches/bench_first_vals.rs index cd0af0e1a..0ce02ee6a 100644 --- a/columnar/benches/bench_first_vals.rs +++ b/columnar/benches/bench_first_vals.rs @@ -1,12 +1,9 @@ -#![feature(test)] -extern crate test; - use std::sync::Arc; +use binggan::{InputGroup, black_box}; use rand::prelude::*; use tantivy_columnar::column_values::{CodecType, serialize_and_load_u64_based_column_values}; use tantivy_columnar::*; -use test::{Bencher, black_box}; struct Columns { pub optional: Column, @@ -68,88 +65,45 @@ pub fn serialize_and_load(column: &[u64], codec_type: CodecType) -> Arc> = vec![None; 64]; - let fetch_docids = (0..64).collect::>(); - b.iter(move || { + + group.register("first_block_fetch", |column| { + let mut block: Vec> = vec![None; 64]; + let fetch_docids = (0..64).collect::>(); column.first_vals(&fetch_docids, &mut block); - block[0] + black_box(block[0]); }); -} -fn run_bench_on_column_block_single_calls(b: &mut Bencher, column: Column) { - let mut block: Vec> = vec![None; 64]; - let fetch_docids = (0..64).collect::>(); - b.iter(move || { + + group.register("first_block_single_calls", |column| { + let mut block: Vec> = vec![None; 64]; + let fetch_docids = (0..64).collect::>(); for i in 0..fetch_docids.len() { block[i] = column.first(fetch_docids[i]); } - block[0] + black_box(block[0]); }); -} -/// Column first method -#[bench] -fn bench_get_first_on_full_column_full_scan(b: &mut Bencher) { - let column = get_test_columns().full; - run_bench_on_column_full_scan(b, column); -} - -#[bench] -fn bench_get_first_on_optional_column_full_scan(b: &mut Bencher) { - let column = get_test_columns().optional; - run_bench_on_column_full_scan(b, column); -} - -#[bench] -fn bench_get_first_on_multi_column_full_scan(b: &mut Bencher) { - let column = get_test_columns().multi; - run_bench_on_column_full_scan(b, column); -} - -/// Block fetch column accessor -#[bench] -fn bench_get_block_first_on_optional_column(b: &mut Bencher) { - let column = get_test_columns().optional; - run_bench_on_column_block_fetch(b, column); -} - -#[bench] -fn bench_get_block_first_on_multi_column(b: &mut Bencher) { - let column = get_test_columns().multi; - run_bench_on_column_block_fetch(b, column); -} - -#[bench] -fn bench_get_block_first_on_full_column(b: &mut Bencher) { - let column = get_test_columns().full; - run_bench_on_column_block_fetch(b, column); -} - -#[bench] -fn bench_get_block_first_on_optional_column_single_calls(b: &mut Bencher) { - let column = get_test_columns().optional; - run_bench_on_column_block_single_calls(b, column); -} - -#[bench] -fn bench_get_block_first_on_multi_column_single_calls(b: &mut Bencher) { - let column = get_test_columns().multi; - run_bench_on_column_block_single_calls(b, column); -} - -#[bench] -fn bench_get_block_first_on_full_column_single_calls(b: &mut Bencher) { - let column = get_test_columns().full; - run_bench_on_column_block_single_calls(b, column); + group.run(); } diff --git a/columnar/benches/bench_optional_index.rs b/columnar/benches/bench_optional_index.rs new file mode 100644 index 000000000..c157f1455 --- /dev/null +++ b/columnar/benches/bench_optional_index.rs @@ -0,0 +1,106 @@ +use binggan::{InputGroup, black_box}; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; +use tantivy_columnar::column_index::{OptionalIndex, Set}; + +const TOTAL_NUM_VALUES: u32 = 1_000_000; + +fn gen_optional_index(fill_ratio: f64) -> OptionalIndex { + let mut rng: StdRng = StdRng::from_seed([1u8; 32]); + let vals: Vec = (0..TOTAL_NUM_VALUES) + .map(|_| rng.gen_bool(fill_ratio)) + .enumerate() + .filter(|(_pos, val)| *val) + .map(|(pos, _)| pos as u32) + .collect(); + OptionalIndex::for_test(TOTAL_NUM_VALUES, &vals) +} + +fn random_range_iterator( + start: u32, + end: u32, + avg_step_size: u32, + avg_deviation: u32, +) -> impl Iterator { + let mut rng: StdRng = StdRng::from_seed([1u8; 32]); + let mut current = start; + std::iter::from_fn(move || { + current += rng.gen_range(avg_step_size - avg_deviation..=avg_step_size + avg_deviation); + if current >= end { None } else { Some(current) } + }) +} + +fn n_percent_step_iterator(percent: f32, num_values: u32) -> impl Iterator { + let ratio = percent / 100.0; + let step_size = (1f32 / ratio) as u32; + let deviation = step_size - 1; + random_range_iterator(0, num_values, step_size, deviation) +} + +fn walk_over_data(codec: &OptionalIndex, avg_step_size: u32) -> Option { + walk_over_data_from_positions( + codec, + random_range_iterator(0, TOTAL_NUM_VALUES, avg_step_size, 0), + ) +} + +fn walk_over_data_from_positions( + codec: &OptionalIndex, + positions: impl Iterator, +) -> Option { + let mut dense_idx: Option = None; + for idx in positions { + dense_idx = dense_idx.or(codec.rank_if_exists(idx)); + } + dense_idx +} + +fn main() { + // Build separate inputs for each fill ratio. + let inputs: Vec<(String, OptionalIndex)> = vec![ + ("fill=1%".to_string(), gen_optional_index(0.01)), + ("fill=5%".to_string(), gen_optional_index(0.05)), + ("fill=10%".to_string(), gen_optional_index(0.10)), + ("fill=50%".to_string(), gen_optional_index(0.50)), + ("fill=90%".to_string(), gen_optional_index(0.90)), + ]; + + let mut group: InputGroup = InputGroup::new_with_inputs(inputs); + + // Translate orig->codec (rank_if_exists) with sampling + group.register("orig_to_codec_10pct_hit", |codec: &OptionalIndex| { + black_box(walk_over_data(codec, 100)); + }); + group.register("orig_to_codec_1pct_hit", |codec: &OptionalIndex| { + black_box(walk_over_data(codec, 1000)); + }); + group.register("orig_to_codec_full_scan", |codec: &OptionalIndex| { + black_box(walk_over_data_from_positions(codec, 0..TOTAL_NUM_VALUES)); + }); + + // Translate codec->orig (select/select_batch) on sampled ranks + fn bench_translate_codec_to_orig_util(codec: &OptionalIndex, percent_hit: f32) { + let num_non_nulls = codec.num_non_nulls(); + let idxs: Vec = if percent_hit == 100.0f32 { + (0..num_non_nulls).collect() + } else { + n_percent_step_iterator(percent_hit, num_non_nulls).collect() + }; + let mut output = vec![0u32; idxs.len()]; + output.copy_from_slice(&idxs[..]); + codec.select_batch(&mut output); + black_box(output); + } + + group.register("codec_to_orig_0.005pct_hit", |codec: &OptionalIndex| { + bench_translate_codec_to_orig_util(codec, 0.005); + }); + group.register("codec_to_orig_10pct_hit", |codec: &OptionalIndex| { + bench_translate_codec_to_orig_util(codec, 10.0); + }); + group.register("codec_to_orig_full_scan", |codec: &OptionalIndex| { + bench_translate_codec_to_orig_util(codec, 100.0); + }); + + group.run(); +} diff --git a/columnar/benches/bench_values_u128.rs b/columnar/benches/bench_values_u128.rs index 0297fbe73..e0b4f0a1f 100644 --- a/columnar/benches/bench_values_u128.rs +++ b/columnar/benches/bench_values_u128.rs @@ -1,15 +1,12 @@ -#![feature(test)] - use std::ops::RangeInclusive; use std::sync::Arc; +use binggan::{InputGroup, black_box}; use common::OwnedBytes; use rand::rngs::StdRng; use rand::seq::SliceRandom; use rand::{Rng, SeedableRng, random}; use tantivy_columnar::ColumnValues; -use test::Bencher; -extern crate test; // TODO does this make sense for IPv6 ? fn generate_random() -> Vec { @@ -47,78 +44,77 @@ fn get_data_50percent_item() -> Vec { } data.push(SINGLE_ITEM); data.shuffle(&mut rng); - let data = data.iter().map(|el| *el as u128).collect::>(); - data + data.iter().map(|el| *el as u128).collect::>() } -#[bench] -fn bench_intfastfield_getrange_u128_50percent_hit(b: &mut Bencher) { +fn main() { let data = get_data_50percent_item(); - let column = get_u128_column_from_data(&data); + let column_range = get_u128_column_from_data(&data); + let column_random = get_u128_column_random(); - b.iter(|| { + struct Inputs { + data: Vec, + column_range: Arc>, + column_random: Arc>, + } + + let inputs = Inputs { + data, + column_range, + column_random, + }; + let mut group: InputGroup = + InputGroup::new_with_inputs(vec![("u128 benches".to_string(), inputs)]); + + group.register( + "intfastfield_getrange_u128_50percent_hit", + |inp: &Inputs| { + let mut positions = Vec::new(); + inp.column_range.get_row_ids_for_value_range( + *FIFTY_PERCENT_RANGE.start() as u128..=*FIFTY_PERCENT_RANGE.end() as u128, + 0..inp.data.len() as u32, + &mut positions, + ); + black_box(positions.len()); + }, + ); + + group.register("intfastfield_getrange_u128_single_hit", |inp: &Inputs| { let mut positions = Vec::new(); - column.get_row_ids_for_value_range( - *FIFTY_PERCENT_RANGE.start() as u128..=*FIFTY_PERCENT_RANGE.end() as u128, - 0..data.len() as u32, - &mut positions, - ); - positions - }); -} - -#[bench] -fn bench_intfastfield_getrange_u128_single_hit(b: &mut Bencher) { - let data = get_data_50percent_item(); - let column = get_u128_column_from_data(&data); - - b.iter(|| { - let mut positions = Vec::new(); - column.get_row_ids_for_value_range( + inp.column_range.get_row_ids_for_value_range( *SINGLE_ITEM_RANGE.start() as u128..=*SINGLE_ITEM_RANGE.end() as u128, - 0..data.len() as u32, + 0..inp.data.len() as u32, &mut positions, ); - positions + black_box(positions.len()); }); -} -#[bench] -fn bench_intfastfield_getrange_u128_hit_all(b: &mut Bencher) { - let data = get_data_50percent_item(); - let column = get_u128_column_from_data(&data); - - b.iter(|| { + group.register("intfastfield_getrange_u128_hit_all", |inp: &Inputs| { let mut positions = Vec::new(); - column.get_row_ids_for_value_range(0..=u128::MAX, 0..data.len() as u32, &mut positions); - positions + inp.column_range.get_row_ids_for_value_range( + 0..=u128::MAX, + 0..inp.data.len() as u32, + &mut positions, + ); + black_box(positions.len()); }); -} -// U128 RANGE END -#[bench] -fn bench_intfastfield_scan_all_fflookup_u128(b: &mut Bencher) { - let column = get_u128_column_random(); - - b.iter(|| { + group.register("intfastfield_scan_all_fflookup_u128", |inp: &Inputs| { let mut a = 0u128; - for i in 0u64..column.num_vals() as u64 { - a += column.get_val(i as u32); + for i in 0u64..inp.column_random.num_vals() as u64 { + a += inp.column_random.get_val(i as u32); } - a + black_box(a); }); -} -#[bench] -fn bench_intfastfield_jumpy_stride5_u128(b: &mut Bencher) { - let column = get_u128_column_random(); - - b.iter(|| { - let n = column.num_vals(); + group.register("intfastfield_jumpy_stride5_u128", |inp: &Inputs| { + let n = inp.column_random.num_vals(); let mut a = 0u128; for i in (0..n / 5).map(|val| val * 5) { - a += column.get_val(i); + a += inp.column_random.get_val(i); } - a + black_box(a); }); + + group.run(); } diff --git a/columnar/benches/bench_values_u64.rs b/columnar/benches/bench_values_u64.rs index 26de02e35..36711c776 100644 --- a/columnar/benches/bench_values_u64.rs +++ b/columnar/benches/bench_values_u64.rs @@ -1,13 +1,10 @@ -#![feature(test)] -extern crate test; - use std::ops::RangeInclusive; use std::sync::Arc; +use binggan::{InputGroup, black_box}; use rand::prelude::*; use tantivy_columnar::column_values::{CodecType, serialize_and_load_u64_based_column_values}; use tantivy_columnar::*; -use test::Bencher; // Warning: this generates the same permutation at each call fn generate_permutation() -> Vec { @@ -27,37 +24,11 @@ pub fn serialize_and_load(column: &[u64], codec_type: CodecType) -> Arc> = serialize_and_load(&permutation, CodecType::Bitpacked); - b.iter(|| { - let mut a = 0u64; - for _ in 0..n { - a = column.get_val(a as u32); - } - a - }); -} - const FIFTY_PERCENT_RANGE: RangeInclusive = 1..=50; const SINGLE_ITEM: u64 = 90; const SINGLE_ITEM_RANGE: RangeInclusive = 90..=90; const ONE_PERCENT_ITEM_RANGE: RangeInclusive = 49..=49; + fn get_data_50percent_item() -> Vec { let mut rng = StdRng::from_seed([1u8; 32]); @@ -69,135 +40,122 @@ fn get_data_50percent_item() -> Vec { data.push(SINGLE_ITEM); data.shuffle(&mut rng); - let data = data.iter().map(|el| *el as u128).collect::>(); - data + data.iter().map(|el| *el as u128).collect::>() } -// U64 RANGE START -#[bench] -fn bench_intfastfield_getrange_u64_50percent_hit(b: &mut Bencher) { - let data = get_data_50percent_item(); - let data = data.iter().map(|el| *el as u64).collect::>(); - let column: Arc> = serialize_and_load(&data, CodecType::Bitpacked); - b.iter(|| { - let mut positions = Vec::new(); - column.get_row_ids_for_value_range( - FIFTY_PERCENT_RANGE, - 0..data.len() as u32, - &mut positions, - ); - positions - }); -} +type VecCol = (Vec, Arc>); -#[bench] -fn bench_intfastfield_getrange_u64_1percent_hit(b: &mut Bencher) { - let data = get_data_50percent_item(); - let data = data.iter().map(|el| *el as u64).collect::>(); - let column: Arc> = serialize_and_load(&data, CodecType::Bitpacked); - - b.iter(|| { - let mut positions = Vec::new(); - column.get_row_ids_for_value_range( - ONE_PERCENT_ITEM_RANGE, - 0..data.len() as u32, - &mut positions, - ); - positions - }); -} - -#[bench] -fn bench_intfastfield_getrange_u64_single_hit(b: &mut Bencher) { - let data = get_data_50percent_item(); - let data = data.iter().map(|el| *el as u64).collect::>(); - let column: Arc> = serialize_and_load(&data, CodecType::Bitpacked); - - b.iter(|| { - let mut positions = Vec::new(); - column.get_row_ids_for_value_range(SINGLE_ITEM_RANGE, 0..data.len() as u32, &mut positions); - positions - }); -} - -#[bench] -fn bench_intfastfield_getrange_u64_hit_all(b: &mut Bencher) { - let data = get_data_50percent_item(); - let data = data.iter().map(|el| *el as u64).collect::>(); - let column: Arc> = serialize_and_load(&data, CodecType::Bitpacked); - - b.iter(|| { - let mut positions = Vec::new(); - column.get_row_ids_for_value_range(0..=u64::MAX, 0..data.len() as u32, &mut positions); - positions - }); -} -// U64 RANGE END - -#[bench] -fn bench_intfastfield_stride7_vec(b: &mut Bencher) { +fn bench_access() { let permutation = generate_permutation(); - let n = permutation.len(); - b.iter(|| { + let column_perm: Arc> = + serialize_and_load(&permutation, CodecType::Bitpacked); + + let permutation_gcd = generate_permutation_gcd(); + let column_perm_gcd: Arc> = + serialize_and_load(&permutation_gcd, CodecType::Bitpacked); + + let mut group: InputGroup = InputGroup::new_with_inputs(vec![ + ( + "access".to_string(), + (permutation.clone(), column_perm.clone()), + ), + ( + "access_gcd".to_string(), + (permutation_gcd.clone(), column_perm_gcd.clone()), + ), + ]); + + group.register("stride7_vec", |inp: &VecCol| { + let n = inp.0.len(); let mut a = 0u64; for i in (0..n / 7).map(|val| val * 7) { - a += permutation[i as usize]; + a += inp.0[i]; } - a + black_box(a); }); -} -#[bench] -fn bench_intfastfield_stride7_fflookup(b: &mut Bencher) { - let permutation = generate_permutation(); - let n = permutation.len(); - let column: Arc> = serialize_and_load(&permutation, CodecType::Bitpacked); - b.iter(|| { - let mut a = 0; + group.register("fullscan_vec", |inp: &VecCol| { + let mut a = 0u64; + for i in 0..inp.0.len() { + a += inp.0[i]; + } + black_box(a); + }); + + group.register("stride7_column_values", |inp: &VecCol| { + let n = inp.1.num_vals() as usize; + let mut a = 0u64; for i in (0..n / 7).map(|val| val * 7) { - a += column.get_val(i as u32); + a += inp.1.get_val(i as u32); } - a + black_box(a); }); -} -#[bench] -fn bench_intfastfield_scan_all_fflookup(b: &mut Bencher) { - let permutation = generate_permutation(); - let n = permutation.len(); - let column: Arc> = serialize_and_load(&permutation, CodecType::Bitpacked); - let column_ref = column.as_ref(); - b.iter(|| { - let mut a = 0u64; - for i in 0u32..n as u32 { - a += column_ref.get_val(i); - } - a - }); -} - -#[bench] -fn bench_intfastfield_scan_all_fflookup_gcd(b: &mut Bencher) { - let permutation = generate_permutation_gcd(); - let n = permutation.len(); - let column: Arc> = serialize_and_load(&permutation, CodecType::Bitpacked); - b.iter(|| { + group.register("fullscan_column_values", |inp: &VecCol| { let mut a = 0u64; + let n = inp.1.num_vals() as usize; for i in 0..n { - a += column.get_val(i as u32); + a += inp.1.get_val(i as u32); } - a + black_box(a); }); + + group.run(); } -#[bench] -fn bench_intfastfield_scan_all_vec(b: &mut Bencher) { - let permutation = generate_permutation(); - b.iter(|| { - let mut a = 0u64; - for i in 0..permutation.len() { - a += permutation[i as usize] as u64; - } - a - }); +fn bench_range() { + let data_50 = get_data_50percent_item(); + let data_u64 = data_50.iter().map(|el| *el as u64).collect::>(); + let column_data: Arc> = + serialize_and_load(&data_u64, CodecType::Bitpacked); + + let mut group: InputGroup>> = + InputGroup::new_with_inputs(vec![("dist_50pct_item".to_string(), column_data.clone())]); + + group.register( + "fastfield_getrange_u64_50percent_hit", + |col: &Arc>| { + let mut positions = Vec::new(); + col.get_row_ids_for_value_range(FIFTY_PERCENT_RANGE, 0..col.num_vals(), &mut positions); + black_box(positions.len()); + }, + ); + + group.register( + "fastfield_getrange_u64_1percent_hit", + |col: &Arc>| { + let mut positions = Vec::new(); + col.get_row_ids_for_value_range( + ONE_PERCENT_ITEM_RANGE, + 0..col.num_vals(), + &mut positions, + ); + black_box(positions.len()); + }, + ); + + group.register( + "fastfield_getrange_u64_single_hit", + |col: &Arc>| { + let mut positions = Vec::new(); + col.get_row_ids_for_value_range(SINGLE_ITEM_RANGE, 0..col.num_vals(), &mut positions); + black_box(positions.len()); + }, + ); + + group.register( + "fastfield_getrange_u64_hit_all", + |col: &Arc>| { + let mut positions = Vec::new(); + col.get_row_ids_for_value_range(0..=u64::MAX, 0..col.num_vals(), &mut positions); + black_box(positions.len()); + }, + ); + + group.run(); +} + +fn main() { + bench_access(); + bench_range(); } diff --git a/columnar/src/column_index/optional_index/tests.rs b/columnar/src/column_index/optional_index/tests.rs index 205095d91..160cb454e 100644 --- a/columnar/src/column_index/optional_index/tests.rs +++ b/columnar/src/column_index/optional_index/tests.rs @@ -219,170 +219,3 @@ fn test_optional_index_for_tests() { assert!(!optional_index.contains(3)); assert_eq!(optional_index.num_docs(), 4); } - -#[cfg(all(test, feature = "unstable"))] -mod bench { - - use rand::rngs::StdRng; - use rand::{Rng, SeedableRng}; - use test::Bencher; - - use super::*; - - const TOTAL_NUM_VALUES: u32 = 1_000_000; - fn gen_bools(fill_ratio: f64) -> OptionalIndex { - let mut out = Vec::new(); - let mut rng: StdRng = StdRng::from_seed([1u8; 32]); - let vals: Vec = (0..TOTAL_NUM_VALUES) - .map(|_| rng.gen_bool(fill_ratio)) - .enumerate() - .filter(|(_pos, val)| *val) - .map(|(pos, _)| pos as RowId) - .collect(); - serialize_optional_index(&&vals[..], TOTAL_NUM_VALUES, &mut out).unwrap(); - - open_optional_index(OwnedBytes::new(out)).unwrap() - } - - fn random_range_iterator( - start: u32, - end: u32, - avg_step_size: u32, - avg_deviation: u32, - ) -> impl Iterator { - let mut rng: StdRng = StdRng::from_seed([1u8; 32]); - let mut current = start; - std::iter::from_fn(move || { - current += rng.gen_range(avg_step_size - avg_deviation..=avg_step_size + avg_deviation); - if current >= end { None } else { Some(current) } - }) - } - - fn n_percent_step_iterator(percent: f32, num_values: u32) -> impl Iterator { - let ratio = percent / 100.0; - let step_size = (1f32 / ratio) as u32; - let deviation = step_size - 1; - random_range_iterator(0, num_values, step_size, deviation) - } - - fn walk_over_data(codec: &OptionalIndex, avg_step_size: u32) -> Option { - walk_over_data_from_positions( - codec, - random_range_iterator(0, TOTAL_NUM_VALUES, avg_step_size, 0), - ) - } - - fn walk_over_data_from_positions( - codec: &OptionalIndex, - positions: impl Iterator, - ) -> Option { - let mut dense_idx: Option = None; - for idx in positions { - dense_idx = dense_idx.or(codec.rank_if_exists(idx)); - } - dense_idx - } - - #[bench] - fn bench_translate_orig_to_codec_1percent_filled_10percent_hit(bench: &mut Bencher) { - let codec = gen_bools(0.01f64); - bench.iter(|| walk_over_data(&codec, 100)); - } - - #[bench] - fn bench_translate_orig_to_codec_5percent_filled_10percent_hit(bench: &mut Bencher) { - let codec = gen_bools(0.05f64); - bench.iter(|| walk_over_data(&codec, 100)); - } - - #[bench] - fn bench_translate_orig_to_codec_5percent_filled_1percent_hit(bench: &mut Bencher) { - let codec = gen_bools(0.05f64); - bench.iter(|| walk_over_data(&codec, 1000)); - } - - #[bench] - fn bench_translate_orig_to_codec_full_scan_1percent_filled(bench: &mut Bencher) { - let codec = gen_bools(0.01f64); - bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES)); - } - - #[bench] - fn bench_translate_orig_to_codec_full_scan_10percent_filled(bench: &mut Bencher) { - let codec = gen_bools(0.1f64); - bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES)); - } - - #[bench] - fn bench_translate_orig_to_codec_full_scan_90percent_filled(bench: &mut Bencher) { - let codec = gen_bools(0.9f64); - bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES)); - } - - #[bench] - fn bench_translate_orig_to_codec_10percent_filled_1percent_hit(bench: &mut Bencher) { - let codec = gen_bools(0.1f64); - bench.iter(|| walk_over_data(&codec, 100)); - } - - #[bench] - fn bench_translate_orig_to_codec_50percent_filled_1percent_hit(bench: &mut Bencher) { - let codec = gen_bools(0.5f64); - bench.iter(|| walk_over_data(&codec, 100)); - } - - #[bench] - fn bench_translate_orig_to_codec_90percent_filled_1percent_hit(bench: &mut Bencher) { - let codec = gen_bools(0.9f64); - bench.iter(|| walk_over_data(&codec, 100)); - } - - #[bench] - fn bench_translate_codec_to_orig_1percent_filled_0comma005percent_hit(bench: &mut Bencher) { - bench_translate_codec_to_orig_util(0.01f64, 0.005f32, bench); - } - - #[bench] - fn bench_translate_codec_to_orig_10percent_filled_0comma005percent_hit(bench: &mut Bencher) { - bench_translate_codec_to_orig_util(0.1f64, 0.005f32, bench); - } - - #[bench] - fn bench_translate_codec_to_orig_1percent_filled_10percent_hit(bench: &mut Bencher) { - bench_translate_codec_to_orig_util(0.01f64, 10f32, bench); - } - - #[bench] - fn bench_translate_codec_to_orig_1percent_filled_full_scan(bench: &mut Bencher) { - bench_translate_codec_to_orig_util(0.01f64, 100f32, bench); - } - - fn bench_translate_codec_to_orig_util( - percent_filled: f64, - percent_hit: f32, - bench: &mut Bencher, - ) { - let codec = gen_bools(percent_filled); - let num_non_nulls = codec.num_non_nulls(); - let idxs: Vec = if percent_hit == 100.0f32 { - (0..num_non_nulls).collect() - } else { - n_percent_step_iterator(percent_hit, num_non_nulls).collect() - }; - let mut output = vec![0u32; idxs.len()]; - bench.iter(|| { - output.copy_from_slice(&idxs[..]); - codec.select_batch(&mut output); - }); - } - - #[bench] - fn bench_translate_codec_to_orig_90percent_filled_0comma005percent_hit(bench: &mut Bencher) { - bench_translate_codec_to_orig_util(0.9f64, 0.005, bench); - } - - #[bench] - fn bench_translate_codec_to_orig_90percent_filled_full_scan(bench: &mut Bencher) { - bench_translate_codec_to_orig_util(0.9f64, 100.0f32, bench); - } -} diff --git a/columnar/src/column_values/bench.rs b/columnar/src/column_values/bench.rs deleted file mode 100644 index 5623b5fb3..000000000 --- a/columnar/src/column_values/bench.rs +++ /dev/null @@ -1,139 +0,0 @@ -use std::sync::Arc; - -use common::OwnedBytes; -use rand::rngs::StdRng; -use rand::{Rng, SeedableRng}; -use test::{self, Bencher}; - -use super::*; -use crate::column_values::u64_based::*; - -fn get_data() -> Vec { - let mut rng = StdRng::seed_from_u64(2u64); - let mut data: Vec<_> = (100..55000_u64) - .map(|num| num + rng.r#gen::() as u64) - .collect(); - data.push(99_000); - data.insert(1000, 2000); - data.insert(2000, 100); - data.insert(3000, 4100); - data.insert(4000, 100); - data.insert(5000, 800); - data -} - -fn compute_stats(vals: impl Iterator) -> ColumnStats { - let mut stats_collector = StatsCollector::default(); - for val in vals { - stats_collector.collect(val); - } - stats_collector.stats() -} - -#[inline(never)] -fn value_iter() -> impl Iterator { - 0..20_000 -} - -fn get_reader_for_bench(data: &[u64]) -> Codec::ColumnValues { - let mut bytes = Vec::new(); - let stats = compute_stats(data.iter().cloned()); - let mut codec_serializer = Codec::estimator(); - for val in data { - codec_serializer.collect(*val); - } - codec_serializer - .serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes) - .unwrap(); - - Codec::load(OwnedBytes::new(bytes)).unwrap() -} - -fn bench_get(b: &mut Bencher, data: &[u64]) { - let col = get_reader_for_bench::(data); - b.iter(|| { - let mut sum = 0u64; - for pos in value_iter() { - let val = col.get_val(pos as u32); - sum = sum.wrapping_add(val); - } - sum - }); -} - -#[inline(never)] -fn bench_get_dynamic_helper(b: &mut Bencher, col: Arc) { - b.iter(|| { - let mut sum = 0u64; - for pos in value_iter() { - let val = col.get_val(pos as u32); - sum = sum.wrapping_add(val); - } - sum - }); -} - -fn bench_get_dynamic(b: &mut Bencher, data: &[u64]) { - let col = Arc::new(get_reader_for_bench::(data)); - bench_get_dynamic_helper(b, col); -} -fn bench_create(b: &mut Bencher, data: &[u64]) { - let stats = compute_stats(data.iter().cloned()); - - let mut bytes = Vec::new(); - b.iter(|| { - bytes.clear(); - let mut codec_serializer = Codec::estimator(); - for val in data.iter().take(1024) { - codec_serializer.collect(*val); - } - - codec_serializer.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes) - }); -} - -#[bench] -fn bench_fastfield_bitpack_create(b: &mut Bencher) { - let data: Vec<_> = get_data(); - bench_create::(b, &data); -} -#[bench] -fn bench_fastfield_linearinterpol_create(b: &mut Bencher) { - let data: Vec<_> = get_data(); - bench_create::(b, &data); -} -#[bench] -fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) { - let data: Vec<_> = get_data(); - bench_create::(b, &data); -} -#[bench] -fn bench_fastfield_bitpack_get(b: &mut Bencher) { - let data: Vec<_> = get_data(); - bench_get::(b, &data); -} -#[bench] -fn bench_fastfield_bitpack_get_dynamic(b: &mut Bencher) { - let data: Vec<_> = get_data(); - bench_get_dynamic::(b, &data); -} -#[bench] -fn bench_fastfield_linearinterpol_get(b: &mut Bencher) { - let data: Vec<_> = get_data(); - bench_get::(b, &data); -} -#[bench] -fn bench_fastfield_linearinterpol_get_dynamic(b: &mut Bencher) { - let data: Vec<_> = get_data(); - bench_get_dynamic::(b, &data); -} -#[bench] -fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) { - let data: Vec<_> = get_data(); - bench_get::(b, &data); -} -#[bench] -fn bench_fastfield_multilinearinterpol_get_dynamic(b: &mut Bencher) { - let data: Vec<_> = get_data(); - bench_get_dynamic::(b, &data); -} diff --git a/columnar/src/column_values/mod.rs b/columnar/src/column_values/mod.rs index bc61c752e..f26bf6d33 100644 --- a/columnar/src/column_values/mod.rs +++ b/columnar/src/column_values/mod.rs @@ -242,6 +242,3 @@ impl ColumnValues for Arc