From e8a6e123ae520e907ce534bf9301343f9868b3dc Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 27 Aug 2022 21:26:48 +0200 Subject: [PATCH] Small refactoring estimate. --- fastfield_codecs/src/bitpacked.rs | 12 ++--- fastfield_codecs/src/blockwise_linear.rs | 33 ++++++------ fastfield_codecs/src/lib.rs | 64 ++++++++++++------------ fastfield_codecs/src/linear.rs | 37 +++++++------- fastfield_codecs/src/main.rs | 47 +++++++---------- src/fastfield/serializer/mod.rs | 6 +-- 6 files changed, 92 insertions(+), 107 deletions(-) diff --git a/fastfield_codecs/src/bitpacked.rs b/fastfield_codecs/src/bitpacked.rs index 76f9785ec..4270877bd 100644 --- a/fastfield_codecs/src/bitpacked.rs +++ b/fastfield_codecs/src/bitpacked.rs @@ -141,21 +141,19 @@ impl FastFieldCodec for BitpackedCodec { Ok(()) } - fn is_applicable(_fastfield_accessor: &impl FastFieldDataAccess) -> bool { - true - } - fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 { + + fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option { let amplitude = fastfield_accessor.max_value() - fastfield_accessor.min_value(); let num_bits = compute_num_bits(amplitude); let num_bits_uncompressed = 64; - num_bits as f32 / num_bits_uncompressed as f32 + Some(num_bits as f32 / num_bits_uncompressed as f32) } } #[cfg(test)] mod tests { use super::*; - use crate::tests::get_codec_test_data_sets; + use crate::tests::get_codec_test_datasets; fn create_and_validate(data: &[u64], name: &str) { crate::tests::create_and_validate::(data, name); @@ -163,7 +161,7 @@ mod tests { #[test] fn test_with_codec_data_sets() { - let data_sets = get_codec_test_data_sets(); + let data_sets = get_codec_test_datasets(); for (mut data, name) in data_sets { create_and_validate(&data, name); data.reverse(); diff --git a/fastfield_codecs/src/blockwise_linear.rs b/fastfield_codecs/src/blockwise_linear.rs index 6b5e380f7..619d1faca 100644 --- a/fastfield_codecs/src/blockwise_linear.rs +++ b/fastfield_codecs/src/blockwise_linear.rs @@ -289,10 +289,14 @@ impl FastFieldCodec for BlockwiseLinearCodec { Ok(()) } - fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool { - if fastfield_accessor.num_vals() < 5_000 { - return false; + /// estimation for linear interpolation is hard because, you don't know + /// where the local maxima are for the deviation of the calculated value and + /// the offset is also unknown. + fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option { + if fastfield_accessor.num_vals() < 10 * CHUNK_SIZE { + return None; } + // On serialization the offset is added to the actual value. // We need to make sure this won't run into overflow calculation issues. // For this we take the maximum theroretical offset and add this to the max value. @@ -304,14 +308,9 @@ impl FastFieldCodec for BlockwiseLinearCodec { .checked_add(theorethical_maximum_offset) .is_none() { - return false; + return None; } - true - } - /// estimation for linear interpolation is hard because, you don't know - /// where the local maxima are for the deviation of the calculated value and - /// the offset is also unknown. - fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 { + let first_val_in_first_block = fastfield_accessor.get_val(0); let last_elem_in_first_chunk = CHUNK_SIZE.min(fastfield_accessor.num_vals()); let last_val_in_first_block = @@ -350,7 +349,7 @@ impl FastFieldCodec for BlockwiseLinearCodec { // function metadata per block + 29 * (fastfield_accessor.num_vals() / CHUNK_SIZE); let num_bits_uncompressed = 64 * fastfield_accessor.num_vals(); - num_bits as f32 / num_bits_uncompressed as f32 + Some(num_bits as f32 / num_bits_uncompressed as f32) } } @@ -365,10 +364,10 @@ fn distance + Ord>(x: T, y: T) -> T { #[cfg(test)] mod tests { use super::*; - use crate::tests::get_codec_test_data_sets; + use crate::tests::get_codec_test_datasets; - fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { - crate::tests::create_and_validate::(data, name) + fn create_and_validate(data: &[u64], name: &str) -> Option<(f32, f32)> { + crate::tests::create_and_validate::(data, name) } const HIGHEST_BIT: u64 = 1 << 63; @@ -382,7 +381,7 @@ mod tests { .map(i64_to_u64) .collect::>(); let (estimate, actual_compression) = - create_and_validate(&data, "simple monotonically large i64"); + create_and_validate(&data, "simple monotonically large i64").unwrap(); assert!(actual_compression < 0.2); assert!(estimate < 0.20); assert!(estimate > 0.15); @@ -393,7 +392,7 @@ mod tests { fn test_compression() { let data = (10..=6_000_u64).collect::>(); let (estimate, actual_compression) = - create_and_validate(&data, "simple monotonically large"); + create_and_validate(&data, "simple monotonically large").unwrap(); assert!(actual_compression < 0.2); assert!(estimate < 0.20); assert!(estimate > 0.15); @@ -402,7 +401,7 @@ mod tests { #[test] fn test_with_codec_data_sets() { - let data_sets = get_codec_test_data_sets(); + let data_sets = get_codec_test_datasets(); for (mut data, name) in data_sets { create_and_validate(&data, name); data.reverse(); diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index c12449526..172f7e0d9 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -82,15 +82,14 @@ pub trait FastFieldCodec { fastfield_accessor: &dyn FastFieldDataAccess, ) -> io::Result<()>; - /// Check if the Codec is able to compress the data - fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool; - /// Returns an estimate of the compression ratio. + /// If the codec is not applicable, returns `None`. + /// /// The baseline is uncompressed 64bit data. /// /// It could make sense to also return a value representing /// computational complexity. - fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32; + fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option; } #[derive(Debug, Clone)] @@ -152,11 +151,12 @@ mod tests { use crate::blockwise_linear::BlockwiseLinearCodec; use crate::linear::LinearCodec; - pub fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { - if !Codec::is_applicable(&data) { - return (f32::MAX, 0.0); - } - let estimation = Codec::estimate(&data); + pub fn create_and_validate( + data: &[u64], + name: &str, + ) -> Option<(f32, f32)> { + let estimation = Codec::estimate(&data)?; + let mut out: Vec = Vec::new(); Codec::serialize(&mut out, &data).unwrap(); @@ -164,16 +164,15 @@ mod tests { let reader = Codec::open_from_bytes(OwnedBytes::new(out)).unwrap(); assert_eq!(reader.num_vals(), data.len() as u64); - for (doc, orig_val) in data.iter().enumerate() { + for (doc, orig_val) in data.iter().copied().enumerate() { let val = reader.get_val(doc as u64); - if val != *orig_val { - panic!( - "val {val:?} does not match orig_val {orig_val:?}, in data set {name}, data \ - {data:?}", - ); - } + assert_eq!( + val, orig_val, + "val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data \ + `{data:?}`", + ); } - (estimation, actual_compression) + Some((estimation, actual_compression)) } proptest! { @@ -193,10 +192,10 @@ mod tests { } - pub fn get_codec_test_data_sets() -> Vec<(Vec, &'static str)> { + pub fn get_codec_test_datasets() -> Vec<(Vec, &'static str)> { let mut data_and_names = vec![]; - let data = (10..=20_u64).collect::>(); + let data = (10..=10_000_u64).collect::>(); data_and_names.push((data, "simple monotonically increasing")); data_and_names.push(( @@ -211,12 +210,13 @@ mod tests { fn test_codec() { let codec_name = format!("{:?}", C::CODEC_TYPE); - for (data, dataset_name) in get_codec_test_data_sets() { - let (estimate, actual) = crate::tests::create_and_validate::(&data, dataset_name); - let result = if estimate == f32::MAX { - "Disabled".to_string() - } else { + for (data, dataset_name) in get_codec_test_datasets() { + let estimate_actual_opt: Option<(f32, f32)> = + crate::tests::create_and_validate::(&data, dataset_name); + let result = if let Some((estimate, actual)) = estimate_actual_opt { format!("Estimate `{estimate}` Actual `{actual}`") + } else { + "Disabled".to_string() }; println!("Codec {codec_name}, DataSet {dataset_name}, {result}"); } @@ -240,37 +240,37 @@ mod tests { fn estimation_good_interpolation_case() { let data = (10..=20000_u64).collect::>(); - let linear_interpol_estimation = LinearCodec::estimate(&data); + let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap(); assert_le!(linear_interpol_estimation, 0.01); - let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data); + let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data).unwrap(); assert_le!(multi_linear_interpol_estimation, 0.2); assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation); - let bitpacked_estimation = BitpackedCodec::estimate(&data); + let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap(); assert_le!(linear_interpol_estimation, bitpacked_estimation); } #[test] fn estimation_test_bad_interpolation_case() { let data = vec![200, 10, 10, 10, 10, 1000, 20]; - let linear_interpol_estimation = LinearCodec::estimate(&data); + let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap(); assert_le!(linear_interpol_estimation, 0.32); - let bitpacked_estimation = BitpackedCodec::estimate(&data); + let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap(); assert_le!(bitpacked_estimation, linear_interpol_estimation); } #[test] fn estimation_test_bad_interpolation_case_monotonically_increasing() { - let mut data = (200..=20000_u64).collect::>(); + let mut data: Vec = (200..=20000_u64).collect(); data.push(1_000_000); // in this case the linear interpolation can't in fact not be worse than bitpacking, // but the estimator adds some threshold, which leads to estimated worse behavior - let linear_interpol_estimation = LinearCodec::estimate(&data); + let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap(); assert_le!(linear_interpol_estimation, 0.35); - let bitpacked_estimation = BitpackedCodec::estimate(&data); + let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap(); assert_le!(bitpacked_estimation, 0.32); assert_le!(bitpacked_estimation, linear_interpol_estimation); } diff --git a/fastfield_codecs/src/linear.rs b/fastfield_codecs/src/linear.rs index d2d53143d..e49b202d8 100644 --- a/fastfield_codecs/src/linear.rs +++ b/fastfield_codecs/src/linear.rs @@ -192,10 +192,15 @@ impl FastFieldCodec for LinearCodec { footer.serialize(write)?; Ok(()) } - fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool { + + /// estimation for linear interpolation is hard because, you don't know + /// where the local maxima for the deviation of the calculated value are and + /// the offset to shift all values to >=0 is also unknown. + fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option { if fastfield_accessor.num_vals() < 3 { - return false; // disable compressor for this case + return None; // disable compressor for this case } + // On serialisation the offset is added to the actual value. // We need to make sure this won't run into overflow calculation issues. // For this we take the maximum theroretical offset and add this to the max value. @@ -207,14 +212,9 @@ impl FastFieldCodec for LinearCodec { .checked_add(theorethical_maximum_offset) .is_none() { - return false; + return None; } - true - } - /// estimation for linear interpolation is hard because, you don't know - /// where the local maxima for the deviation of the calculated value are and - /// the offset to shift all values to >=0 is also unknown. - fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 { + let first_val = fastfield_accessor.get_val(0); let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1); let slope = get_slope(first_val, last_val, fastfield_accessor.num_vals()); @@ -246,7 +246,7 @@ impl FastFieldCodec for LinearCodec { * fastfield_accessor.num_vals() + LinearFooter::SIZE_IN_BYTES as u64; let num_bits_uncompressed = 64 * fastfield_accessor.num_vals(); - num_bits as f32 / num_bits_uncompressed as f32 + Some(num_bits as f32 / num_bits_uncompressed as f32) } } @@ -262,10 +262,10 @@ fn distance + Ord>(x: T, y: T) -> T { #[cfg(test)] mod tests { use super::*; - use crate::tests::get_codec_test_data_sets; + use crate::tests::get_codec_test_datasets; - fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { - crate::tests::create_and_validate::(data, name) + fn create_and_validate(data: &[u64], name: &str) -> Option<(f32, f32)> { + crate::tests::create_and_validate::(data, name) } #[test] @@ -292,15 +292,15 @@ mod tests { fn test_compression() { let data = (10..=6_000_u64).collect::>(); let (estimate, actual_compression) = - create_and_validate(&data, "simple monotonically large"); + create_and_validate(&data, "simple monotonically large").unwrap(); assert!(actual_compression < 0.01); assert!(estimate < 0.01); } #[test] - fn test_with_codec_data_sets() { - let data_sets = get_codec_test_data_sets(); + fn test_with_codec_datasets() { + let data_sets = get_codec_test_datasets(); for (mut data, name) in data_sets { create_and_validate(&data, name); data.reverse(); @@ -337,9 +337,10 @@ mod tests { #[test] fn linear_interpol_fast_field_rand() { for _ in 0..5000 { - let mut data = (0..50).map(|_| rand::random::()).collect::>(); + let mut data = (0..10_000) + .map(|_| rand::random::()) + .collect::>(); create_and_validate(&data, "random"); - data.reverse(); create_and_validate(&data, "random"); } diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs index 93204cb25..848392b66 100644 --- a/fastfield_codecs/src/main.rs +++ b/fastfield_codecs/src/main.rs @@ -1,5 +1,6 @@ #[macro_use] extern crate prettytable; +use fastfield_codecs::bitpacked::BitpackedCodec; use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec; use fastfield_codecs::linear::LinearCodec; use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldStats}; @@ -12,37 +13,30 @@ fn main() { table.add_row(row!["", "Compression Ratio", "Compression Estimation"]); for (data, data_set_name) in get_codec_test_data_sets() { - let mut results = vec![]; - let res = serialize_with_codec::(&data); - results.push(res); - let res = serialize_with_codec::(&data); - results.push(res); - let res = serialize_with_codec::(&data); - results.push(res); - - // let best_estimation_codec = results - //.iter() - //.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap()) - //.unwrap(); + let results: Vec<(f32, f32, FastFieldCodecType)> = [ + serialize_with_codec::(&data), + serialize_with_codec::(&data), + serialize_with_codec::(&data), + serialize_with_codec::(&data), + ] + .into_iter() + .flatten() + .collect(); let best_compression_ratio_codec = results .iter() - .min_by(|res1, res2| res1.partial_cmp(res2).unwrap()) + .min_by(|&res1, &res2| res1.partial_cmp(res2).unwrap()) .cloned() .unwrap(); table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")])); - for (is_applicable, est, comp, codec_type) in results { - let (est_cell, ratio_cell) = if !is_applicable { - ("Codec Disabled".to_string(), "".to_string()) - } else { - (est.to_string(), comp.to_string()) - }; + for (est, comp, codec_type) in results { + let est_cell = est.to_string(); + let ratio_cell = comp.to_string(); let style = if comp == best_compression_ratio_codec.1 { "Fb" } else { "" }; - table.add_row(Row::new(vec![ Cell::new(&format!("{codec_type:?}")).style_spec("bFg"), Cell::new(&ratio_cell).style_spec(style), @@ -91,17 +85,12 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec, &'static str)> { pub fn serialize_with_codec( data: &[u64], -) -> (bool, f32, f32, FastFieldCodecType) { - let is_applicable = C::is_applicable(&data); - if !is_applicable { - return (false, 0.0, 0.0, C::CODEC_TYPE); - } - let estimation = C::estimate(&data); - let mut out = vec![]; +) -> Option<(f32, f32, FastFieldCodecType)> { + let estimation = C::estimate(&data)?; + let mut out = Vec::new(); C::serialize(&mut out, &data).unwrap(); - let actual_compression = out.len() as f32 / (data.len() * 8) as f32; - (true, estimation, actual_compression, C::CODEC_TYPE) + Some((estimation, actual_compression, C::CODEC_TYPE)) } pub fn stats_from_vec(data: &[u64]) -> FastFieldStats { diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs index 6bbb33faf..fbda73b5a 100644 --- a/src/fastfield/serializer/mod.rs +++ b/src/fastfield/serializer/mod.rs @@ -68,11 +68,9 @@ fn codec_estimation( fastfield_accessor: &impl FastFieldDataAccess, estimations: &mut Vec<(f32, FastFieldCodecType)>, ) { - if !C::is_applicable(fastfield_accessor) { - return; + if let Some(ratio) = C::estimate(fastfield_accessor) { + estimations.push((ratio, C::CODEC_TYPE)); } - let ratio = C::estimate(fastfield_accessor); - estimations.push((ratio, C::CODEC_TYPE)); } impl CompositeFastFieldSerializer {