From 54972caa7c744f2089b28c352d489117a887a5ed Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 29 Aug 2022 11:57:33 +0200 Subject: [PATCH 1/2] remove Column impl on Vec remove Column impl on Vec to avoid function shadowing --- fastfield_codecs/src/lib.rs | 25 ++++--------------------- fastfield_codecs/src/main.rs | 29 +++++++++++++++++++++++++++-- src/indexer/merger.rs | 4 ++-- 3 files changed, 33 insertions(+), 25 deletions(-) diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index 602c7fda0..deb3e7943 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -90,6 +90,7 @@ pub struct FastFieldStats { pub num_vals: u64, } +#[cfg(test)] impl<'a> Column for &'a [u64] { fn get_val(&self, position: u64) -> u64 { self[position as usize] @@ -112,26 +113,6 @@ impl<'a> Column for &'a [u64] { } } -impl Column for Vec { - fn get_val(&self, position: u64) -> u64 { - self[position as usize] - } - fn iter<'b>(&'b self) -> Box + 'b> { - Box::new((self as &[u64]).iter().cloned()) - } - fn min_value(&self) -> u64 { - self.iter().min().unwrap_or(0) - } - - fn max_value(&self) -> u64 { - self.iter().max().unwrap_or(0) - } - - fn num_vals(&self) -> u64 { - self.len() as u64 - } -} - #[cfg(test)] mod tests { use proptest::arbitrary::any; @@ -235,6 +216,7 @@ mod tests { fn estimation_good_interpolation_case() { let data = (10..=20000_u64).collect::>(); + let data = data.as_slice(); let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap(); assert_le!(linear_interpol_estimation, 0.01); @@ -247,7 +229,7 @@ mod tests { } #[test] fn estimation_test_bad_interpolation_case() { - let data = vec![200, 10, 10, 10, 10, 1000, 20]; + let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20]; let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap(); assert_le!(linear_interpol_estimation, 0.32); @@ -259,6 +241,7 @@ mod tests { fn estimation_test_bad_interpolation_case_monotonically_increasing() { let mut data: Vec = (200..=20000_u64).collect(); data.push(1_000_000); + let data = data.as_slice(); // in this case the linear interpolation can't in fact not be worse than bitpacking, // but the estimator adds some threshold, which leads to estimated worse behavior diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs index 848392b66..cbe5b3198 100644 --- a/fastfield_codecs/src/main.rs +++ b/fastfield_codecs/src/main.rs @@ -3,9 +3,33 @@ extern crate prettytable; use fastfield_codecs::bitpacked::BitpackedCodec; use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec; use fastfield_codecs::linear::LinearCodec; -use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldStats}; +use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType, FastFieldStats}; use prettytable::{Cell, Row, Table}; +struct Data<'a>(&'a [u64]); + +impl<'a> Column for Data<'a> { + fn get_val(&self, position: u64) -> u64 { + self.0[position as usize] + } + + fn iter<'b>(&'b self) -> Box + 'b> { + Box::new(self.0.iter().cloned()) + } + + fn min_value(&self) -> u64 { + *self.0.iter().min().unwrap_or(&0) + } + + fn max_value(&self) -> u64 { + *self.0.iter().max().unwrap_or(&0) + } + + fn num_vals(&self) -> u64 { + self.0.len() as u64 + } +} + fn main() { let mut table = Table::new(); @@ -86,10 +110,11 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec, &'static str)> { pub fn serialize_with_codec( data: &[u64], ) -> Option<(f32, f32, FastFieldCodecType)> { + let data = Data(data); let estimation = C::estimate(&data)?; let mut out = Vec::new(); C::serialize(&mut out, &data).unwrap(); - let actual_compression = out.len() as f32 / (data.len() * 8) as f32; + let actual_compression = out.len() as f32 / (data.num_vals() * 8) as f32; Some((estimation, actual_compression, C::CODEC_TYPE)) } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index a3d6f9408..fa2bac324 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -133,7 +133,7 @@ impl TermOrdinalMapping { fn max_term_ord(&self) -> TermOrdinal { self.per_segment_new_term_ordinals .iter() - .flat_map(|term_ordinals| term_ordinals.iter().max()) + .flat_map(|term_ordinals| term_ordinals.iter().max().cloned()) .max() .unwrap_or_default() } @@ -784,7 +784,7 @@ impl IndexMerger { let new_doc_id: DocId = self.offsets .iter() - .position(|offset| offset > pos) + .position(|&offset| offset > pos) .expect("pos is out of bounds") as DocId - 1u32; From 7a26cc902282b36ad39f4d233e647e4dd1051abf Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 29 Aug 2022 15:49:43 +0200 Subject: [PATCH 2/2] add VecColumn --- fastfield_codecs/src/lib.rs | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index deb3e7943..ccba50065 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -90,26 +90,32 @@ pub struct FastFieldStats { pub num_vals: u64, } -#[cfg(test)] -impl<'a> Column for &'a [u64] { +struct VecColum<'a>(&'a [u64]); +impl<'a> Column for VecColum<'a> { fn get_val(&self, position: u64) -> u64 { - self[position as usize] + self.0[position as usize] } fn iter<'b>(&'b self) -> Box + 'b> { - Box::new((self as &[u64]).iter().cloned()) + Box::new(self.0.iter().cloned()) } fn min_value(&self) -> u64 { - self.iter().min().unwrap_or(0) + self.0.iter().min().cloned().unwrap_or(0) } fn max_value(&self) -> u64 { - self.iter().max().unwrap_or(0) + self.0.iter().max().cloned().unwrap_or(0) } fn num_vals(&self) -> u64 { - self.len() as u64 + self.0.len() as u64 + } +} + +impl<'a> From<&'a [u64]> for VecColum<'a> { + fn from(data: &'a [u64]) -> Self { + Self(data) } } @@ -126,10 +132,10 @@ mod tests { data: &[u64], name: &str, ) -> Option<(f32, f32)> { - let estimation = Codec::estimate(&data)?; + let estimation = Codec::estimate(&VecColum::from(data))?; let mut out: Vec = Vec::new(); - Codec::serialize(&mut out, &data).unwrap(); + Codec::serialize(&mut out, &VecColum::from(data)).unwrap(); let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0); @@ -215,8 +221,8 @@ mod tests { #[test] fn estimation_good_interpolation_case() { let data = (10..=20000_u64).collect::>(); + let data: VecColum = data.as_slice().into(); - let data = data.as_slice(); let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap(); assert_le!(linear_interpol_estimation, 0.01); @@ -231,6 +237,7 @@ mod tests { fn estimation_test_bad_interpolation_case() { let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20]; + let data: VecColum = data.into(); let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap(); assert_le!(linear_interpol_estimation, 0.32); @@ -241,7 +248,7 @@ mod tests { fn estimation_test_bad_interpolation_case_monotonically_increasing() { let mut data: Vec = (200..=20000_u64).collect(); data.push(1_000_000); - let data = data.as_slice(); + let data: VecColum = data.as_slice().into(); // in this case the linear interpolation can't in fact not be worse than bitpacking, // but the estimator adds some threshold, which leads to estimated worse behavior