From 00ebff3c16102f8e92715be85bd266acbd7771ad Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Wed, 24 Aug 2022 15:28:57 +0200 Subject: [PATCH] move fastfield stats to trait --- fastfield_codecs/benches/bench.rs | 2 +- fastfield_codecs/src/bitpacked.rs | 19 +++--- fastfield_codecs/src/lib.rs | 63 ++++++++++++++------ fastfield_codecs/src/linearinterpol.rs | 42 +++++++------ fastfield_codecs/src/main.rs | 6 +- fastfield_codecs/src/multilinearinterpol.rs | 41 ++++++------- src/fastfield/serializer/mod.rs | 62 ++++++++++---------- src/fastfield/writer.rs | 25 ++++++-- src/indexer/merger.rs | 65 ++++++++++++++++++--- 9 files changed, 205 insertions(+), 120 deletions(-) diff --git a/fastfield_codecs/benches/bench.rs b/fastfield_codecs/benches/bench.rs index 60b189c07..50cc96c65 100644 --- a/fastfield_codecs/benches/bench.rs +++ b/fastfield_codecs/benches/bench.rs @@ -45,7 +45,7 @@ mod tests { fn bench_create(b: &mut Bencher, data: &[u64]) { let mut bytes = vec![]; b.iter(|| { - S::serialize(&mut bytes, &data, stats_from_vec(data)).unwrap(); + S::serialize(&mut bytes, &data).unwrap(); }); } diff --git a/fastfield_codecs/src/bitpacked.rs b/fastfield_codecs/src/bitpacked.rs index a49c801b6..ff2eab04a 100644 --- a/fastfield_codecs/src/bitpacked.rs +++ b/fastfield_codecs/src/bitpacked.rs @@ -4,7 +4,7 @@ use common::BinarySerializable; use ownedbytes::OwnedBytes; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; -use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats}; +use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess}; /// Depending on the field type, a different /// fast field is required. @@ -112,10 +112,12 @@ impl FastFieldCodecSerializer for BitpackedFastFieldSerializer { fn serialize( write: &mut impl Write, fastfield_accessor: &dyn FastFieldDataAccess, - stats: FastFieldStats, ) -> io::Result<()> { - let mut serializer = - BitpackedFastFieldSerializerLegacy::open(write, stats.min_value, stats.max_value)?; + let mut serializer = BitpackedFastFieldSerializerLegacy::open( + write, + fastfield_accessor.min_value(), + fastfield_accessor.max_value(), + )?; for val in fastfield_accessor.iter() { serializer.add_val(val)?; @@ -124,14 +126,11 @@ impl FastFieldCodecSerializer for BitpackedFastFieldSerializer { Ok(()) } - fn is_applicable( - _fastfield_accessor: &impl FastFieldDataAccess, - _stats: FastFieldStats, - ) -> bool { + fn is_applicable(_fastfield_accessor: &impl FastFieldDataAccess) -> bool { true } - fn estimate(_fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 { - let amplitude = stats.max_value - stats.min_value; + fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 { + let amplitude = fastfield_accessor.max_value() - fastfield_accessor.min_value(); let num_bits = compute_num_bits(amplitude); let num_bits_uncompressed = 64; num_bits as f32 / num_bits_uncompressed as f32 diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index c7688f143..c97dfe6ea 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -28,14 +28,14 @@ pub trait FastFieldCodecSerializer { const ID: u8; /// Check if the Codec is able to compress the data - fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> bool; + fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool; /// Returns an estimate of the compression ratio. /// The baseline is uncompressed 64bit data. /// /// It could make sense to also return a value representing /// computational complexity. - fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32; + fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32; /// Serializes the data using the serializer into write. /// @@ -44,7 +44,6 @@ pub trait FastFieldCodecSerializer { fn serialize( write: &mut impl Write, fastfield_accessor: &dyn FastFieldDataAccess, - stats: FastFieldStats, ) -> io::Result<()>; } @@ -62,6 +61,15 @@ pub trait FastFieldDataAccess { /// Returns a iterator over the data fn iter<'a>(&'a self) -> Box + 'a>; + + /// min value of the data + fn min_value(&self) -> u64; + + /// max value of the data + fn max_value(&self) -> u64; + + /// num vals + fn num_vals(&self) -> u64; } #[derive(Debug, Clone)] @@ -80,6 +88,18 @@ impl<'a> FastFieldDataAccess for &'a [u64] { fn iter<'b>(&'b self) -> Box + 'b> { Box::new((self as &[u64]).iter().cloned()) } + + fn min_value(&self) -> u64 { + self.iter().min().unwrap_or(0) + } + + fn max_value(&self) -> u64 { + self.iter().max().unwrap_or(0) + } + + fn num_vals(&self) -> u64 { + self.len() as u64 + } } impl FastFieldDataAccess for Vec { @@ -89,6 +109,17 @@ impl FastFieldDataAccess for Vec { fn iter<'b>(&'b self) -> Box + 'b> { Box::new((self as &[u64]).iter().cloned()) } + fn min_value(&self) -> u64 { + self.iter().min().unwrap_or(0) + } + + fn max_value(&self) -> u64 { + self.iter().max().unwrap_or(0) + } + + fn num_vals(&self) -> u64 { + self.len() as u64 + } } #[cfg(test)] @@ -103,12 +134,12 @@ mod tests { data: &[u64], name: &str, ) -> (f32, f32) { - if !S::is_applicable(&data, crate::tests::stats_from_vec(data)) { + if !S::is_applicable(&data) { return (f32::MAX, 0.0); } - let estimation = S::estimate(&data, crate::tests::stats_from_vec(data)); + let estimation = S::estimate(&data); let mut out: Vec = Vec::new(); - S::serialize(&mut out, &data, crate::tests::stats_from_vec(data)).unwrap(); + S::serialize(&mut out, &data).unwrap(); let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0); @@ -184,29 +215,25 @@ mod tests { fn estimation_good_interpolation_case() { let data = (10..=20000_u64).collect::>(); - let linear_interpol_estimation = - LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data)); + let linear_interpol_estimation = LinearInterpolFastFieldSerializer::estimate(&data); assert_le!(linear_interpol_estimation, 0.01); let multi_linear_interpol_estimation = - MultiLinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data)); + MultiLinearInterpolFastFieldSerializer::estimate(&data); assert_le!(multi_linear_interpol_estimation, 0.2); assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation); - let bitpacked_estimation = - BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data)); + let bitpacked_estimation = BitpackedFastFieldSerializer::estimate(&data); assert_le!(linear_interpol_estimation, bitpacked_estimation); } #[test] fn estimation_test_bad_interpolation_case() { let data = vec![200, 10, 10, 10, 10, 1000, 20]; - let linear_interpol_estimation = - LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data)); + let linear_interpol_estimation = LinearInterpolFastFieldSerializer::estimate(&data); assert_le!(linear_interpol_estimation, 0.32); - let bitpacked_estimation = - BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data)); + let bitpacked_estimation = BitpackedFastFieldSerializer::estimate(&data); assert_le!(bitpacked_estimation, linear_interpol_estimation); } #[test] @@ -216,12 +243,10 @@ mod tests { // in this case the linear interpolation can't in fact not be worse than bitpacking, // but the estimator adds some threshold, which leads to estimated worse behavior - let linear_interpol_estimation = - LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data)); + let linear_interpol_estimation = LinearInterpolFastFieldSerializer::estimate(&data); assert_le!(linear_interpol_estimation, 0.35); - let bitpacked_estimation = - BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data)); + let bitpacked_estimation = BitpackedFastFieldSerializer::estimate(&data); assert_le!(bitpacked_estimation, 0.32); assert_le!(bitpacked_estimation, linear_interpol_estimation); } diff --git a/fastfield_codecs/src/linearinterpol.rs b/fastfield_codecs/src/linearinterpol.rs index 8905c8008..38c3fe011 100644 --- a/fastfield_codecs/src/linearinterpol.rs +++ b/fastfield_codecs/src/linearinterpol.rs @@ -5,7 +5,7 @@ use common::{BinarySerializable, FixedSize}; use ownedbytes::OwnedBytes; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; -use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats}; +use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess}; /// Depending on the field type, a different /// fast field is required. @@ -139,13 +139,12 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer { fn serialize( write: &mut impl Write, fastfield_accessor: &dyn FastFieldDataAccess, - stats: FastFieldStats, ) -> io::Result<()> { - assert!(stats.min_value <= stats.max_value); + assert!(fastfield_accessor.min_value() <= fastfield_accessor.max_value()); let first_val = fastfield_accessor.get_val(0); - let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1); - let slope = get_slope(first_val, last_val, stats.num_vals); + let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1); + let slope = get_slope(first_val, last_val, fastfield_accessor.num_vals()); // calculate offset to ensure all values are positive let mut offset = 0; let mut rel_positive_max = 0; @@ -179,27 +178,25 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer { offset, first_val, last_val, - num_vals: stats.num_vals, - min_value: stats.min_value, - max_value: stats.max_value, + num_vals: fastfield_accessor.num_vals(), + min_value: fastfield_accessor.min_value(), + max_value: fastfield_accessor.max_value(), }; footer.serialize(write)?; Ok(()) } - fn is_applicable( - _fastfield_accessor: &impl FastFieldDataAccess, - stats: FastFieldStats, - ) -> bool { - if stats.num_vals < 3 { + fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool { + if fastfield_accessor.num_vals() < 3 { return false; // disable compressor for this case } // On serialisation the offset is added to the actual value. // We need to make sure this won't run into overflow calculation issues. // For this we take the maximum theroretical offset and add this to the max value. // If this doesn't overflow the algorithm should be fine - let theorethical_maximum_offset = stats.max_value - stats.min_value; - if stats - .max_value + let theorethical_maximum_offset = + fastfield_accessor.max_value() - fastfield_accessor.min_value(); + if fastfield_accessor + .max_value() .checked_add(theorethical_maximum_offset) .is_none() { @@ -210,13 +207,13 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer { /// estimation for linear interpolation is hard because, you don't know /// where the local maxima for the deviation of the calculated value are and /// the offset to shift all values to >=0 is also unknown. - fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 { + fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 { let first_val = fastfield_accessor.get_val(0); - let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1); - let slope = get_slope(first_val, last_val, stats.num_vals); + let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1); + let slope = get_slope(first_val, last_val, fastfield_accessor.num_vals()); // let's sample at 0%, 5%, 10% .. 95%, 100% - let num_vals = stats.num_vals as f32 / 100.0; + let num_vals = fastfield_accessor.num_vals() as f32 / 100.0; let sample_positions = (0..20) .map(|pos| (num_vals * pos as f32 * 5.0) as usize) .collect::>(); @@ -238,9 +235,10 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer { // let relative_max_value = (max_distance as f32 * 1.5) * 2.0; - let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64 + let num_bits = compute_num_bits(relative_max_value as u64) as u64 + * fastfield_accessor.num_vals() + LinearInterpolFooter::SIZE_IN_BYTES as u64; - let num_bits_uncompressed = 64 * stats.num_vals; + let num_bits_uncompressed = 64 * fastfield_accessor.num_vals(); num_bits as f32 / num_bits_uncompressed as f32 } } diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs index fdb796c80..f5a8cb821 100644 --- a/fastfield_codecs/src/main.rs +++ b/fastfield_codecs/src/main.rs @@ -94,13 +94,13 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec, &'static str)> { pub fn serialize_with_codec( data: &[u64], ) -> (bool, f32, f32, &'static str) { - let is_applicable = S::is_applicable(&data, stats_from_vec(data)); + let is_applicable = S::is_applicable(&data); if !is_applicable { return (false, 0.0, 0.0, S::NAME); } - let estimation = S::estimate(&data, stats_from_vec(data)); + let estimation = S::estimate(&data); let mut out = vec![]; - S::serialize(&mut out, &data, stats_from_vec(data)).unwrap(); + S::serialize(&mut out, &data).unwrap(); let actual_compression = out.len() as f32 / (data.len() * 8) as f32; (true, estimation, actual_compression, S::NAME) diff --git a/fastfield_codecs/src/multilinearinterpol.rs b/fastfield_codecs/src/multilinearinterpol.rs index 1c1f57352..d156482a8 100644 --- a/fastfield_codecs/src/multilinearinterpol.rs +++ b/fastfield_codecs/src/multilinearinterpol.rs @@ -18,7 +18,7 @@ use ownedbytes::OwnedBytes; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; use crate::linearinterpol::{get_calculated_value, get_slope}; -use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats}; +use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess}; const CHUNK_SIZE: u64 = 512; @@ -188,15 +188,14 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer { fn serialize( write: &mut impl Write, fastfield_accessor: &dyn FastFieldDataAccess, - stats: FastFieldStats, ) -> io::Result<()> { - assert!(stats.min_value <= stats.max_value); + assert!(fastfield_accessor.min_value() <= fastfield_accessor.max_value()); let first_val = fastfield_accessor.get_val(0); - let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1); + let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1); let mut first_function = Function { - end_pos: stats.num_vals, + end_pos: fastfield_accessor.num_vals(), value_start_pos: first_val, value_end_pos: last_val, ..Default::default() @@ -271,29 +270,27 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer { bit_packer.close(write)?; let footer = MultiLinearInterpolFooter { - num_vals: stats.num_vals, - min_value: stats.min_value, - max_value: stats.max_value, + num_vals: fastfield_accessor.num_vals(), + min_value: fastfield_accessor.min_value(), + max_value: fastfield_accessor.max_value(), interpolations, }; footer.serialize(write)?; Ok(()) } - fn is_applicable( - _fastfield_accessor: &impl FastFieldDataAccess, - stats: FastFieldStats, - ) -> bool { - if stats.num_vals < 5_000 { + fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool { + if fastfield_accessor.num_vals() < 5_000 { return false; } // On serialization the offset is added to the actual value. // We need to make sure this won't run into overflow calculation issues. // For this we take the maximum theroretical offset and add this to the max value. // If this doesn't overflow the algorithm should be fine - let theorethical_maximum_offset = stats.max_value - stats.min_value; - if stats - .max_value + let theorethical_maximum_offset = + fastfield_accessor.max_value() - fastfield_accessor.min_value(); + if fastfield_accessor + .max_value() .checked_add(theorethical_maximum_offset) .is_none() { @@ -304,15 +301,15 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer { /// estimation for linear interpolation is hard because, you don't know /// where the local maxima are for the deviation of the calculated value and /// the offset is also unknown. - fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 { + fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 { let first_val_in_first_block = fastfield_accessor.get_val(0); - let last_elem_in_first_chunk = CHUNK_SIZE.min(stats.num_vals); + let last_elem_in_first_chunk = CHUNK_SIZE.min(fastfield_accessor.num_vals()); let last_val_in_first_block = fastfield_accessor.get_val(last_elem_in_first_chunk as u64 - 1); let slope = get_slope( first_val_in_first_block, last_val_in_first_block, - stats.num_vals, + fastfield_accessor.num_vals(), ); // let's sample at 0%, 5%, 10% .. 95%, 100%, but for the first block only @@ -339,10 +336,10 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer { // let relative_max_value = (max_distance as f32 * 1.5) * 2.0; - let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64 + let num_bits = compute_num_bits(relative_max_value as u64) as u64 * fastfield_accessor.num_vals() as u64 // function metadata per block - + 29 * (stats.num_vals / CHUNK_SIZE); - let num_bits_uncompressed = 64 * stats.num_vals; + + 29 * (fastfield_accessor.num_vals() / CHUNK_SIZE); + let num_bits_uncompressed = 64 * fastfield_accessor.num_vals(); num_bits as f32 / num_bits_uncompressed as f32 } } diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs index c4883f97e..cd07e03ca 100644 --- a/src/fastfield/serializer/mod.rs +++ b/src/fastfield/serializer/mod.rs @@ -66,14 +66,13 @@ impl From for FastFieldCodecEnableCheck { // use this, when this is merged and stabilized explicit_generic_args_with_impl_trait // https://github.com/rust-lang/rust/pull/86176 fn codec_estimation( - stats: FastFieldStats, fastfield_accessor: &A, estimations: &mut Vec<(f32, &str, u8)>, ) { - if !T::is_applicable(fastfield_accessor, stats.clone()) { + if !T::is_applicable(fastfield_accessor) { return; } - let (ratio, name, id) = (T::estimate(fastfield_accessor, stats), T::NAME, T::ID); + let (ratio, name, id) = (T::estimate(fastfield_accessor), T::NAME, T::ID); estimations.push((ratio, name, id)); } @@ -101,10 +100,9 @@ impl CompositeFastFieldSerializer { pub fn create_auto_detect_u64_fast_field( &mut self, field: Field, - stats: FastFieldStats, fastfield_accessor: impl FastFieldDataAccess, ) -> io::Result<()> { - self.create_auto_detect_u64_fast_field_with_idx(field, stats, fastfield_accessor, 0) + self.create_auto_detect_u64_fast_field_with_idx(field, fastfield_accessor, 0) } /// Serialize data into a new u64 fast field. The best compression codec will be chosen @@ -120,12 +118,12 @@ impl CompositeFastFieldSerializer { pub fn create_auto_detect_u64_fast_field_with_idx( &mut self, field: Field, - stats: FastFieldStats, fastfield_accessor: impl FastFieldDataAccess, idx: usize, ) -> io::Result<()> { + let min_value = fastfield_accessor.min_value(); let field_write = self.composite_write.for_field_with_idx(field, idx); - let gcd = find_gcd(fastfield_accessor.iter().map(|val| val - stats.min_value)) + let gcd = find_gcd(fastfield_accessor.iter().map(|val| val - min_value)) .map(NonZeroU64::get) .unwrap_or(GCD_DEFAULT); @@ -134,7 +132,6 @@ impl CompositeFastFieldSerializer { self.codec_enable_checker.clone(), field, field_write, - stats, fastfield_accessor, ); } @@ -142,42 +139,54 @@ impl CompositeFastFieldSerializer { Self::write_header(field_write, GCD_CODEC_ID)?; struct GCDWrappedFFAccess { fastfield_accessor: T, - min_value: u64, + base_value: u64, + max_value: u64, + num_vals: u64, gcd: u64, } impl FastFieldDataAccess for GCDWrappedFFAccess { fn get_val(&self, position: u64) -> u64 { - (self.fastfield_accessor.get_val(position) - self.min_value) / self.gcd + (self.fastfield_accessor.get_val(position) - self.base_value) / self.gcd } fn iter<'b>(&'b self) -> Box + 'b> { Box::new( self.fastfield_accessor .iter() - .map(|val| (val - self.min_value) / self.gcd), + .map(|val| (val - self.base_value) / self.gcd), ) } + fn min_value(&self) -> u64 { + 0 + } + + fn max_value(&self) -> u64 { + self.max_value + } + + fn num_vals(&self) -> u64 { + self.num_vals + } } + let num_vals = fastfield_accessor.num_vals(); + let base_value = fastfield_accessor.min_value(); + let max_value = (fastfield_accessor.max_value() - fastfield_accessor.min_value()) / gcd; + let fastfield_accessor = GCDWrappedFFAccess { fastfield_accessor, - min_value: stats.min_value, + base_value, + max_value, + num_vals, gcd, }; - let min_value = stats.min_value; - let stats = FastFieldStats { - min_value: 0, - max_value: (stats.max_value - stats.min_value) / gcd, - num_vals: stats.num_vals, - }; Self::create_auto_detect_u64_fast_field_with_idx_gcd( self.codec_enable_checker.clone(), field, field_write, - stats, fastfield_accessor, )?; - write_gcd_header(field_write, min_value, gcd)?; + write_gcd_header(field_write, base_value, gcd)?; Ok(()) } @@ -187,28 +196,24 @@ impl CompositeFastFieldSerializer { codec_enable_checker: FastFieldCodecEnableCheck, field: Field, field_write: &mut CountingWriter, - stats: FastFieldStats, fastfield_accessor: impl FastFieldDataAccess, ) -> io::Result<()> { let mut estimations = vec![]; if codec_enable_checker.is_enabled(FastFieldCodecName::Bitpacked) { codec_estimation::( - stats.clone(), &fastfield_accessor, &mut estimations, ); } if codec_enable_checker.is_enabled(FastFieldCodecName::LinearInterpol) { codec_estimation::( - stats.clone(), &fastfield_accessor, &mut estimations, ); } if codec_enable_checker.is_enabled(FastFieldCodecName::BlockwiseLinearInterpol) { codec_estimation::( - stats.clone(), &fastfield_accessor, &mut estimations, ); @@ -233,20 +238,15 @@ impl CompositeFastFieldSerializer { Self::write_header(field_write, id)?; match name { BitpackedFastFieldSerializer::NAME => { - BitpackedFastFieldSerializer::serialize(field_write, &fastfield_accessor, stats)?; + BitpackedFastFieldSerializer::serialize(field_write, &fastfield_accessor)?; } LinearInterpolFastFieldSerializer::NAME => { - LinearInterpolFastFieldSerializer::serialize( - field_write, - &fastfield_accessor, - stats, - )?; + LinearInterpolFastFieldSerializer::serialize(field_write, &fastfield_accessor)?; } MultiLinearInterpolFastFieldSerializer::NAME => { MultiLinearInterpolFastFieldSerializer::serialize( field_write, &fastfield_accessor, - stats, )?; } _ => { diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index d45722e16..a91bd8f72 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -359,17 +359,19 @@ impl IntFastFieldWriter { (self.val_min, self.val_max) }; - let fastfield_accessor = WriterFastFieldAccessProvider { - doc_id_map, - vals: &self.vals, - }; let stats = FastFieldStats { min_value: min, max_value: max, num_vals: self.val_count as u64, }; - serializer.create_auto_detect_u64_fast_field(self.field, stats, fastfield_accessor)?; + let fastfield_accessor = WriterFastFieldAccessProvider { + doc_id_map, + vals: &self.vals, + stats, + }; + + serializer.create_auto_detect_u64_fast_field(self.field, fastfield_accessor)?; Ok(()) } @@ -379,6 +381,7 @@ impl IntFastFieldWriter { struct WriterFastFieldAccessProvider<'map, 'bitp> { doc_id_map: Option<&'map DocIdMapping>, vals: &'bitp BlockedBitpacker, + stats: FastFieldStats, } impl<'map, 'bitp> FastFieldDataAccess for WriterFastFieldAccessProvider<'map, 'bitp> { /// Return the value associated to the given doc. @@ -411,4 +414,16 @@ impl<'map, 'bitp> FastFieldDataAccess for WriterFastFieldAccessProvider<'map, 'b Box::new(self.vals.iter()) } } + + fn min_value(&self) -> u64 { + self.stats.min_value + } + + fn max_value(&self) -> u64 { + self.stats.max_value + } + + fn num_vals(&self) -> u64 { + self.stats.num_vals + } } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index fa550df0b..bf9a64ede 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -374,6 +374,7 @@ impl IndexMerger { struct SortedDocIdFieldAccessProvider<'a> { doc_id_mapping: &'a SegmentDocIdMapping, fast_field_readers: &'a Vec>, + stats: FastFieldStats, } impl<'a> FastFieldDataAccess for SortedDocIdFieldAccessProvider<'a> { fn get_val(&self, doc: u64) -> u64 { @@ -395,16 +396,24 @@ impl IndexMerger { }), ) } + fn min_value(&self) -> u64 { + self.stats.min_value + } + + fn max_value(&self) -> u64 { + self.stats.max_value + } + + fn num_vals(&self) -> u64 { + self.stats.num_vals + } } let fastfield_accessor = SortedDocIdFieldAccessProvider { doc_id_mapping, fast_field_readers: &fast_field_readers, - }; - fast_field_serializer.create_auto_detect_u64_fast_field( - field, stats, - fastfield_accessor, - )?; + }; + fast_field_serializer.create_auto_detect_u64_fast_field(field, fastfield_accessor)?; Ok(()) } @@ -564,7 +573,37 @@ impl IndexMerger { } offsets.push(offset); - fast_field_serializer.create_auto_detect_u64_fast_field(field, stats, &offsets[..])?; + #[derive(Clone)] + struct FieldIndexAccessProvider<'a> { + offsets: &'a [u64], + stats: FastFieldStats, + } + impl<'a> FastFieldDataAccess for FieldIndexAccessProvider<'a> { + fn get_val(&self, doc: u64) -> u64 { + self.offsets[doc as usize] + } + + fn iter<'b>(&'b self) -> Box + 'b> { + Box::new(self.offsets.iter().cloned()) + } + fn min_value(&self) -> u64 { + self.stats.min_value + } + + fn max_value(&self) -> u64 { + self.stats.max_value + } + + fn num_vals(&self) -> u64 { + self.stats.num_vals + } + } + let fastfield_accessor = FieldIndexAccessProvider { + offsets: &offsets, + stats, + }; + + fast_field_serializer.create_auto_detect_u64_fast_field(field, fastfield_accessor)?; Ok(offsets) } /// Returns the fastfield index (index for the data, not the data). @@ -737,6 +776,7 @@ impl IndexMerger { doc_id_mapping: &'a SegmentDocIdMapping, fast_field_readers: &'a Vec>, offsets: Vec, + stats: FastFieldStats, } impl<'a> FastFieldDataAccess for SortedDocIdMultiValueAccessProvider<'a> { fn get_val(&self, pos: u64) -> u64 { @@ -777,15 +817,26 @@ impl IndexMerger { }), ) } + fn min_value(&self) -> u64 { + self.stats.min_value + } + + fn max_value(&self) -> u64 { + self.stats.max_value + } + + fn num_vals(&self) -> u64 { + self.stats.num_vals + } } let fastfield_accessor = SortedDocIdMultiValueAccessProvider { doc_id_mapping, fast_field_readers: &ff_readers, offsets, + stats, }; fast_field_serializer.create_auto_detect_u64_fast_field_with_idx( field, - stats, fastfield_accessor, 1, )?;