diff --git a/fastfield_codecs/src/bitpacked.rs b/fastfield_codecs/src/bitpacked.rs index d57f78b79..b77591279 100644 --- a/fastfield_codecs/src/bitpacked.rs +++ b/fastfield_codecs/src/bitpacked.rs @@ -22,6 +22,7 @@ impl Column for BitpackedReader { } #[inline] fn min_value(&self) -> u64 { + // The BitpackedReader assumes a normalized vector. 0 } #[inline] @@ -58,19 +59,24 @@ impl FastFieldCodec for BitpackedCodec { /// Serializes data with the BitpackedFastFieldSerializer. /// + /// The bitpacker assumes that the column has been normalized. + /// i.e. It has already been shifted by its minimum value, so that its + /// current minimum value is 0. + /// /// Ideally, we made a shift upstream on the column so that `col.min_value() == 0`. - fn serialize(col: &dyn Column, write: &mut impl Write) -> io::Result<()> { - let num_bits = compute_num_bits(col.max_value()); + fn serialize(column: &dyn Column, write: &mut impl Write) -> io::Result<()> { + assert_eq!(column.min_value(), 0u64); + let num_bits = compute_num_bits(column.max_value()); let mut bit_packer = BitPacker::new(); - for val in col.iter() { + for val in column.iter() { bit_packer.write(val, num_bits, write)?; } bit_packer.close(write)?; Ok(()) } - fn estimate(col: &impl Column) -> Option { - let num_bits = compute_num_bits(col.max_value()); + fn estimate(column: &impl Column) -> Option { + let num_bits = compute_num_bits(column.max_value()); let num_bits_uncompressed = 64; Some(num_bits as f32 / num_bits_uncompressed as f32) } diff --git a/fastfield_codecs/src/blockwise_linear.rs b/fastfield_codecs/src/blockwise_linear.rs index 02b4124a2..f6053d097 100644 --- a/fastfield_codecs/src/blockwise_linear.rs +++ b/fastfield_codecs/src/blockwise_linear.rs @@ -71,14 +71,11 @@ impl FastFieldCodec for BlockwiseLinearCodec { } // Estimate first_chunk and extrapolate - fn estimate(fastfield_accessor: &impl crate::Column) -> Option { - if fastfield_accessor.num_vals() < 10 * CHUNK_SIZE as u64 { + fn estimate(column: &impl 
crate::Column) -> Option { + if column.num_vals() < 10 * CHUNK_SIZE as u64 { return None; } - let mut first_chunk: Vec = fastfield_accessor - .iter() - .take(CHUNK_SIZE as usize) - .collect(); + let mut first_chunk: Vec = column.iter().take(CHUNK_SIZE as usize).collect(); let line = Line::train(&VecColumn::from(&first_chunk)); for (i, buffer_val) in first_chunk.iter_mut().enumerate() { let interpolated_val = line.eval(i as u64); @@ -96,24 +93,23 @@ impl FastFieldCodec for BlockwiseLinearCodec { Block::default().serialize(&mut out).unwrap(); out.len() }; - let num_bits = estimated_bit_width as u64 * fastfield_accessor.num_vals() as u64 + let num_bits = estimated_bit_width as u64 * column.num_vals() as u64 // function metadata per block - + metadata_per_block as u64 * (fastfield_accessor.num_vals() / CHUNK_SIZE as u64); - let num_bits_uncompressed = 64 * fastfield_accessor.num_vals(); + + metadata_per_block as u64 * (column.num_vals() / CHUNK_SIZE as u64); + let num_bits_uncompressed = 64 * column.num_vals(); Some(num_bits as f32 / num_bits_uncompressed as f32) } - fn serialize( - fastfield_accessor: &dyn crate::Column, - wrt: &mut impl io::Write, - ) -> io::Result<()> { + fn serialize(column: &dyn crate::Column, wrt: &mut impl io::Write) -> io::Result<()> { + // The BlockwiseLinearReader assumes a normalized vector. + assert_eq!(column.min_value(), 0); let mut buffer = Vec::with_capacity(CHUNK_SIZE); - let num_vals = fastfield_accessor.num_vals(); + let num_vals = column.num_vals(); let num_blocks = compute_num_blocks(num_vals); let mut blocks = Vec::with_capacity(num_blocks); - let mut vals = fastfield_accessor.iter(); + let mut vals = column.iter(); let mut bit_packer = BitPacker::new(); @@ -176,6 +172,7 @@ impl Column for BlockwiseLinearReader { } fn min_value(&self) -> u64 { + // The BlockwiseLinearReader assumes a normalized vector. 
0u64 } diff --git a/fastfield_codecs/src/column.rs b/fastfield_codecs/src/column.rs index 03bb91c36..dcaea689d 100644 --- a/fastfield_codecs/src/column.rs +++ b/fastfield_codecs/src/column.rs @@ -34,16 +34,18 @@ pub trait Column { /// Returns the minimum value for this fast field. /// - /// The min value does not take in account of possible - /// deleted document, and should be considered as a lower bound - /// of the actual minimum value. + /// This min_value may not be exact. + /// For instance, the min value does not take into account possible + /// deleted documents. All values are however guaranteed to be higher than or equal to + /// `.min_value()`. fn min_value(&self) -> T; /// Returns the maximum value for this fast field. /// - /// The max value does not take in account of possible - /// deleted document, and should be considered as an upper bound - /// of the actual maximum value + /// This max_value may not be exact. + /// For instance, the max value does not take into account possible + /// deleted documents. All values are however guaranteed to be lower than or equal to + /// `.max_value()`. 
fn max_value(&self) -> T; fn num_vals(&self) -> u64; diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index 215acfd64..e17ca3b6b 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -13,17 +13,17 @@ use std::io::Write; use common::BinarySerializable; use ownedbytes::OwnedBytes; -pub mod bitpacked; -pub mod blockwise_linear; +mod bitpacked; +mod blockwise_linear; pub(crate) mod line; -pub mod linear; +mod linear; mod column; mod gcd; mod serialize; pub use self::column::{monotonic_map_column, Column, VecColumn}; -pub use self::serialize::{open, serialize, serialize_and_load, NormalizedHeader}; +pub use self::serialize::{estimate, open, serialize, serialize_and_load, NormalizedHeader}; #[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)] #[repr(u8)] @@ -124,7 +124,7 @@ impl MonotonicallyMappableToU64 for f64 { /// The FastFieldSerializerEstimate trait is required on all variants /// of fast field compressions, to decide which one to choose. -pub trait FastFieldCodec: 'static { +trait FastFieldCodec: 'static { /// A codex needs to provide a unique name and id, which is /// used for debugging and de/serialization. const CODEC_TYPE: FastFieldCodecType; diff --git a/fastfield_codecs/src/linear.rs b/fastfield_codecs/src/linear.rs index 5cade79b3..e9aed1e86 100644 --- a/fastfield_codecs/src/linear.rs +++ b/fastfield_codecs/src/linear.rs @@ -27,6 +27,7 @@ impl Column for LinearReader { #[inline] fn min_value(&self) -> u64 { + // The LinearReader assumes a normalized vector. 0u64 } @@ -84,11 +85,11 @@ impl FastFieldCodec for LinearCodec { } /// Creates a new fast field serializer. 
- fn serialize(fastfield_accessor: &dyn Column, write: &mut impl Write) -> io::Result<()> { - assert!(fastfield_accessor.min_value() <= fastfield_accessor.max_value()); - let line = Line::train(fastfield_accessor); + fn serialize(column: &dyn Column, write: &mut impl Write) -> io::Result<()> { + assert_eq!(column.min_value(), 0); + let line = Line::train(column); - let max_offset_from_line = fastfield_accessor + let max_offset_from_line = column .iter() .enumerate() .map(|(pos, actual_value)| { @@ -106,7 +107,7 @@ impl FastFieldCodec for LinearCodec { linear_params.serialize(write)?; let mut bit_packer = BitPacker::new(); - for (pos, actual_value) in fastfield_accessor.iter().enumerate() { + for (pos, actual_value) in column.iter().enumerate() { let calculated_value = line.eval(pos as u64); let offset = actual_value.wrapping_sub(calculated_value); bit_packer.write(offset, num_bits, write)?; @@ -120,23 +121,23 @@ impl FastFieldCodec for LinearCodec { /// where the local maxima for the deviation of the calculated value are and /// the offset to shift all values to >=0 is also unknown. #[allow(clippy::question_mark)] - fn estimate(fastfield_accessor: &impl Column) -> Option { - if fastfield_accessor.num_vals() < 3 { + fn estimate(column: &impl Column) -> Option { + if column.num_vals() < 3 { return None; // disable compressor for this case } // let's sample at 0%, 5%, 10% .. 
95%, 100% - let num_vals = fastfield_accessor.num_vals() as f32 / 100.0; + let num_vals = column.num_vals() as f32 / 100.0; let sample_positions = (0..20) .map(|pos| (num_vals * pos as f32 * 5.0) as u64) .collect::>(); - let line = Line::estimate(fastfield_accessor, &sample_positions); + let line = Line::estimate(column, &sample_positions); let estimated_bit_width = sample_positions .into_iter() .map(|pos| { - let actual_value = fastfield_accessor.get_val(pos); + let actual_value = column.get_val(pos); let interpolated_val = line.eval(pos as u64); actual_value.wrapping_sub(interpolated_val) }) @@ -145,8 +146,8 @@ impl FastFieldCodec for LinearCodec { .max() .unwrap_or(0); - let num_bits = (estimated_bit_width as u64 * fastfield_accessor.num_vals() as u64) + 64; - let num_bits_uncompressed = 64 * fastfield_accessor.num_vals(); + let num_bits = (estimated_bit_width as u64 * column.num_vals() as u64) + 64; + let num_bits_uncompressed = 64 * column.num_vals(); Some(num_bits as f32 / num_bits_uncompressed as f32) } } diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs index 2b9ff969b..91f18649f 100644 --- a/fastfield_codecs/src/main.rs +++ b/fastfield_codecs/src/main.rs @@ -1,35 +1,8 @@ #[macro_use] extern crate prettytable; -use fastfield_codecs::bitpacked::BitpackedCodec; -use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec; -use fastfield_codecs::linear::LinearCodec; -use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType, FastFieldStats}; +use fastfield_codecs::{Column, FastFieldCodecType, FastFieldStats, VecColumn}; use prettytable::{Cell, Row, Table}; -struct Data<'a>(&'a [u64]); - -impl<'a> Column for Data<'a> { - fn get_val(&self, position: u64) -> u64 { - self.0[position as usize] - } - - fn iter<'b>(&'b self) -> Box + 'b> { - Box::new(self.0.iter().cloned()) - } - - fn min_value(&self) -> u64 { - *self.0.iter().min().unwrap_or(&0) - } - - fn max_value(&self) -> u64 { - *self.0.iter().max().unwrap_or(&0) - } - - fn 
num_vals(&self) -> u64 { - self.0.len() as u64 - } -} - fn main() { let mut table = Table::new(); @@ -38,10 +11,9 @@ fn main() { for (data, data_set_name) in get_codec_test_data_sets() { let results: Vec<(f32, f32, FastFieldCodecType)> = [ - serialize_with_codec::(&data), - serialize_with_codec::(&data), - serialize_with_codec::(&data), - serialize_with_codec::(&data), + serialize_with_codec(&data, FastFieldCodecType::Bitpacked), + serialize_with_codec(&data, FastFieldCodecType::Linear), + serialize_with_codec(&data, FastFieldCodecType::BlockwiseLinear), ] .into_iter() .flatten() @@ -107,15 +79,16 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec, &'static str)> { data_and_names } -pub fn serialize_with_codec( +pub fn serialize_with_codec( data: &[u64], + codec_type: FastFieldCodecType, ) -> Option<(f32, f32, FastFieldCodecType)> { - let data = Data(data); - let estimation = C::estimate(&data)?; + let col = VecColumn::from(data); + let estimation = fastfield_codecs::estimate(&col, codec_type)?; let mut out = Vec::new(); - C::serialize(&data, &mut out).unwrap(); - let actual_compression = out.len() as f32 / (data.num_vals() * 8) as f32; - Some((estimation, actual_compression, C::CODEC_TYPE)) + fastfield_codecs::serialize(&col, &mut out, &[codec_type]).ok()?; + let actual_compression = out.len() as f32 / (col.num_vals() * 8) as f32; + Some((estimation, actual_compression, codec_type)) } pub fn stats_from_vec(data: &[u64]) -> FastFieldStats { diff --git a/fastfield_codecs/src/serialize.rs b/fastfield_codecs/src/serialize.rs index ff117f987..28ed40d76 100644 --- a/fastfield_codecs/src/serialize.rs +++ b/fastfield_codecs/src/serialize.rs @@ -34,17 +34,11 @@ use crate::{ VecColumn, ALL_CODEC_TYPES, }; -// use this, when this is merged and stabilized explicit_generic_args_with_impl_trait -// https://github.com/rust-lang/rust/pull/86176 -fn codec_estimation( - fastfield_accessor: &D, - estimations: &mut Vec<(f32, FastFieldCodecType)>, -) { - if let Some(ratio) = 
C::estimate(fastfield_accessor) { - estimations.push((ratio, C::CODEC_TYPE)); - } -} - +/// The normalized header gives some parameters after applying the following +/// normalization of the vector: +/// val -> (val - min_value) / gcd +/// +/// By design, after normalization, `min_value = 0` and `gcd = 1`. #[derive(Debug, Copy, Clone)] pub struct NormalizedHeader { pub num_vals: u64, @@ -160,6 +154,23 @@ fn open_specific_codec( } } +pub fn estimate( + typed_column: impl Column, + codec_type: FastFieldCodecType, +) -> Option { + let column = monotonic_map_column(typed_column, T::to_u64); + let min_value = column.min_value(); + let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value)) + .filter(|gcd| gcd.get() > 1u64); + let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64)); + let normalized_column = monotonic_map_column(&column, |val| divider.divide(val - min_value)); + match codec_type { + FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&normalized_column), + FastFieldCodecType::Linear => LinearCodec::estimate(&normalized_column), + FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&normalized_column), + } +} + pub fn serialize( typed_column: impl Column, output: &mut impl io::Write, @@ -188,16 +199,13 @@ fn detect_codec( ) -> Option { let mut estimations = Vec::new(); for &codec in codecs { - match codec { - FastFieldCodecType::Bitpacked => { - codec_estimation::(&column, &mut estimations); - } - FastFieldCodecType::Linear => { - codec_estimation::(&column, &mut estimations); - } - FastFieldCodecType::BlockwiseLinear => { - codec_estimation::(&column, &mut estimations); - } + let estimation_opt = match codec { + FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&column), + FastFieldCodecType::Linear => LinearCodec::estimate(&column), + FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&column), + }; + if let Some(estimation) = estimation_opt { + 
estimations.push((estimation, codec)); } } if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan()) { diff --git a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs index da0284e3a..a8bd244ab 100644 --- a/src/fastfield/multivalued/writer.rs +++ b/src/fastfield/multivalued/writer.rs @@ -332,13 +332,11 @@ mod tests { #[test] fn test_multivalue_get_vals() { - let doc_id_mapping = DocIdMapping::from_new_id_to_old_id(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + let doc_id_mapping = + DocIdMapping::from_new_id_to_old_id(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); assert_eq!(doc_id_mapping.num_old_doc_ids(), 10); - let col = VecColumn::from(&[0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55,][..]); - let multivalue_start_index = MultivalueStartIndex::new( - &col, - &doc_id_mapping, - ); + let col = VecColumn::from(&[0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55][..]); + let multivalue_start_index = MultivalueStartIndex::new(&col, &doc_id_mapping); assert_eq!( multivalue_start_index.iter().collect::>(), vec![0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55] @@ -351,5 +349,4 @@ mod tests { assert_eq!(multivalue_start_index.get_val(0), 0); assert_eq!(multivalue_start_index.get_val(10), 55); } - } diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs index 1447c0455..5d88adf9e 100644 --- a/src/fastfield/serializer/mod.rs +++ b/src/fastfield/serializer/mod.rs @@ -1,8 +1,7 @@ use std::io::{self, Write}; use common::{BinarySerializable, CountingWriter}; -pub use fastfield_codecs::bitpacked::BitpackedCodec; -pub use fastfield_codecs::{Column, FastFieldCodec, FastFieldStats}; +pub use fastfield_codecs::{Column, FastFieldStats}; use fastfield_codecs::{FastFieldCodecType, MonotonicallyMappableToU64, ALL_CODEC_TYPES}; use crate::directory::{CompositeWrite, WritePtr};