From 4d66a3f0a011a633cf1ad94d50f02f248a81dfaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Massot?= Date: Wed, 8 Dec 2021 12:02:06 +0100 Subject: [PATCH] Put deprecated attributes on deprecated codecs. Clean. --- fastfield_codecs/Cargo.toml | 4 +- fastfield_codecs/benches/bench.rs | 28 +++-------- fastfield_codecs/src/lib.rs | 54 ++++++--------------- fastfield_codecs/src/linearinterpol.rs | 6 +++ fastfield_codecs/src/main.rs | 32 ++++-------- fastfield_codecs/src/multilinearinterpol.rs | 5 ++ fastfield_codecs/src/piecewise_linear.rs | 38 +++++++-------- src/fastfield/mod.rs | 2 +- src/fastfield/reader.rs | 17 ++----- src/fastfield/serializer/mod.rs | 41 +--------------- 10 files changed, 68 insertions(+), 159 deletions(-) diff --git a/fastfield_codecs/Cargo.toml b/fastfield_codecs/Cargo.toml index a8540142d..3f427dae1 100644 --- a/fastfield_codecs/Cargo.toml +++ b/fastfield_codecs/Cargo.toml @@ -6,8 +6,6 @@ license = "MIT" edition = "2018" description = "Fast field codecs used by tantivy" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [dependencies] common = { version = "0.2", path = "../common/", package = "tantivy-common" } tantivy-bitpacker = { version="0.1.1", path = "../bitpacker/" } @@ -19,6 +17,6 @@ more-asserts = "0.2.1" rand = "0.8.3" [features] +unstable = [] # useful for benches and experimental codecs. 
bin = ["prettytable-rs", "rand"] default = ["bin"] - diff --git a/fastfield_codecs/benches/bench.rs b/fastfield_codecs/benches/bench.rs index 768037d00..454b48514 100644 --- a/fastfield_codecs/benches/bench.rs +++ b/fastfield_codecs/benches/bench.rs @@ -4,14 +4,10 @@ extern crate test; #[cfg(test)] mod tests { - use fastfield_codecs::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer}; - use fastfield_codecs::linearinterpol::{ - LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer, + use fastfield_codecs::{ + bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer}, + *, piecewise_linear::{PiecewiseLinearFastFieldSerializer, PiecewiseLinearFastFieldReader}, }; - use fastfield_codecs::multilinearinterpol::{ - MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer, - }; - use fastfield_codecs::*; fn get_data() -> Vec { let mut data: Vec<_> = (100..55000_u64) @@ -70,14 +66,9 @@ mod tests { bench_create::(b, &data); } #[bench] - fn bench_fastfield_linearinterpol_create(b: &mut Bencher) { + fn bench_fastfield_piecewise_linear_create(b: &mut Bencher) { let data: Vec<_> = get_data(); - bench_create::(b, &data); - } - #[bench] - fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) { - let data: Vec<_> = get_data(); - bench_create::(b, &data); + bench_create::(b, &data); } #[bench] fn bench_fastfield_bitpack_get(b: &mut Bencher) { @@ -85,14 +76,9 @@ mod tests { bench_get::(b, &data); } #[bench] - fn bench_fastfield_linearinterpol_get(b: &mut Bencher) { + fn bench_fastfield_piecewise_linear_get(b: &mut Bencher) { let data: Vec<_> = get_data(); - bench_get::(b, &data); - } - #[bench] - fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) { - let data: Vec<_> = get_data(); - bench_get::( + bench_get::( b, &data, ); } diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index caef67fe5..354f742af 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -6,6 +6,7 @@ 
use std::io; use std::io::Write; pub mod bitpacked; +#[cfg(feature = "unstable")] pub mod frame_of_reference; pub mod linearinterpol; pub mod multilinearinterpol; @@ -93,11 +94,6 @@ impl FastFieldDataAccess for Vec { mod tests { use crate::{ bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer}, - frame_of_reference::{FORFastFieldReader, FORFastFieldSerializer}, - linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer}, - multilinearinterpol::{ - MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer, - }, piecewise_linear::{PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer}, }; @@ -168,25 +164,12 @@ mod tests { fn test_codec_bitpacking() { test_codec::(); } - #[test] - fn test_codec_interpolation() { - test_codec::(); - } - #[test] - fn test_codec_multi_interpolation() { - test_codec::(); - } #[test] fn test_codec_piecewise_linear() { test_codec::(); } - #[test] - fn test_codec_for() { - test_codec::(); - } - use super::*; pub fn stats_from_vec(data: &[u64]) -> FastFieldStats { let min_value = data.iter().cloned().min().unwrap_or(0); @@ -202,57 +185,50 @@ mod tests { fn estimation_good_interpolation_case() { let data = (10..=20000_u64).collect::>(); - let linear_interpol_estimation = - LinearInterpolFastFieldSerializer::estimate_compression_ratio( + let piecewise_interpol_estimation = + PiecewiseLinearFastFieldSerializer::estimate_compression_ratio( &data, stats_from_vec(&data), ); - assert_le!(linear_interpol_estimation, 0.01); - - let multi_linear_interpol_estimation = - MultiLinearInterpolFastFieldSerializer::estimate_compression_ratio( - &data, - stats_from_vec(&data), - ); - assert_le!(multi_linear_interpol_estimation, 0.2); - assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation); + assert_le!(piecewise_interpol_estimation, 0.2); let bitpacked_estimation = BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data)); - 
assert_le!(linear_interpol_estimation, bitpacked_estimation); + assert_le!(piecewise_interpol_estimation, bitpacked_estimation); } #[test] fn estimation_test_bad_interpolation_case() { let data = vec![200, 10, 10, 10, 10, 1000, 20]; - let linear_interpol_estimation = - LinearInterpolFastFieldSerializer::estimate_compression_ratio( + let piecewise_interpol_estimation = + PiecewiseLinearFastFieldSerializer::estimate_compression_ratio( &data, stats_from_vec(&data), ); - assert_le!(linear_interpol_estimation, 0.32); + assert_le!(piecewise_interpol_estimation, 0.32); let bitpacked_estimation = BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data)); - assert_le!(bitpacked_estimation, linear_interpol_estimation); + assert_le!(bitpacked_estimation, piecewise_interpol_estimation); } #[test] - fn estimation_test_bad_interpolation_case_monotonically_increasing() { + fn estimation_test_interpolation_case_monotonically_increasing() { let mut data = (200..=20000_u64).collect::>(); data.push(1_000_000); // in this case the linear interpolation can't in fact not be worse than bitpacking, // but the estimator adds some threshold, which leads to estimated worse behavior - let linear_interpol_estimation = - LinearInterpolFastFieldSerializer::estimate_compression_ratio( + let piecewise_interpol_estimation = + PiecewiseLinearFastFieldSerializer::estimate_compression_ratio( &data, stats_from_vec(&data), ); - assert_le!(linear_interpol_estimation, 0.35); + assert_le!(piecewise_interpol_estimation, 0.2); let bitpacked_estimation = BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data)); + println!("{}", bitpacked_estimation); assert_le!(bitpacked_estimation, 0.32); - assert_le!(bitpacked_estimation, linear_interpol_estimation); + assert_le!(piecewise_interpol_estimation, bitpacked_estimation); } } diff --git a/fastfield_codecs/src/linearinterpol.rs b/fastfield_codecs/src/linearinterpol.rs index 268b519a3..e7482f257 100644 --- 
a/fastfield_codecs/src/linearinterpol.rs +++ b/fastfield_codecs/src/linearinterpol.rs @@ -88,6 +88,10 @@ impl FastFieldCodecReader for LinearInterpolFastFieldReader { /// Fastfield serializer, which tries to guess values by linear interpolation /// and stores the difference bitpacked. +/// +#[deprecated( + note = "Linear interpolation works best only on very rare cases and piecewise linear codec already works great on them." +)] pub struct LinearInterpolFastFieldSerializer {} #[inline] @@ -105,6 +109,7 @@ fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 { first_val + (pos as f32 * slope) as u64 } +#[allow(deprecated)] impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer { const NAME: &'static str = "LinearInterpol"; const ID: u8 = 2; @@ -235,6 +240,7 @@ fn distance + Ord>(x: T, y: T) -> T { } } +#[allow(deprecated)] #[cfg(test)] mod tests { use super::*; diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs index d48666ad2..feefd6c55 100644 --- a/fastfield_codecs/src/main.rs +++ b/fastfield_codecs/src/main.rs @@ -1,18 +1,14 @@ #[macro_use] extern crate prettytable; +use common::f64_to_u64; use fastfield_codecs::bitpacked::BitpackedFastFieldReader; +#[cfg(feature = "unstable")] use fastfield_codecs::frame_of_reference::{FORFastFieldReader, FORFastFieldSerializer}; -use fastfield_codecs::linearinterpol::LinearInterpolFastFieldReader; -use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldReader; use fastfield_codecs::piecewise_linear::{ PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer, }; use fastfield_codecs::FastFieldCodecReader; -use fastfield_codecs::{ - linearinterpol::LinearInterpolFastFieldSerializer, - multilinearinterpol::MultiLinearInterpolFastFieldSerializer, FastFieldCodecSerializer, - FastFieldStats, -}; +use fastfield_codecs::{FastFieldCodecSerializer, FastFieldStats}; use prettytable::{Cell, Row, Table}; use rand::prelude::StdRng; use rand::Rng; @@ -35,23 +31,16 @@ 
fn main() { for (data, data_set_name) in get_codec_test_data_sets() { let mut results = vec![]; - let res = serialize_with_codec::< - LinearInterpolFastFieldSerializer, - LinearInterpolFastFieldReader, - >(&data); - results.push(res); - let res = serialize_with_codec::< - MultiLinearInterpolFastFieldSerializer, - MultiLinearInterpolFastFieldReader, - >(&data); - results.push(res); let res = serialize_with_codec::< PiecewiseLinearFastFieldSerializer, PiecewiseLinearFastFieldReader, >(&data); results.push(res); - let res = serialize_with_codec::(&data); - results.push(res); + #[cfg(feature = "unstable")] + { + let res = serialize_with_codec::(&data); + results.push(res); + } let res = serialize_with_codec::< fastfield_codecs::bitpacked::BitpackedFastFieldSerializer, BitpackedFastFieldReader, @@ -168,9 +157,7 @@ pub fn load_float_dataset(file_path: &str) -> Vec { for line in lines { let line_string = line.unwrap(); let value = line_string.parse::().unwrap(); - let bytes = value.to_le_bytes(); - let u64_value = u64::from_le_bytes(bytes); - data.push(u64_value); + data.push(f64_to_u64(value)); } data } @@ -202,7 +189,6 @@ pub fn serialize_with_codec u64 { } /// Same as LinearInterpolFastFieldSerializer, but working on chunks of CHUNK_SIZE elements. +#[deprecated( + note = "MultiLinearInterpol is replaced by PiecewiseLinear codec which fixes the slope and is a little bit more optimized." 
+)] pub struct MultiLinearInterpolFastFieldSerializer {} +#[allow(deprecated)] impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer { const NAME: &'static str = "MultiLinearInterpol"; const ID: u8 = 3; @@ -372,6 +376,7 @@ fn distance + Ord>(x: T, y: T) -> T { } #[cfg(test)] +#[allow(deprecated)] mod tests { use super::*; use crate::tests::get_codec_test_data_sets; diff --git a/fastfield_codecs/src/piecewise_linear.rs b/fastfield_codecs/src/piecewise_linear.rs index 5b8c85980..b6faf9a79 100644 --- a/fastfield_codecs/src/piecewise_linear.rs +++ b/fastfield_codecs/src/piecewise_linear.rs @@ -1,7 +1,7 @@ /*! -PiecewiseLinear codec uses piecewise linear functions for every block of 512 values to guess values and stores the -difference between the actual value and the one given by the linear interpolation. +PiecewiseLinear codec uses piecewise linear functions for every block of 512 values to predict +fast field values. The difference with real fast field values is then stored. For every block, the linear function can be expressed as `computed_value = slope * block_position + first_value + positive_offset` where: @@ -36,13 +36,10 @@ pub struct PiecewiseLinearFastFieldReader { block_readers: Vec, } -/// Block metadata needed to define the linear function `y = a.x + b` -/// and to bitpack the difference between the real value and the -/// the linear function computed value where: -/// - `a` is the `slope` -/// - `b` is the sum of the `first_value` in the block + an offset -/// `positive_offset` which ensures that difference between the real -/// value and the linear function computed value is always positive. +/// Block that stores metadata to predict value with a linear +/// function `predicted_value = slope * position + first_value + positive_offset` +/// where `positive_offset` is computed such that the difference between +/// the real value and the predicted value is always positive. 
#[derive(Clone, Debug, Default)] struct BlockMetadata { first_value: u64, @@ -72,9 +69,9 @@ impl BlockReader { let diff = self .bit_unpacker .get(block_pos, &data[self.start_offset as usize..]); - let computed_value = - get_computed_value(self.metadata.first_value, block_pos, self.metadata.slope); - (computed_value + diff) - self.metadata.positive_offset + let predicted_value = + predict_value(self.metadata.first_value, block_pos, self.metadata.slope); + (predicted_value + diff) - self.metadata.positive_offset } } @@ -88,13 +85,13 @@ impl BinarySerializable for BlockMetadata { } fn deserialize(reader: &mut R) -> io::Result { - let constant = u64::deserialize(reader)?; - let constant_positive_offset = u64::deserialize(reader)?; + let first_value = u64::deserialize(reader)?; + let positive_offset = u64::deserialize(reader)?; let slope = f32::deserialize(reader)?; let num_bits = u8::deserialize(reader)?; Ok(Self { - first_value: constant, - positive_offset: constant_positive_offset, + first_value, + positive_offset, slope, num_bits, }) @@ -172,7 +169,7 @@ impl FastFieldCodecReader for PiecewiseLinearFastFieldReader { } #[inline] -fn get_computed_value(first_val: u64, pos: u64, slope: f32) -> u64 { +fn predict_value(first_val: u64, pos: u64, slope: f32) -> u64 { (first_val as i64 + (pos as f32 * slope) as i64) as u64 } @@ -205,7 +202,7 @@ impl FastFieldCodecSerializer for PiecewiseLinearFastFieldSerializer { let mut positive_offset = 0; let mut max_delta = 0; for (pos, &current_value) in block_values[1..].iter().enumerate() { - let computed_value = get_computed_value(first_value, pos as u64 + 1, slope); + let computed_value = predict_value(first_value, pos as u64 + 1, slope); if computed_value > current_value { positive_offset = positive_offset.max(computed_value - current_value); } else { @@ -214,7 +211,7 @@ impl FastFieldCodecSerializer for PiecewiseLinearFastFieldSerializer { } let num_bits = compute_num_bits(max_delta + positive_offset); for (pos, current_value) in 
block_values.iter().enumerate() { - let computed_value = get_computed_value(first_value, pos as u64, slope); + let computed_value = predict_value(first_value, pos as u64, slope); let diff = (current_value + positive_offset) - computed_value; bit_packer.write(diff, num_bits, write)?; } @@ -282,8 +279,7 @@ impl FastFieldCodecSerializer for PiecewiseLinearFastFieldSerializer { let max_distance = sample_positions .iter() .map(|&pos| { - let calculated_value = - get_computed_value(first_val_in_first_block, pos as u64, slope); + let calculated_value = predict_value(first_val_in_first_block, pos as u64, slope); let actual_value = fastfield_accessor.get_val(pos as u64); distance(calculated_value, actual_value) }) diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 4c1ed8875..615f6aa6d 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -392,7 +392,7 @@ mod tests { serializer.close().unwrap(); } let file = directory.open_read(path).unwrap(); - assert_eq!(file.len(), 9597_usize); // FOR codec size + assert_eq!(file.len(), 12471_usize); // Piecewise linear codec size { let fast_fields_composite = CompositeFile::open(&file)?; let data = fast_fields_composite.open_read(i64_field).unwrap(); diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 393b8c30c..7baf3ed5f 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -1,5 +1,3 @@ -use fastfield_codecs::frame_of_reference::FORFastFieldReader; -use fastfield_codecs::frame_of_reference::FORFastFieldSerializer; use fastfield_codecs::piecewise_linear::PiecewiseLinearFastFieldReader; use fastfield_codecs::piecewise_linear::PiecewiseLinearFastFieldSerializer; use std::collections::HashMap; @@ -10,11 +8,13 @@ use common::BinarySerializable; use fastfield_codecs::bitpacked::{ BitpackedFastFieldReader as BitpackedReader, BitpackedFastFieldSerializer, }; +#[allow(deprecated)] use fastfield_codecs::linearinterpol::{ LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer, }; 
+#[allow(deprecated)] use fastfield_codecs::multilinearinterpol::{ - MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer, + MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader, }; use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecSerializer}; @@ -77,8 +77,6 @@ pub enum DynamicFastFieldReader { MultiLinearInterpol(FastFieldReaderCodecWrapper), /// Piecewise linear interpolated values + bitpacked PiecewiseLinear(FastFieldReaderCodecWrapper), - /// Frame of reference values + bitpacked - FOR(FastFieldReaderCodecWrapper), } impl DynamicFastFieldReader { @@ -94,12 +92,14 @@ impl DynamicFastFieldReader { BitpackedReader, >::open_from_bytes(bytes)?) } + #[allow(deprecated)] LinearInterpolFastFieldSerializer::ID => { DynamicFastFieldReader::LinearInterpol(FastFieldReaderCodecWrapper::< Item, LinearInterpolFastFieldReader, >::open_from_bytes(bytes)?) } + #[allow(deprecated)] MultiLinearInterpolFastFieldSerializer::ID => { DynamicFastFieldReader::MultiLinearInterpol(FastFieldReaderCodecWrapper::< Item, @@ -114,9 +114,6 @@ impl DynamicFastFieldReader { PiecewiseLinearFastFieldReader, >::open_from_bytes(bytes)?) } - FORFastFieldSerializer::ID => DynamicFastFieldReader::FOR( - FastFieldReaderCodecWrapper::::open_from_bytes(bytes)?, - ), _ => { panic!( "unknown fastfield id {:?}. 
Data corrupted or using old tantivy version.", @@ -136,7 +133,6 @@ impl FastFieldReader for DynamicFastFieldReader { Self::LinearInterpol(reader) => reader.get(doc), Self::MultiLinearInterpol(reader) => reader.get(doc), Self::PiecewiseLinear(reader) => reader.get(doc), - Self::FOR(reader) => reader.get(doc), } } #[inline] @@ -146,7 +142,6 @@ impl FastFieldReader for DynamicFastFieldReader { Self::LinearInterpol(reader) => reader.get_range(start, output), Self::MultiLinearInterpol(reader) => reader.get_range(start, output), Self::PiecewiseLinear(reader) => reader.get_range(start, output), - Self::FOR(reader) => reader.get_range(start, output), } } fn min_value(&self) -> Item { @@ -155,7 +150,6 @@ impl FastFieldReader for DynamicFastFieldReader { Self::LinearInterpol(reader) => reader.min_value(), Self::MultiLinearInterpol(reader) => reader.min_value(), Self::PiecewiseLinear(reader) => reader.min_value(), - Self::FOR(reader) => reader.min_value(), } } fn max_value(&self) -> Item { @@ -164,7 +158,6 @@ impl FastFieldReader for DynamicFastFieldReader { Self::LinearInterpol(reader) => reader.max_value(), Self::MultiLinearInterpol(reader) => reader.max_value(), Self::PiecewiseLinear(reader) => reader.max_value(), - Self::FOR(reader) => reader.max_value(), } } } diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs index 605a907bb..62aa3a240 100644 --- a/src/fastfield/serializer/mod.rs +++ b/src/fastfield/serializer/mod.rs @@ -5,9 +5,6 @@ use common::BinarySerializable; use common::CountingWriter; pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer; pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializerLegacy; -use fastfield_codecs::frame_of_reference::FORFastFieldSerializer; -use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer; -use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer; use fastfield_codecs::piecewise_linear::PiecewiseLinearFastFieldSerializer; pub use 
fastfield_codecs::FastFieldCodecSerializer; pub use fastfield_codecs::FastFieldDataAccess; @@ -114,11 +111,6 @@ impl CompositeFastFieldSerializer { stats.clone(), &fastfield_accessor, )); - estimations.push(codec_estimation::( - stats.clone(), - &fastfield_accessor, - )); - println!("{:?}", estimations); let best_codec_result = estimations .iter() .sorted_by(|result_a, result_b| { @@ -144,24 +136,6 @@ impl CompositeFastFieldSerializer { data_iter_2, )?; } - LinearInterpolFastFieldSerializer::NAME => { - LinearInterpolFastFieldSerializer::serialize( - field_write, - &fastfield_accessor, - stats, - data_iter_1, - data_iter_2, - )?; - } - MultiLinearInterpolFastFieldSerializer::NAME => { - MultiLinearInterpolFastFieldSerializer::serialize( - field_write, - &fastfield_accessor, - stats, - data_iter_1, - data_iter_2, - )?; - } PiecewiseLinearFastFieldSerializer::NAME => { PiecewiseLinearFastFieldSerializer::serialize( field_write, @@ -171,15 +145,6 @@ impl CompositeFastFieldSerializer { data_iter_2, )?; } - FORFastFieldSerializer::NAME => { - FORFastFieldSerializer::serialize( - field_write, - &fastfield_accessor, - stats, - data_iter_1, - data_iter_2, - )?; - } _ => { panic!("unknown fastfield serializer {}", best_codec_result.name) } @@ -285,10 +250,8 @@ mod tests { // get the codecs id let mut bytes = directory.open_read(path)?.read_bytes()?; let codec_id = u8::deserialize(&mut bytes)?; - // Codec id = 1 is bitpacking - assert_eq!(codec_id, 5); - //let reader = FastFieldReaderCodecWrapper::::open(file_slice)?; - //assert_eq!(reader.get_u64(0), 0); + // Codec id = 4 is piecewise linear. + assert_eq!(codec_id, 4); Ok(()) } }