diff --git a/fastfield_codecs/.gitignore b/fastfield_codecs/.gitignore new file mode 100644 index 000000000..3133a1308 --- /dev/null +++ b/fastfield_codecs/.gitignore @@ -0,0 +1 @@ +datasets/ diff --git a/fastfield_codecs/Makefile b/fastfield_codecs/Makefile new file mode 100644 index 000000000..1caeaebcc --- /dev/null +++ b/fastfield_codecs/Makefile @@ -0,0 +1,6 @@ +DATASETS ?= hdfs_logs_timestamps http_logs_timestamps amazon_reviews_product_ids +download: + @echo "--- Downloading datasets ---" + mkdir -p datasets + @for dataset in $(DATASETS); do curl -o - https://quickwit-datasets-public.s3.amazonaws.com/benchmarks/fastfields/$$dataset.txt.gz | gunzip > datasets/$$dataset.txt; done + diff --git a/fastfield_codecs/README.md b/fastfield_codecs/README.md index 0358fa2cb..84207206d 100644 --- a/fastfield_codecs/README.md +++ b/fastfield_codecs/README.md @@ -13,6 +13,9 @@ A codec needs to implement 2 traits: - A reader implementing `FastFieldCodecReader` to read the codec. - A serializer implementing `FastFieldCodecSerializer` for compression estimation and codec name + id. +### Download real world datasets for codecs comparison +Before comparing codecs, you need to execute `make download` to download real world datasets hosted on AWS S3. + ### Tests Once the traits are implemented test and benchmark integration is pretty easy (see `test_with_codec_data_sets` and `bench.rs`). 
@@ -23,46 +26,113 @@ cargo run --features bin ``` ### TODO -- Add real world data sets in comparison - Add codec to cover sparse data sets ### Codec Comparison ``` -+----------------------------------+-------------------+------------------------+ -| | Compression Ratio | Compression Estimation | -+----------------------------------+-------------------+------------------------+ -| Autoincrement | | | -+----------------------------------+-------------------+------------------------+ -| LinearInterpol | 0.000039572664 | 0.000004396963 | -+----------------------------------+-------------------+------------------------+ -| MultiLinearInterpol | 0.1477348 | 0.17275847 | -+----------------------------------+-------------------+------------------------+ -| Bitpacked | 0.28126493 | 0.28125 | -+----------------------------------+-------------------+------------------------+ -| Monotonically increasing concave | | | -+----------------------------------+-------------------+------------------------+ -| LinearInterpol | 0.25003937 | 0.26562938 | -+----------------------------------+-------------------+------------------------+ -| MultiLinearInterpol | 0.190665 | 0.1883836 | -+----------------------------------+-------------------+------------------------+ -| Bitpacked | 0.31251436 | 0.3125 | -+----------------------------------+-------------------+------------------------+ -| Monotonically increasing convex | | | -+----------------------------------+-------------------+------------------------+ -| LinearInterpol | 0.25003937 | 0.28125438 | -+----------------------------------+-------------------+------------------------+ -| MultiLinearInterpol | 0.18676 | 0.2040086 | -+----------------------------------+-------------------+------------------------+ -| Bitpacked | 0.31251436 | 0.3125 | -+----------------------------------+-------------------+------------------------+ -| Almost monotonically increasing | | | 
-+----------------------------------+-------------------+------------------------+ -| LinearInterpol | 0.14066513 | 0.1562544 | -+----------------------------------+-------------------+------------------------+ -| MultiLinearInterpol | 0.16335973 | 0.17275847 | -+----------------------------------+-------------------+------------------------+ -| Bitpacked | 0.28126493 | 0.28125 | -+----------------------------------+-------------------+------------------------+ ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| | Compression ratio | Compression ratio estimation | Compression time (micro) | Reading time (micro) | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| Autoincrement | | | | | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| LinearInterpol | 0.000039572664 | 0.000004396963 | 731 | 0 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| MultiLinearInterpol | 0.022707172 | 0.17275847 | 1050 | 282 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| MultiLinearInterpolV2 | 0.0051544965 | 0.17251475 | 963 | 216 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| Bitpacked | 0.28126493 | 0.28125 | 473 | 112 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| Monotonically increasing concave | | | | | 
++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| LinearInterpol | 0.25003937 | 0.26562938 | 758 | 113 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| MultiLinearInterpol | 0.05403 | 0.1883836 | 902 | 223 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| MultiLinearInterpolV2 | 0.005955 | 0.18813984 | 896 | 217 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| Bitpacked | 0.31251436 | 0.3125 | 476 | 113 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| Monotonically increasing convex | | | | | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| LinearInterpol | 0.25003937 | 0.28125438 | 757 | 113 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| MultiLinearInterpol | 0.053845 | 0.2040086 | 897 | 223 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| MultiLinearInterpolV2 | 0.00613 | 0.20376484 | 912 | 217 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| Bitpacked | 0.31251436 | 0.3125 | 510 | 113 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| Almost monotonically increasing | | | | | 
++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| LinearInterpol | 0.14066513 | 0.1406294 | 752 | 112 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| MultiLinearInterpol | 0.1472926 | 0.17275847 | 917 | 222 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| MultiLinearInterpolV2 | 0.14537804 | 0.17251475 | 931 | 216 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| Bitpacked | 0.28126493 | 0.28125 | 468 | 112 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| Random | | | | | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| LinearInterpol | 0.14066513 | 0.1406294 | 769 | 112 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| MultiLinearInterpol | 0.1473328 | 0.14150847 | 960 | 222 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| MultiLinearInterpolV2 | 0.14549863 | 0.14126475 | 907 | 216 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| Bitpacked | 0.12501445 | 0.125 | 412 | 114 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| HDFS logs timestamps | | | | | 
++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| LinearInterpol | 0.39063287 | 0.40625086 | 4808 | 606 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| MultiLinearInterpol | 0.39983186 | 0.40713495 | 5704 | 1197 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| MultiLinearInterpolV2 | 0.39826187 | 0.4068908 | 5914 | 1167 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| Bitpacked | 0.39062786 | 0.390625 | 2768 | 618 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| HDFS logs timestamps SORTED | | | | | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| LinearInterpol | 0.39063287 | 0.40625086 | 4118 | 573 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| MultiLinearInterpol | 0.034706876 | 0.09463495 | 5046 | 1118 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| MultiLinearInterpolV2 | 0.032736875 | 0.094390824 | 5098 | 1164 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| Bitpacked | 0.39062786 | 0.390625 | 2802 | 606 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| HTTP logs timestamps SORTED | | | | | 
++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| LinearInterpol | 0.21875787 | 0.23437588 | 3953 | 567 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| MultiLinearInterpol | 0.050024875 | 0.20400995 | 4872 | 1179 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| MultiLinearInterpolV2 | 0.047942877 | 0.20376582 | 5389 | 1163 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| Bitpacked | 0.26562786 | 0.265625 | 2572 | 618 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| Amazon review product ids | | | | | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| LinearInterpol | 0.40625787 | 0.42187586 | 4302 | 606 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| MultiLinearInterpol | 0.42106587 | 0.42275995 | 5720 | 1241 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| MultiLinearInterpolV2 | 0.41900787 | 0.4225158 | 5285 | 1092 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| Bitpacked | 0.40625286 | 0.40625 | 2846 | 578 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| Amazon review product ids SORTED | | | | | 
++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| LinearInterpol | 0.35938287 | 0.39062586 | 4445 | 607 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| MultiLinearInterpol | 0.18557687 | 0.25088495 | 5554 | 1312 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| MultiLinearInterpolV2 | 0.18364687 | 0.25064084 | 5371 | 1088 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ +| Bitpacked | 0.40625286 | 0.40625 | 2699 | 567 | ++----------------------------------+-------------------+------------------------------+--------------------------+----------------------+ ``` diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index 9285321ea..baa88b087 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -8,6 +8,7 @@ use std::io::Write; pub mod bitpacked; pub mod linearinterpol; pub mod multilinearinterpol; +pub mod multilinearinterpol_v2; pub trait FastFieldCodecReader: Sized { /// reads the metadata and returns the CodecReader diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs index 18fef5c60..b7bd5f1ee 100644 --- a/fastfield_codecs/src/main.rs +++ b/fastfield_codecs/src/main.rs @@ -1,25 +1,55 @@ #[macro_use] extern crate prettytable; -use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer; -use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer; -use fastfield_codecs::{FastFieldCodecSerializer, FastFieldStats}; +use fastfield_codecs::bitpacked::BitpackedFastFieldReader; +use fastfield_codecs::linearinterpol::LinearInterpolFastFieldReader; +use 
fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldReader; +use fastfield_codecs::multilinearinterpol_v2::MultiLinearInterpolV2FastFieldReader; +use fastfield_codecs::FastFieldCodecReader; +use fastfield_codecs::{ + linearinterpol::LinearInterpolFastFieldSerializer, + multilinearinterpol::MultiLinearInterpolFastFieldSerializer, + multilinearinterpol_v2::MultiLinearInterpolV2FastFieldSerializer, FastFieldCodecSerializer, + FastFieldStats, +}; use prettytable::{Cell, Row, Table}; +use std::fs::File; +use std::io; +use std::io::BufRead; +use std::time::{Duration, Instant}; fn main() { let mut table = Table::new(); // Add a row per time - table.add_row(row!["", "Compression Ratio", "Compression Estimation"]); + table.add_row(row![ + "", + "Compression ratio", + "Compression ratio estimation", + "Compression time (micro)", + "Reading time (micro)" + ]); for (data, data_set_name) in get_codec_test_data_sets() { let mut results = vec![]; - let res = serialize_with_codec::(&data); + let res = serialize_with_codec::< + LinearInterpolFastFieldSerializer, + LinearInterpolFastFieldReader, + >(&data); results.push(res); - let res = serialize_with_codec::(&data); + let res = serialize_with_codec::< + MultiLinearInterpolFastFieldSerializer, + MultiLinearInterpolFastFieldReader, + >(&data); results.push(res); - let res = serialize_with_codec::( - &data, - ); + let res = serialize_with_codec::< + MultiLinearInterpolV2FastFieldSerializer, + MultiLinearInterpolV2FastFieldReader, + >(&data); + results.push(res); + let res = serialize_with_codec::< + fastfield_codecs::bitpacked::BitpackedFastFieldSerializer, + BitpackedFastFieldReader, + >(&data); results.push(res); // let best_estimation_codec = results @@ -33,7 +63,7 @@ fn main() { .unwrap(); table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")])); - for (is_applicable, est, comp, name) in results { + for (is_applicable, est, comp, name, compression_duration, read_duration) in results { let 
(est_cell, ratio_cell) = if !is_applicable { ("Codec Disabled".to_string(), "".to_string()) } else { @@ -49,6 +79,8 @@ fn main() { Cell::new(name).style_spec("bFg"), Cell::new(&ratio_cell).style_spec(style), Cell::new(&est_cell).style_spec(""), + Cell::new(&compression_duration.as_micros().to_string()), + Cell::new(&read_duration.as_micros().to_string()), ])); } } @@ -88,16 +120,56 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec, &'static str)> { .collect::>(); data_and_names.push((data, "Almost monotonically increasing")); + let data = (1000..=200_000_u64) + .map(|_| rand::random::() as u64) + .collect::>(); + data_and_names.push((data, "Random")); + + let mut data = load_dataset("datasets/hdfs_logs_timestamps.txt"); + data_and_names.push((data.clone(), "HDFS logs timestamps")); + + data.sort_unstable(); + data_and_names.push((data, "HDFS logs timestamps SORTED")); + + let data = load_dataset("datasets/http_logs_timestamps.txt"); + data_and_names.push((data, "HTTP logs timestamps SORTED")); + + let mut data = load_dataset("datasets/amazon_reviews_product_ids.txt"); + data_and_names.push((data.clone(), "Amazon review product ids")); + + data.sort_unstable(); + data_and_names.push((data, "Amazon review product ids SORTED")); + data_and_names } -pub fn serialize_with_codec( +pub fn load_dataset(file_path: &str) -> Vec { + println!("Load dataset from `{}`", file_path); + let file = File::open(file_path).expect("Error when opening file."); + let lines = io::BufReader::new(file).lines(); + let mut data = Vec::new(); + for line in lines { + let l = line.unwrap(); + data.push(l.parse::().unwrap()); + } + data +} + +pub fn serialize_with_codec( data: &[u64], -) -> (bool, f32, f32, &'static str) { +) -> (bool, f32, f32, &'static str, Duration, Duration) { let is_applicable = S::is_applicable(&data, stats_from_vec(data)); if !is_applicable { - return (false, 0.0, 0.0, S::NAME); + return ( + false, + 0.0, + 0.0, + S::NAME, + Duration::from_secs(0), + Duration::from_secs(0), 
+ ); } + let start_time_compression = Instant::now(); let estimation = S::estimate(&data, stats_from_vec(data)); let mut out = vec![]; S::serialize( @@ -108,9 +180,23 @@ pub fn serialize_with_codec( data.iter().cloned(), ) .unwrap(); - + let elasped_time_compression = start_time_compression.elapsed(); let actual_compression = out.len() as f32 / (data.len() * 8) as f32; - (true, estimation, actual_compression, S::NAME) + + let reader = R::open_from_bytes(&out).unwrap(); + let start_time_read = Instant::now(); + for doc in 0..data.len() { + reader.get_u64(doc as u64, &out); + } + let elapsed_time_read = start_time_read.elapsed(); + ( + true, + estimation, + actual_compression, + S::NAME, + elasped_time_compression, + elapsed_time_read, + ) } pub fn stats_from_vec(data: &[u64]) -> FastFieldStats { diff --git a/fastfield_codecs/src/multilinearinterpol_v2.rs b/fastfield_codecs/src/multilinearinterpol_v2.rs new file mode 100644 index 000000000..da7057396 --- /dev/null +++ b/fastfield_codecs/src/multilinearinterpol_v2.rs @@ -0,0 +1,375 @@ +/*! + +MultiLinearInterpolV2 compressor uses blockwise linear interpolation to guess values and stores the difference between the actual value +and the one given by the linear interpolation. +This is done for every block of 512 values. For every block, our linear function can be expressed as +`computed_value = slope * block_position + first_value - positive_offset` +where: +- `block_position` is the position inside of the block from 0 to 511 +- `first_value` is the first value of the block +- `positive_offset` is computed such that we ensure the diff `real_value - computed_value` is never negative. + +21 bytes are needed to store the block metadata, which adds an overhead of 21 * 8 / 512 ≈ 0.33 bits per element. 
+ +*/ + +use crate::FastFieldCodecReader; +use crate::FastFieldCodecSerializer; +use crate::FastFieldDataAccess; +use crate::FastFieldStats; +use std::io::{self, Read, Write}; +use std::ops::Sub; +use tantivy_bitpacker::compute_num_bits; +use tantivy_bitpacker::BitPacker; + +use common::BinarySerializable; +use common::DeserializeFrom; +use tantivy_bitpacker::BitUnpacker; + +const BLOCK_SIZE: u64 = 512; + +#[derive(Clone)] +pub struct MultiLinearInterpolV2FastFieldReader { + num_vals: u64, + min_value: u64, + max_value: u64, + block_readers: Vec, +} + +/// Block metadata needed to define the linear function `y = a.x + b` +/// and to bitpack the difference between the real value and the +/// linear function computed value where: +/// - `a` is the `slope` +/// - `b` is the `first_value` in the block minus an offset +/// `positive_offset` which ensures that the difference between the real +/// value and the linear function computed value is never negative. +#[derive(Clone, Debug, Default)] +struct BlockMetadata { + first_value: u64, + positive_offset: u64, + slope: f32, + num_bits: u8, // bit width used to pack every diff of the block +} + +#[derive(Clone, Debug, Default)] +struct BlockReader { + metadata: BlockMetadata, + start_offset: u64, // byte offset of the block's packed diffs inside the data slice + bit_unpacker: BitUnpacker, +} + +impl BlockReader { + fn new(metadata: BlockMetadata, start_offset: u64) -> Self { + Self { + bit_unpacker: BitUnpacker::new(metadata.num_bits), + metadata, + start_offset, + } + } + + #[inline] + fn get_u64(&self, block_pos: u64, data: &[u8]) -> u64 { + let diff = self.bit_unpacker.get(block_pos, &data[self.start_offset as usize..]); + let computed_value = get_computed_value(self.metadata.first_value, block_pos, self.metadata.slope); + (computed_value + diff) - self.metadata.positive_offset // undo the offset added to the diff at serialization + } +} + +impl BinarySerializable for BlockMetadata { + fn serialize(&self, write: &mut W) -> io::Result<()> { + self.first_value.serialize(write)?; + self.positive_offset.serialize(write)?; + self.slope.serialize(write)?; + 
self.num_bits.serialize(write)?; + Ok(()) + } + + fn deserialize(reader: &mut R) -> io::Result { + let constant = u64::deserialize(reader)?; + let constant_positive_offset = u64::deserialize(reader)?; + let slope = f32::deserialize(reader)?; + let num_bits = u8::deserialize(reader)?; + Ok(Self { + first_value: constant, + positive_offset: constant_positive_offset, + slope, + num_bits, + }) + } +} + +#[derive(Clone, Debug)] +pub struct MultiLinearInterpolV2Footer { + pub num_vals: u64, + pub min_value: u64, + pub max_value: u64, + block_metadatas: Vec, +} + +impl BinarySerializable for MultiLinearInterpolV2Footer { + fn serialize(&self, write: &mut W) -> io::Result<()> { + let mut out = vec![]; + self.num_vals.serialize(&mut out)?; + self.min_value.serialize(&mut out)?; + self.max_value.serialize(&mut out)?; + self.block_metadatas.serialize(&mut out)?; + write.write_all(&out)?; + (out.len() as u32).serialize(write)?; + Ok(()) + } + + fn deserialize(reader: &mut R) -> io::Result { + let footer = Self { + num_vals: u64::deserialize(reader)?, + min_value: u64::deserialize(reader)?, + max_value: u64::deserialize(reader)?, + block_metadatas: Vec::::deserialize(reader)?, + }; + Ok(footer) + } +} + +impl FastFieldCodecReader for MultiLinearInterpolV2FastFieldReader { + /// Opens a fast field given a file. 
+ fn open_from_bytes(bytes: &[u8]) -> io::Result { + let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?; + let (_, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize); + let footer = MultiLinearInterpolV2Footer::deserialize(&mut footer)?; + let mut block_readers = Vec::with_capacity(footer.block_metadatas.len()); + let mut current_data_offset = 0; + for block_metadata in footer.block_metadatas.into_iter() { + let num_bits = block_metadata.num_bits; + block_readers.push(BlockReader::new(block_metadata, current_data_offset)); + current_data_offset += num_bits as u64 * BLOCK_SIZE / 8; // each block is counted as BLOCK_SIZE * num_bits bits + } + Ok(Self { + num_vals: footer.num_vals, + min_value: footer.min_value, + max_value: footer.max_value, + block_readers + }) + } + + #[inline] + fn get_u64(&self, doc: u64, data: &[u8]) -> u64 { + let block_idx = (doc / BLOCK_SIZE) as usize; + let block_pos = doc - (block_idx as u64) * BLOCK_SIZE; + let block_reader = &self.block_readers[block_idx]; + block_reader.get_u64(block_pos, data) + } + + #[inline] + fn min_value(&self) -> u64 { + self.min_value + } + #[inline] + fn max_value(&self) -> u64 { + self.max_value + } +} + +#[inline] +fn get_computed_value(first_val: u64, pos: u64, slope: f32) -> u64 { + (first_val as i64 + (pos as f32 * slope) as i64) as u64 // the interpolated delta is truncated to i64 and may be negative +} + +/// Same as LinearInterpolFastFieldSerializer, but working on blocks of BLOCK_SIZE elements. +pub struct MultiLinearInterpolV2FastFieldSerializer {} + +impl FastFieldCodecSerializer for MultiLinearInterpolV2FastFieldSerializer { + const NAME: &'static str = "MultiLinearInterpolV2"; + const ID: u8 = 4; + /// Serializes the values block by block, followed by the footer. 
+ fn serialize( + write: &mut impl Write, + _: &impl FastFieldDataAccess, + stats: FastFieldStats, + data_iter: impl Iterator<Item = u64>, + _data_iter1: impl Iterator<Item = u64>, + ) -> io::Result<()> { + let data = data_iter.collect::<Vec<_>>(); + let mut bit_packer = BitPacker::new(); + let mut block_metadatas = Vec::new(); + for data_pos in (0..data.len() as u64).step_by(BLOCK_SIZE as usize) { + let block_num_vals = BLOCK_SIZE.min(data.len() as u64 - data_pos) as usize; + let block_values = &data[data_pos as usize..data_pos as usize + block_num_vals]; + let slope = if block_num_vals == 1 { + 0f32 + } else { + ((block_values[block_values.len() - 1] as f64 - block_values[0] as f64) / (block_num_vals - 1) as f64) as f32 + }; + let first_value = block_values[0]; + let mut positive_offset = 0; // largest amount by which the line overshoots a real value + let mut max_delta = 0; // largest amount by which a real value exceeds the line + for (pos, &current_value) in block_values[1..] + .iter() + .enumerate() + { + let computed_value = get_computed_value(first_value, pos as u64 + 1, slope); + if computed_value > current_value { + positive_offset = positive_offset.max(computed_value - current_value); + } else { + max_delta = max_delta.max(current_value - computed_value); + } + } + let num_bits = compute_num_bits(max_delta + positive_offset); // stored diffs range over 0..=max_delta + positive_offset + for (pos, current_value) in block_values.iter().enumerate() + { + let computed_value = get_computed_value(first_value, pos as u64, slope); + let diff = (current_value + positive_offset) - computed_value; + bit_packer.write(diff, num_bits, write)?; + } + bit_packer.flush(write)?; + block_metadatas.push(BlockMetadata { + first_value, + positive_offset, + slope, + num_bits, + }); + } + bit_packer.close(write)?; + + let footer = MultiLinearInterpolV2Footer { + num_vals: stats.num_vals, + min_value: stats.min_value, + max_value: stats.max_value, + block_metadatas, + }; + footer.serialize(write)?; + Ok(()) + } + + fn is_applicable( + _fastfield_accessor: &impl FastFieldDataAccess, + stats: FastFieldStats, + ) -> bool { + if stats.num_vals < 10 * BLOCK_SIZE { + return false; + } + 
// On serialization the offset is added to the actual value. + // We need to make sure this won't run into overflow calculation issues. + // For this we take the maximum theoretical offset and add this to the max value. + // If this doesn't overflow the algorithm should be fine + let theoretical_maximum_offset = stats.max_value - stats.min_value; + if stats + .max_value + .checked_add(theoretical_maximum_offset) + .is_none() + { + return false; + } + true + } + + /// Estimates the compression ratio by fitting a line through the first block only and + /// extrapolating its cost to all blocks. This is a rough guess: the local maxima of the + /// deviation from the line and the resulting offset are unknown without a full scan. + fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 { + let first_val_in_first_block = fastfield_accessor.get_val(0); + let last_elem_in_first_chunk = BLOCK_SIZE.min(stats.num_vals); + let last_val_in_first_block = + fastfield_accessor.get_val(last_elem_in_first_chunk as u64 - 1); + let slope = ((last_val_in_first_block as f64 - first_val_in_first_block as f64) / (last_elem_in_first_chunk - 1).max(1) as f64) as f32; // slope over the first block's own length, as `serialize` computes it + + // let's sample at 0%, 5%, 10% .. 95% of the first block (100% would fall outside the block) + let sample_positions = (0..20) + .map(|pos| (last_elem_in_first_chunk as f32 / 100.0 * pos as f32 * 5.0) as usize) + .collect::<Vec<_>>(); + + let max_distance = sample_positions + .iter() + .map(|&pos| { + let calculated_value = + get_computed_value(first_val_in_first_block, pos as u64, slope); + let actual_value = fastfield_accessor.get_val(pos as u64); + distance(calculated_value, actual_value) + }) + .max() + .unwrap(); + + // Estimate one block and extrapolate the cost to all blocks. + // The theory is that sampling misses the actual max_distance, but gets within a 50% + // threshold of it, hence the factor 1.5. + // It is multiplied by 2 because in a log-like scenario the line would be as much above as + // below,
so the offset would be equal to max_distance + let relative_max_value = (max_distance as f32 * 1.5) * 2.0; + + let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64 + // block metadata: 21 bytes (21 * 8 bits) for every block, including a partial last one + + 21 * 8 * ((stats.num_vals + BLOCK_SIZE - 1) / BLOCK_SIZE); + let num_bits_uncompressed = 64 * stats.num_vals; + num_bits as f32 / num_bits_uncompressed as f32 + } +} + +fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T { + if x < y { + y - x + } else { + x - y + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::tests::get_codec_test_data_sets; + + fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { + crate::tests::create_and_validate::< + MultiLinearInterpolV2FastFieldSerializer, + MultiLinearInterpolV2FastFieldReader, + >(data, name) + } + + #[test] + fn test_compression() { + let data = (10..=6_000_u64).collect::>(); + let (estimate, actual_compression) = + create_and_validate(&data, "simple monotonically large"); + assert!(actual_compression < 0.2); + assert!(estimate < 0.01); // perfectly linear data: only the block metadata remains + assert!(estimate > 0.001); + assert!(actual_compression > 0.001); + } + + #[test] + fn test_with_codec_data_sets() { + let data_sets = get_codec_test_data_sets(); + for (mut data, name) in data_sets { + create_and_validate(&data, name); + data.reverse(); + create_and_validate(&data, name); + } + } + #[test] + fn test_simple() { + let data = (10..=20_u64).collect::>(); + create_and_validate(&data, "simple monotonically"); + } + + #[test] + fn border_cases_1() { + let data = (0..1024).collect::>(); + create_and_validate(&data, "border case"); + } + #[test] + fn border_case_2() { + let data = (0..1025).collect::>(); + create_and_validate(&data, "border case"); + } + #[test] + fn rand() { + for _ in 0..10 { + let mut data = (5_000..20_000) + .map(|_| rand::random::() as u64) + .collect::>(); + let (estimate, actual_compression) = create_and_validate(&data, "random"); + dbg!(estimate); + dbg!(actual_compression); + + data.reverse(); + create_and_validate(&data, "random"); + } + } 
+}