Add another multilinear interpolation and real world dataset.

This commit is contained in:
François Massot
2021-11-25 12:50:29 +01:00
parent 447811c111
commit 18d2ee5bb7
6 changed files with 590 additions and 51 deletions

1
fastfield_codecs/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
datasets/

View File

@@ -0,0 +1,6 @@
DATASETS ?= hdfs_logs_timestamps http_logs_timestamps amazon_reviews_product_ids
download:
@echo "--- Downloading datasets ---"
mkdir -p datasets
@for dataset in $(DATASETS); do curl -o - https://quickwit-datasets-public.s3.amazonaws.com/benchmarks/fastfields/$$dataset.txt.gz | gunzip > datasets/$$dataset.txt; done

View File

@@ -13,6 +13,9 @@ A codec needs to implement 2 traits:
- A reader implementing `FastFieldCodecReader` to read the codec.
- A serializer implementing `FastFieldCodecSerializer` for compression estimation and codec name + id.
### Download real world datasets for codecs comparison
Before comparing codecs, you need to execute `make download` to download real world datasets hosted on AWS S3.
### Tests
Once the traits are implemented test and benchmark integration is pretty easy (see `test_with_codec_data_sets` and `bench.rs`).
@@ -23,46 +26,113 @@ cargo run --features bin
```
### TODO
- Add real world data sets in comparison
- Add codec to cover sparse data sets
### Codec Comparison
```
+----------------------------------+-------------------+------------------------+
| | Compression Ratio | Compression Estimation |
+----------------------------------+-------------------+------------------------+
| Autoincrement | | |
+----------------------------------+-------------------+------------------------+
| LinearInterpol | 0.000039572664 | 0.000004396963 |
+----------------------------------+-------------------+------------------------+
| MultiLinearInterpol | 0.1477348 | 0.17275847 |
+----------------------------------+-------------------+------------------------+
| Bitpacked | 0.28126493 | 0.28125 |
+----------------------------------+-------------------+------------------------+
| Monotonically increasing concave | | |
+----------------------------------+-------------------+------------------------+
| LinearInterpol | 0.25003937 | 0.26562938 |
+----------------------------------+-------------------+------------------------+
| MultiLinearInterpol | 0.190665 | 0.1883836 |
+----------------------------------+-------------------+------------------------+
| Bitpacked | 0.31251436 | 0.3125 |
+----------------------------------+-------------------+------------------------+
| Monotonically increasing convex | | |
+----------------------------------+-------------------+------------------------+
| LinearInterpol | 0.25003937 | 0.28125438 |
+----------------------------------+-------------------+------------------------+
| MultiLinearInterpol | 0.18676 | 0.2040086 |
+----------------------------------+-------------------+------------------------+
| Bitpacked | 0.31251436 | 0.3125 |
+----------------------------------+-------------------+------------------------+
| Almost monotonically increasing | | |
+----------------------------------+-------------------+------------------------+
| LinearInterpol | 0.14066513 | 0.1562544 |
+----------------------------------+-------------------+------------------------+
| MultiLinearInterpol | 0.16335973 | 0.17275847 |
+----------------------------------+-------------------+------------------------+
| Bitpacked | 0.28126493 | 0.28125 |
+----------------------------------+-------------------+------------------------+
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| | Compression ratio | Compression ratio estimation | Compression time (micro) | Reading time (micro) |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Autoincrement | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| LinearInterpol | 0.000039572664 | 0.000004396963 | 731 | 0 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| MultiLinearInterpol | 0.022707172 | 0.17275847 | 1050 | 282 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| MultiLinearInterpolV2 | 0.0051544965 | 0.17251475 | 963 | 216 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 0.28126493 | 0.28125 | 473 | 112 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Monotonically increasing concave | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| LinearInterpol | 0.25003937 | 0.26562938 | 758 | 113 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| MultiLinearInterpol | 0.05403 | 0.1883836 | 902 | 223 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| MultiLinearInterpolV2 | 0.005955 | 0.18813984 | 896 | 217 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 0.31251436 | 0.3125 | 476 | 113 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Monotonically increasing convex | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| LinearInterpol | 0.25003937 | 0.28125438 | 757 | 113 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| MultiLinearInterpol | 0.053845 | 0.2040086 | 897 | 223 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| MultiLinearInterpolV2 | 0.00613 | 0.20376484 | 912 | 217 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 0.31251436 | 0.3125 | 510 | 113 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Almost monotonically increasing | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| LinearInterpol | 0.14066513 | 0.1406294 | 752 | 112 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| MultiLinearInterpol | 0.1472926 | 0.17275847 | 917 | 222 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| MultiLinearInterpolV2 | 0.14537804 | 0.17251475 | 931 | 216 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 0.28126493 | 0.28125 | 468 | 112 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Random | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| LinearInterpol | 0.14066513 | 0.1406294 | 769 | 112 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| MultiLinearInterpol | 0.1473328 | 0.14150847 | 960 | 222 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| MultiLinearInterpolV2 | 0.14549863 | 0.14126475 | 907 | 216 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 0.12501445 | 0.125 | 412 | 114 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| HDFS logs timestamps | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| LinearInterpol | 0.39063287 | 0.40625086 | 4808 | 606 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| MultiLinearInterpol | 0.39983186 | 0.40713495 | 5704 | 1197 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| MultiLinearInterpolV2 | 0.39826187 | 0.4068908 | 5914 | 1167 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 0.39062786 | 0.390625 | 2768 | 618 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| HDFS logs timestamps SORTED | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| LinearInterpol | 0.39063287 | 0.40625086 | 4118 | 573 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| MultiLinearInterpol | 0.034706876 | 0.09463495 | 5046 | 1118 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| MultiLinearInterpolV2 | 0.032736875 | 0.094390824 | 5098 | 1164 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 0.39062786 | 0.390625 | 2802 | 606 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| HTTP logs timestamps SORTED | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| LinearInterpol | 0.21875787 | 0.23437588 | 3953 | 567 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| MultiLinearInterpol | 0.050024875 | 0.20400995 | 4872 | 1179 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| MultiLinearInterpolV2 | 0.047942877 | 0.20376582 | 5389 | 1163 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 0.26562786 | 0.265625 | 2572 | 618 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Amazon review product ids | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| LinearInterpol | 0.40625787 | 0.42187586 | 4302 | 606 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| MultiLinearInterpol | 0.42106587 | 0.42275995 | 5720 | 1241 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| MultiLinearInterpolV2 | 0.41900787 | 0.4225158 | 5285 | 1092 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 0.40625286 | 0.40625 | 2846 | 578 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Amazon review product ids SORTED | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| LinearInterpol | 0.35938287 | 0.39062586 | 4445 | 607 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| MultiLinearInterpol | 0.18557687 | 0.25088495 | 5554 | 1312 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| MultiLinearInterpolV2 | 0.18364687 | 0.25064084 | 5371 | 1088 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 0.40625286 | 0.40625 | 2699 | 567 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
```

View File

@@ -8,6 +8,7 @@ use std::io::Write;
pub mod bitpacked;
pub mod linearinterpol;
pub mod multilinearinterpol;
pub mod multilinearinterpol_v2;
pub trait FastFieldCodecReader: Sized {
/// reads the metadata and returns the CodecReader

View File

@@ -1,25 +1,55 @@
#[macro_use]
extern crate prettytable;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
use fastfield_codecs::{FastFieldCodecSerializer, FastFieldStats};
use fastfield_codecs::bitpacked::BitpackedFastFieldReader;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldReader;
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldReader;
use fastfield_codecs::multilinearinterpol_v2::MultiLinearInterpolV2FastFieldReader;
use fastfield_codecs::FastFieldCodecReader;
use fastfield_codecs::{
linearinterpol::LinearInterpolFastFieldSerializer,
multilinearinterpol::MultiLinearInterpolFastFieldSerializer,
multilinearinterpol_v2::MultiLinearInterpolV2FastFieldSerializer, FastFieldCodecSerializer,
FastFieldStats,
};
use prettytable::{Cell, Row, Table};
use std::fs::File;
use std::io;
use std::io::BufRead;
use std::time::{Duration, Instant};
fn main() {
let mut table = Table::new();
// Add a row per time
table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);
table.add_row(row![
"",
"Compression ratio",
"Compression ratio estimation",
"Compression time (micro)",
"Reading time (micro)"
]);
for (data, data_set_name) in get_codec_test_data_sets() {
let mut results = vec![];
let res = serialize_with_codec::<LinearInterpolFastFieldSerializer>(&data);
let res = serialize_with_codec::<
LinearInterpolFastFieldSerializer,
LinearInterpolFastFieldReader,
>(&data);
results.push(res);
let res = serialize_with_codec::<MultiLinearInterpolFastFieldSerializer>(&data);
let res = serialize_with_codec::<
MultiLinearInterpolFastFieldSerializer,
MultiLinearInterpolFastFieldReader,
>(&data);
results.push(res);
let res = serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedFastFieldSerializer>(
&data,
);
let res = serialize_with_codec::<
MultiLinearInterpolV2FastFieldSerializer,
MultiLinearInterpolV2FastFieldReader,
>(&data);
results.push(res);
let res = serialize_with_codec::<
fastfield_codecs::bitpacked::BitpackedFastFieldSerializer,
BitpackedFastFieldReader,
>(&data);
results.push(res);
// let best_estimation_codec = results
@@ -33,7 +63,7 @@ fn main() {
.unwrap();
table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
for (is_applicable, est, comp, name) in results {
for (is_applicable, est, comp, name, compression_duration, read_duration) in results {
let (est_cell, ratio_cell) = if !is_applicable {
("Codec Disabled".to_string(), "".to_string())
} else {
@@ -49,6 +79,8 @@ fn main() {
Cell::new(name).style_spec("bFg"),
Cell::new(&ratio_cell).style_spec(style),
Cell::new(&est_cell).style_spec(""),
Cell::new(&compression_duration.as_micros().to_string()),
Cell::new(&read_duration.as_micros().to_string()),
]));
}
}
@@ -88,16 +120,56 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
.collect::<Vec<_>>();
data_and_names.push((data, "Almost monotonically increasing"));
let data = (1000..=200_000_u64)
.map(|_| rand::random::<u8>() as u64)
.collect::<Vec<_>>();
data_and_names.push((data, "Random"));
let mut data = load_dataset("datasets/hdfs_logs_timestamps.txt");
data_and_names.push((data.clone(), "HDFS logs timestamps"));
data.sort_unstable();
data_and_names.push((data, "HDFS logs timestamps SORTED"));
let data = load_dataset("datasets/http_logs_timestamps.txt");
data_and_names.push((data, "HTTP logs timestamps SORTED"));
let mut data = load_dataset("datasets/amazon_reviews_product_ids.txt");
data_and_names.push((data.clone(), "Amazon review product ids"));
data.sort_unstable();
data_and_names.push((data, "Amazon review product ids SORTED"));
data_and_names
}
pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
pub fn load_dataset(file_path: &str) -> Vec<u64> {
println!("Load dataset from `{}`", file_path);
let file = File::open(file_path).expect("Error when opening file.");
let lines = io::BufReader::new(file).lines();
let mut data = Vec::new();
for line in lines {
let l = line.unwrap();
data.push(l.parse::<u64>().unwrap());
}
data
}
pub fn serialize_with_codec<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
data: &[u64],
) -> (bool, f32, f32, &'static str) {
) -> (bool, f32, f32, &'static str, Duration, Duration) {
let is_applicable = S::is_applicable(&data, stats_from_vec(data));
if !is_applicable {
return (false, 0.0, 0.0, S::NAME);
return (
false,
0.0,
0.0,
S::NAME,
Duration::from_secs(0),
Duration::from_secs(0),
);
}
let start_time_compression = Instant::now();
let estimation = S::estimate(&data, stats_from_vec(data));
let mut out = vec![];
S::serialize(
@@ -108,9 +180,23 @@ pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
data.iter().cloned(),
)
.unwrap();
let elasped_time_compression = start_time_compression.elapsed();
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
(true, estimation, actual_compression, S::NAME)
let reader = R::open_from_bytes(&out).unwrap();
let start_time_read = Instant::now();
for doc in 0..data.len() {
reader.get_u64(doc as u64, &out);
}
let elapsed_time_read = start_time_read.elapsed();
(
true,
estimation,
actual_compression,
S::NAME,
elasped_time_compression,
elapsed_time_read,
)
}
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {

View File

@@ -0,0 +1,375 @@
/*!
MultiLinearInterpolV2 compressor uses blockwise linear interpolation to guess values and stores the difference between the actual value
and the one given by the linear interpolation.
This is done for every block of 512 values. For every block, our linear function can be expressed as
`computed_value = slope * block_position + first_value + positive_offset`
where:
- `block_position` is the position inside of the block from 0 to 511
- `first_value` is the first value on the block
- `positive_offset` is computed such that we ensure the diff `real_value - computed_value` is always positive.
21 bytes is needed to store the block metadata, it adds an overhead of 21 * 8 / 512 = 0,33 bits per element.
*/
use crate::FastFieldCodecReader;
use crate::FastFieldCodecSerializer;
use crate::FastFieldDataAccess;
use crate::FastFieldStats;
use std::io::{self, Read, Write};
use std::ops::Sub;
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitPacker;
use common::BinarySerializable;
use common::DeserializeFrom;
use tantivy_bitpacker::BitUnpacker;
const BLOCK_SIZE: u64 = 512;
#[derive(Clone)]
pub struct MultiLinearInterpolV2FastFieldReader {
num_vals: u64,
min_value: u64,
max_value: u64,
block_readers: Vec<BlockReader>,
}
/// Block metadata needed to define the linear function `y = a.x + b`
/// and to bitpack the difference between the real value and the
/// the linear function computed value where:
/// - `a` is the `slope`
/// - `b` is the sum of the `first_value` in the block + an offset
/// `positive_offset` which ensures that difference between the real
/// value and the linear function computed value is always positive.
#[derive(Clone, Debug, Default)]
struct BlockMetadata {
first_value: u64,
positive_offset: u64,
slope: f32,
num_bits: u8,
}
#[derive(Clone, Debug, Default)]
struct BlockReader {
metadata: BlockMetadata,
start_offset: u64,
bit_unpacker: BitUnpacker,
}
impl BlockReader {
fn new(metadata: BlockMetadata, start_offset: u64) -> Self {
Self {
bit_unpacker: BitUnpacker::new(metadata.num_bits),
metadata,
start_offset,
}
}
#[inline]
fn get_u64(&self, block_pos: u64, data: &[u8]) -> u64 {
let diff = self.bit_unpacker.get(block_pos, &data[self.start_offset as usize..]);
let computed_value = get_computed_value(self.metadata.first_value, block_pos, self.metadata.slope);
(computed_value + diff) - self.metadata.positive_offset
}
}
impl BinarySerializable for BlockMetadata {
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
self.first_value.serialize(write)?;
self.positive_offset.serialize(write)?;
self.slope.serialize(write)?;
self.num_bits.serialize(write)?;
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let constant = u64::deserialize(reader)?;
let constant_positive_offset = u64::deserialize(reader)?;
let slope = f32::deserialize(reader)?;
let num_bits = u8::deserialize(reader)?;
Ok(Self {
first_value: constant,
positive_offset: constant_positive_offset,
slope,
num_bits,
})
}
}
#[derive(Clone, Debug)]
pub struct MultiLinearInterpolV2Footer {
pub num_vals: u64,
pub min_value: u64,
pub max_value: u64,
block_metadatas: Vec<BlockMetadata>,
}
impl BinarySerializable for MultiLinearInterpolV2Footer {
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
let mut out = vec![];
self.num_vals.serialize(&mut out)?;
self.min_value.serialize(&mut out)?;
self.max_value.serialize(&mut out)?;
self.block_metadatas.serialize(&mut out)?;
write.write_all(&out)?;
(out.len() as u32).serialize(write)?;
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let footer = Self {
num_vals: u64::deserialize(reader)?,
min_value: u64::deserialize(reader)?,
max_value: u64::deserialize(reader)?,
block_metadatas: Vec::<BlockMetadata>::deserialize(reader)?,
};
Ok(footer)
}
}
impl FastFieldCodecReader for MultiLinearInterpolV2FastFieldReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
let (_, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
let footer = MultiLinearInterpolV2Footer::deserialize(&mut footer)?;
let mut block_readers = Vec::with_capacity(footer.block_metadatas.len());
let mut current_data_offset = 0;
for block_metadata in footer.block_metadatas.into_iter() {
let num_bits = block_metadata.num_bits;
block_readers.push(BlockReader::new(block_metadata, current_data_offset));
current_data_offset += num_bits as u64 * BLOCK_SIZE / 8;
}
Ok(Self {
num_vals: footer.num_vals,
min_value: footer.min_value,
max_value: footer.max_value,
block_readers
})
}
#[inline]
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
let block_idx = (doc / BLOCK_SIZE) as usize;
let block_pos = doc - (block_idx as u64) * BLOCK_SIZE;
let block_reader = &self.block_readers[block_idx];
block_reader.get_u64(block_pos, data)
}
#[inline]
fn min_value(&self) -> u64 {
self.min_value
}
#[inline]
fn max_value(&self) -> u64 {
self.max_value
}
}
#[inline]
fn get_computed_value(first_val: u64, pos: u64, slope: f32) -> u64 {
(first_val as i64 + (pos as f32 * slope) as i64) as u64
}
/// Same as LinearInterpolFastFieldSerializer, but working on chunks of CHUNK_SIZE elements.
pub struct MultiLinearInterpolV2FastFieldSerializer {}
impl FastFieldCodecSerializer for MultiLinearInterpolV2FastFieldSerializer {
const NAME: &'static str = "MultiLinearInterpolV2";
const ID: u8 = 4;
/// Creates a new fast field serializer.
fn serialize(
write: &mut impl Write,
_: &impl FastFieldDataAccess,
stats: FastFieldStats,
data_iter: impl Iterator<Item = u64>,
_data_iter1: impl Iterator<Item = u64>,
) -> io::Result<()> {
let mut data = data_iter.collect::<Vec<_>>();
let mut bit_packer = BitPacker::new();
let mut block_metadatas = Vec::new();
for data_pos in (0..data.len() as u64).step_by(BLOCK_SIZE as usize) {
let block_num_vals = BLOCK_SIZE.min(data.len() as u64 - data_pos) as usize;
let block_values = &mut data[data_pos as usize..data_pos as usize + block_num_vals];
let slope = if block_num_vals == 1 {
0f32
} else {
((block_values[block_values.len() - 1] as f64 - block_values[0] as f64) / (block_num_vals - 1) as f64) as f32
};
let first_value = block_values[0];
let mut positive_offset = 0;
let mut max_delta = 0;
for (pos, &current_value) in block_values[1..]
.iter()
.enumerate()
{
let computed_value = get_computed_value(first_value, pos as u64 + 1, slope);
if computed_value > current_value {
positive_offset = positive_offset.max(computed_value - current_value);
} else {
max_delta = max_delta.max(current_value - computed_value);
}
}
let num_bits = compute_num_bits(max_delta + positive_offset);
for (pos, current_value) in block_values.iter().enumerate()
{
let computed_value = get_computed_value(first_value, pos as u64, slope);
let diff = (current_value + positive_offset) - computed_value;
bit_packer.write(diff, num_bits, write)?;
}
bit_packer.flush(write)?;
block_metadatas.push(BlockMetadata {
first_value,
positive_offset,
slope,
num_bits,
});
}
bit_packer.close(write)?;
let footer = MultiLinearInterpolV2Footer {
num_vals: stats.num_vals,
min_value: stats.min_value,
max_value: stats.max_value,
block_metadatas,
};
footer.serialize(write)?;
Ok(())
}
fn is_applicable(
_fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
) -> bool {
if stats.num_vals < 10 * BLOCK_SIZE {
return false;
}
// On serialization the offset is added to the actual value.
// We need to make sure this won't run into overflow calculation issues.
// For this we take the maximum theroretical offset and add this to the max value.
// If this doesn't overflow the algortihm should be fine
let theorethical_maximum_offset = stats.max_value - stats.min_value;
if stats
.max_value
.checked_add(theorethical_maximum_offset)
.is_none()
{
return false;
}
true
}
/// Estimation for linear interpolation is hard because, you don't know
/// where the local maxima are for the deviation of the calculated value and
/// the offset is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
let first_val_in_first_block = fastfield_accessor.get_val(0);
let last_elem_in_first_chunk = BLOCK_SIZE.min(stats.num_vals);
let last_val_in_first_block =
fastfield_accessor.get_val(last_elem_in_first_chunk as u64 - 1);
let slope = ((last_val_in_first_block as f64 - first_val_in_first_block as f64) / (stats.num_vals - 1) as f64) as f32;
// let's sample at 0%, 5%, 10% .. 95%, 100%, but for the first block only
let sample_positions = (0..20)
.map(|pos| (last_elem_in_first_chunk as f32 / 100.0 * pos as f32 * 5.0) as usize)
.collect::<Vec<_>>();
let max_distance = sample_positions
.iter()
.map(|&pos| {
let calculated_value =
get_computed_value(first_val_in_first_block, pos as u64, slope);
let actual_value = fastfield_accessor.get_val(pos as u64);
distance(calculated_value, actual_value)
})
.max()
.unwrap();
// Estimate one block and extrapolate the cost to all blocks.
// the theory would be that we don't have the actual max_distance, but we are close within 50%
// threshold.
// It is multiplied by 2 because in a log case scenario the line would be as much above as
// below. So the offset would = max_distance
let relative_max_value = (max_distance as f32 * 1.5) * 2.0;
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
// function metadata per block
+ 21 * (stats.num_vals / BLOCK_SIZE);
let num_bits_uncompressed = 64 * stats.num_vals;
num_bits as f32 / num_bits_uncompressed as f32
}
}
fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
if x < y {
y - x
} else {
x - y
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate::<
MultiLinearInterpolV2FastFieldSerializer,
MultiLinearInterpolV2FastFieldReader,
>(data, name)
}
#[test]
fn test_compression() {
let data = (10..=6_000_u64).collect::<Vec<_>>();
let (estimate, actual_compression) =
create_and_validate(&data, "simple monotonically large");
assert!(actual_compression < 0.2);
assert!(estimate < 0.20);
assert!(estimate > 0.15);
assert!(actual_compression > 0.001);
}
#[test]
fn test_with_codec_data_sets() {
let data_sets = get_codec_test_data_sets();
for (mut data, name) in data_sets {
create_and_validate(&data, name);
data.reverse();
create_and_validate(&data, name);
}
}
#[test]
fn test_simple() {
let data = (10..=20_u64).collect::<Vec<_>>();
create_and_validate(&data, "simple monotonically");
}
#[test]
fn border_cases_1() {
let data = (0..1024).collect::<Vec<_>>();
create_and_validate(&data, "border case");
}
#[test]
fn border_case_2() {
let data = (0..1025).collect::<Vec<_>>();
create_and_validate(&data, "border case");
}
#[test]
fn rand() {
for _ in 0..10 {
let mut data = (5_000..20_000)
.map(|_| rand::random::<u32>() as u64)
.collect::<Vec<_>>();
let (estimate, actual_compression) = create_and_validate(&data, "random");
dbg!(estimate);
dbg!(actual_compression);
data.reverse();
create_and_validate(&data, "random");
}
}
}