mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-26 21:20:40 +00:00
Add another multilinear interpolation and real world dataset.
This commit is contained in:
1
fastfield_codecs/.gitignore
vendored
Normal file
1
fastfield_codecs/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
datasets/
|
||||
6
fastfield_codecs/Makefile
Normal file
6
fastfield_codecs/Makefile
Normal file
@@ -0,0 +1,6 @@
|
||||
DATASETS ?= hdfs_logs_timestamps http_logs_timestamps amazon_reviews_product_ids
|
||||
download:
|
||||
@echo "--- Downloading datasets ---"
|
||||
mkdir -p datasets
|
||||
@for dataset in $(DATASETS); do curl -o - https://quickwit-datasets-public.s3.amazonaws.com/benchmarks/fastfields/$$dataset.txt.gz | gunzip > datasets/$$dataset.txt; done
|
||||
|
||||
@@ -13,6 +13,9 @@ A codec needs to implement 2 traits:
|
||||
- A reader implementing `FastFieldCodecReader` to read the codec.
|
||||
- A serializer implementing `FastFieldCodecSerializer` for compression estimation and codec name + id.
|
||||
|
||||
### Download real world datasets for codecs comparison
|
||||
Before comparing codecs, you need to execute `make download` to download real world datasets hosted on AWS S3.
|
||||
|
||||
### Tests
|
||||
|
||||
Once the traits are implemented test and benchmark integration is pretty easy (see `test_with_codec_data_sets` and `bench.rs`).
|
||||
@@ -23,46 +26,113 @@ cargo run --features bin
|
||||
```
|
||||
|
||||
### TODO
|
||||
- Add real world data sets in comparison
|
||||
- Add codec to cover sparse data sets
|
||||
|
||||
|
||||
### Codec Comparison
|
||||
```
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| | Compression Ratio | Compression Estimation |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Autoincrement | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.000039572664 | 0.000004396963 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.1477348 | 0.17275847 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.28126493 | 0.28125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Monotonically increasing concave | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.25003937 | 0.26562938 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.190665 | 0.1883836 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.31251436 | 0.3125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Monotonically increasing convex | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.25003937 | 0.28125438 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.18676 | 0.2040086 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.31251436 | 0.3125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Almost monotonically increasing | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.14066513 | 0.1562544 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.16335973 | 0.17275847 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.28126493 | 0.28125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| | Compression ratio | Compression ratio estimation | Compression time (micro) | Reading time (micro) |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Autoincrement | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| LinearInterpol | 0.000039572664 | 0.000004396963 | 731 | 0 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| MultiLinearInterpol | 0.022707172 | 0.17275847 | 1050 | 282 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| MultiLinearInterpolV2 | 0.0051544965 | 0.17251475 | 963 | 216 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.28126493 | 0.28125 | 473 | 112 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Monotonically increasing concave | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| LinearInterpol | 0.25003937 | 0.26562938 | 758 | 113 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| MultiLinearInterpol | 0.05403 | 0.1883836 | 902 | 223 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| MultiLinearInterpolV2 | 0.005955 | 0.18813984 | 896 | 217 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.31251436 | 0.3125 | 476 | 113 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Monotonically increasing convex | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| LinearInterpol | 0.25003937 | 0.28125438 | 757 | 113 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| MultiLinearInterpol | 0.053845 | 0.2040086 | 897 | 223 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| MultiLinearInterpolV2 | 0.00613 | 0.20376484 | 912 | 217 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.31251436 | 0.3125 | 510 | 113 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Almost monotonically increasing | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| LinearInterpol | 0.14066513 | 0.1406294 | 752 | 112 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| MultiLinearInterpol | 0.1472926 | 0.17275847 | 917 | 222 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| MultiLinearInterpolV2 | 0.14537804 | 0.17251475 | 931 | 216 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.28126493 | 0.28125 | 468 | 112 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Random | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| LinearInterpol | 0.14066513 | 0.1406294 | 769 | 112 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| MultiLinearInterpol | 0.1473328 | 0.14150847 | 960 | 222 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| MultiLinearInterpolV2 | 0.14549863 | 0.14126475 | 907 | 216 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.12501445 | 0.125 | 412 | 114 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| HDFS logs timestamps | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| LinearInterpol | 0.39063287 | 0.40625086 | 4808 | 606 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| MultiLinearInterpol | 0.39983186 | 0.40713495 | 5704 | 1197 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| MultiLinearInterpolV2 | 0.39826187 | 0.4068908 | 5914 | 1167 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.39062786 | 0.390625 | 2768 | 618 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| HDFS logs timestamps SORTED | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| LinearInterpol | 0.39063287 | 0.40625086 | 4118 | 573 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| MultiLinearInterpol | 0.034706876 | 0.09463495 | 5046 | 1118 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| MultiLinearInterpolV2 | 0.032736875 | 0.094390824 | 5098 | 1164 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.39062786 | 0.390625 | 2802 | 606 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| HTTP logs timestamps SORTED | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| LinearInterpol | 0.21875787 | 0.23437588 | 3953 | 567 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| MultiLinearInterpol | 0.050024875 | 0.20400995 | 4872 | 1179 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| MultiLinearInterpolV2 | 0.047942877 | 0.20376582 | 5389 | 1163 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.26562786 | 0.265625 | 2572 | 618 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Amazon review product ids | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| LinearInterpol | 0.40625787 | 0.42187586 | 4302 | 606 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| MultiLinearInterpol | 0.42106587 | 0.42275995 | 5720 | 1241 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| MultiLinearInterpolV2 | 0.41900787 | 0.4225158 | 5285 | 1092 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.40625286 | 0.40625 | 2846 | 578 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Amazon review product ids SORTED | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| LinearInterpol | 0.35938287 | 0.39062586 | 4445 | 607 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| MultiLinearInterpol | 0.18557687 | 0.25088495 | 5554 | 1312 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| MultiLinearInterpolV2 | 0.18364687 | 0.25064084 | 5371 | 1088 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.40625286 | 0.40625 | 2699 | 567 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
|
||||
```
|
||||
|
||||
@@ -8,6 +8,7 @@ use std::io::Write;
|
||||
pub mod bitpacked;
|
||||
pub mod linearinterpol;
|
||||
pub mod multilinearinterpol;
|
||||
pub mod multilinearinterpol_v2;
|
||||
|
||||
pub trait FastFieldCodecReader: Sized {
|
||||
/// reads the metadata and returns the CodecReader
|
||||
|
||||
@@ -1,25 +1,55 @@
|
||||
#[macro_use]
|
||||
extern crate prettytable;
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::{FastFieldCodecSerializer, FastFieldStats};
|
||||
use fastfield_codecs::bitpacked::BitpackedFastFieldReader;
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldReader;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldReader;
|
||||
use fastfield_codecs::multilinearinterpol_v2::MultiLinearInterpolV2FastFieldReader;
|
||||
use fastfield_codecs::FastFieldCodecReader;
|
||||
use fastfield_codecs::{
|
||||
linearinterpol::LinearInterpolFastFieldSerializer,
|
||||
multilinearinterpol::MultiLinearInterpolFastFieldSerializer,
|
||||
multilinearinterpol_v2::MultiLinearInterpolV2FastFieldSerializer, FastFieldCodecSerializer,
|
||||
FastFieldStats,
|
||||
};
|
||||
use prettytable::{Cell, Row, Table};
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::io::BufRead;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
fn main() {
|
||||
let mut table = Table::new();
|
||||
|
||||
// Add a row per time
|
||||
table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);
|
||||
table.add_row(row![
|
||||
"",
|
||||
"Compression ratio",
|
||||
"Compression ratio estimation",
|
||||
"Compression time (micro)",
|
||||
"Reading time (micro)"
|
||||
]);
|
||||
|
||||
for (data, data_set_name) in get_codec_test_data_sets() {
|
||||
let mut results = vec![];
|
||||
let res = serialize_with_codec::<LinearInterpolFastFieldSerializer>(&data);
|
||||
let res = serialize_with_codec::<
|
||||
LinearInterpolFastFieldSerializer,
|
||||
LinearInterpolFastFieldReader,
|
||||
>(&data);
|
||||
results.push(res);
|
||||
let res = serialize_with_codec::<MultiLinearInterpolFastFieldSerializer>(&data);
|
||||
let res = serialize_with_codec::<
|
||||
MultiLinearInterpolFastFieldSerializer,
|
||||
MultiLinearInterpolFastFieldReader,
|
||||
>(&data);
|
||||
results.push(res);
|
||||
let res = serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedFastFieldSerializer>(
|
||||
&data,
|
||||
);
|
||||
let res = serialize_with_codec::<
|
||||
MultiLinearInterpolV2FastFieldSerializer,
|
||||
MultiLinearInterpolV2FastFieldReader,
|
||||
>(&data);
|
||||
results.push(res);
|
||||
let res = serialize_with_codec::<
|
||||
fastfield_codecs::bitpacked::BitpackedFastFieldSerializer,
|
||||
BitpackedFastFieldReader,
|
||||
>(&data);
|
||||
results.push(res);
|
||||
|
||||
// let best_estimation_codec = results
|
||||
@@ -33,7 +63,7 @@ fn main() {
|
||||
.unwrap();
|
||||
|
||||
table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
|
||||
for (is_applicable, est, comp, name) in results {
|
||||
for (is_applicable, est, comp, name, compression_duration, read_duration) in results {
|
||||
let (est_cell, ratio_cell) = if !is_applicable {
|
||||
("Codec Disabled".to_string(), "".to_string())
|
||||
} else {
|
||||
@@ -49,6 +79,8 @@ fn main() {
|
||||
Cell::new(name).style_spec("bFg"),
|
||||
Cell::new(&ratio_cell).style_spec(style),
|
||||
Cell::new(&est_cell).style_spec(""),
|
||||
Cell::new(&compression_duration.as_micros().to_string()),
|
||||
Cell::new(&read_duration.as_micros().to_string()),
|
||||
]));
|
||||
}
|
||||
}
|
||||
@@ -88,16 +120,56 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
.collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Almost monotonically increasing"));
|
||||
|
||||
let data = (1000..=200_000_u64)
|
||||
.map(|_| rand::random::<u8>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Random"));
|
||||
|
||||
let mut data = load_dataset("datasets/hdfs_logs_timestamps.txt");
|
||||
data_and_names.push((data.clone(), "HDFS logs timestamps"));
|
||||
|
||||
data.sort_unstable();
|
||||
data_and_names.push((data, "HDFS logs timestamps SORTED"));
|
||||
|
||||
let data = load_dataset("datasets/http_logs_timestamps.txt");
|
||||
data_and_names.push((data, "HTTP logs timestamps SORTED"));
|
||||
|
||||
let mut data = load_dataset("datasets/amazon_reviews_product_ids.txt");
|
||||
data_and_names.push((data.clone(), "Amazon review product ids"));
|
||||
|
||||
data.sort_unstable();
|
||||
data_and_names.push((data, "Amazon review product ids SORTED"));
|
||||
|
||||
data_and_names
|
||||
}
|
||||
|
||||
pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
|
||||
pub fn load_dataset(file_path: &str) -> Vec<u64> {
|
||||
println!("Load dataset from `{}`", file_path);
|
||||
let file = File::open(file_path).expect("Error when opening file.");
|
||||
let lines = io::BufReader::new(file).lines();
|
||||
let mut data = Vec::new();
|
||||
for line in lines {
|
||||
let l = line.unwrap();
|
||||
data.push(l.parse::<u64>().unwrap());
|
||||
}
|
||||
data
|
||||
}
|
||||
|
||||
pub fn serialize_with_codec<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
|
||||
data: &[u64],
|
||||
) -> (bool, f32, f32, &'static str) {
|
||||
) -> (bool, f32, f32, &'static str, Duration, Duration) {
|
||||
let is_applicable = S::is_applicable(&data, stats_from_vec(data));
|
||||
if !is_applicable {
|
||||
return (false, 0.0, 0.0, S::NAME);
|
||||
return (
|
||||
false,
|
||||
0.0,
|
||||
0.0,
|
||||
S::NAME,
|
||||
Duration::from_secs(0),
|
||||
Duration::from_secs(0),
|
||||
);
|
||||
}
|
||||
let start_time_compression = Instant::now();
|
||||
let estimation = S::estimate(&data, stats_from_vec(data));
|
||||
let mut out = vec![];
|
||||
S::serialize(
|
||||
@@ -108,9 +180,23 @@ pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let elasped_time_compression = start_time_compression.elapsed();
|
||||
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
|
||||
(true, estimation, actual_compression, S::NAME)
|
||||
|
||||
let reader = R::open_from_bytes(&out).unwrap();
|
||||
let start_time_read = Instant::now();
|
||||
for doc in 0..data.len() {
|
||||
reader.get_u64(doc as u64, &out);
|
||||
}
|
||||
let elapsed_time_read = start_time_read.elapsed();
|
||||
(
|
||||
true,
|
||||
estimation,
|
||||
actual_compression,
|
||||
S::NAME,
|
||||
elasped_time_compression,
|
||||
elapsed_time_read,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||
|
||||
375
fastfield_codecs/src/multilinearinterpol_v2.rs
Normal file
375
fastfield_codecs/src/multilinearinterpol_v2.rs
Normal file
@@ -0,0 +1,375 @@
|
||||
/*!
|
||||
|
||||
MultiLinearInterpolV2 compressor uses blockwise linear interpolation to guess values and stores the difference between the actual value
|
||||
and the one given by the linear interpolation.
|
||||
This is done for every block of 512 values. For every block, our linear function can be expressed as
|
||||
`computed_value = slope * block_position + first_value + positive_offset`
|
||||
where:
|
||||
- `block_position` is the position inside of the block from 0 to 511
|
||||
- `first_value` is the first value on the block
|
||||
- `positive_offset` is computed such that we ensure the diff `real_value - computed_value` is always positive.
|
||||
|
||||
21 bytes is needed to store the block metadata, it adds an overhead of 21 * 8 / 512 = 0,33 bits per element.
|
||||
|
||||
*/
|
||||
|
||||
use crate::FastFieldCodecReader;
|
||||
use crate::FastFieldCodecSerializer;
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldStats;
|
||||
use std::io::{self, Read, Write};
|
||||
use std::ops::Sub;
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use common::DeserializeFrom;
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
|
||||
const BLOCK_SIZE: u64 = 512;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct MultiLinearInterpolV2FastFieldReader {
|
||||
num_vals: u64,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
block_readers: Vec<BlockReader>,
|
||||
}
|
||||
|
||||
/// Block metadata needed to define the linear function `y = a.x + b`
|
||||
/// and to bitpack the difference between the real value and the
|
||||
/// the linear function computed value where:
|
||||
/// - `a` is the `slope`
|
||||
/// - `b` is the sum of the `first_value` in the block + an offset
|
||||
/// `positive_offset` which ensures that difference between the real
|
||||
/// value and the linear function computed value is always positive.
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct BlockMetadata {
|
||||
first_value: u64,
|
||||
positive_offset: u64,
|
||||
slope: f32,
|
||||
num_bits: u8,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct BlockReader {
|
||||
metadata: BlockMetadata,
|
||||
start_offset: u64,
|
||||
bit_unpacker: BitUnpacker,
|
||||
}
|
||||
|
||||
impl BlockReader {
|
||||
fn new(metadata: BlockMetadata, start_offset: u64) -> Self {
|
||||
Self {
|
||||
bit_unpacker: BitUnpacker::new(metadata.num_bits),
|
||||
metadata,
|
||||
start_offset,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, block_pos: u64, data: &[u8]) -> u64 {
|
||||
let diff = self.bit_unpacker.get(block_pos, &data[self.start_offset as usize..]);
|
||||
let computed_value = get_computed_value(self.metadata.first_value, block_pos, self.metadata.slope);
|
||||
(computed_value + diff) - self.metadata.positive_offset
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for BlockMetadata {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
self.first_value.serialize(write)?;
|
||||
self.positive_offset.serialize(write)?;
|
||||
self.slope.serialize(write)?;
|
||||
self.num_bits.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let constant = u64::deserialize(reader)?;
|
||||
let constant_positive_offset = u64::deserialize(reader)?;
|
||||
let slope = f32::deserialize(reader)?;
|
||||
let num_bits = u8::deserialize(reader)?;
|
||||
Ok(Self {
|
||||
first_value: constant,
|
||||
positive_offset: constant_positive_offset,
|
||||
slope,
|
||||
num_bits,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct MultiLinearInterpolV2Footer {
|
||||
pub num_vals: u64,
|
||||
pub min_value: u64,
|
||||
pub max_value: u64,
|
||||
block_metadatas: Vec<BlockMetadata>,
|
||||
}
|
||||
|
||||
impl BinarySerializable for MultiLinearInterpolV2Footer {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
let mut out = vec![];
|
||||
self.num_vals.serialize(&mut out)?;
|
||||
self.min_value.serialize(&mut out)?;
|
||||
self.max_value.serialize(&mut out)?;
|
||||
self.block_metadatas.serialize(&mut out)?;
|
||||
write.write_all(&out)?;
|
||||
(out.len() as u32).serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let footer = Self {
|
||||
num_vals: u64::deserialize(reader)?,
|
||||
min_value: u64::deserialize(reader)?,
|
||||
max_value: u64::deserialize(reader)?,
|
||||
block_metadatas: Vec::<BlockMetadata>::deserialize(reader)?,
|
||||
};
|
||||
Ok(footer)
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldCodecReader for MultiLinearInterpolV2FastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
|
||||
let (_, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
|
||||
let footer = MultiLinearInterpolV2Footer::deserialize(&mut footer)?;
|
||||
let mut block_readers = Vec::with_capacity(footer.block_metadatas.len());
|
||||
let mut current_data_offset = 0;
|
||||
for block_metadata in footer.block_metadatas.into_iter() {
|
||||
let num_bits = block_metadata.num_bits;
|
||||
block_readers.push(BlockReader::new(block_metadata, current_data_offset));
|
||||
current_data_offset += num_bits as u64 * BLOCK_SIZE / 8;
|
||||
}
|
||||
Ok(Self {
|
||||
num_vals: footer.num_vals,
|
||||
min_value: footer.min_value,
|
||||
max_value: footer.max_value,
|
||||
block_readers
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
let block_idx = (doc / BLOCK_SIZE) as usize;
|
||||
let block_pos = doc - (block_idx as u64) * BLOCK_SIZE;
|
||||
let block_reader = &self.block_readers[block_idx];
|
||||
block_reader.get_u64(block_pos, data)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn min_value(&self) -> u64 {
|
||||
self.min_value
|
||||
}
|
||||
#[inline]
|
||||
fn max_value(&self) -> u64 {
|
||||
self.max_value
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_computed_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
||||
(first_val as i64 + (pos as f32 * slope) as i64) as u64
|
||||
}
|
||||
|
||||
/// Same as LinearInterpolFastFieldSerializer, but working on chunks of CHUNK_SIZE elements.
|
||||
pub struct MultiLinearInterpolV2FastFieldSerializer {}
|
||||
|
||||
impl FastFieldCodecSerializer for MultiLinearInterpolV2FastFieldSerializer {
|
||||
const NAME: &'static str = "MultiLinearInterpolV2";
|
||||
const ID: u8 = 4;
|
||||
/// Creates a new fast field serializer.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
_: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
_data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
let mut data = data_iter.collect::<Vec<_>>();
|
||||
let mut bit_packer = BitPacker::new();
|
||||
let mut block_metadatas = Vec::new();
|
||||
for data_pos in (0..data.len() as u64).step_by(BLOCK_SIZE as usize) {
|
||||
let block_num_vals = BLOCK_SIZE.min(data.len() as u64 - data_pos) as usize;
|
||||
let block_values = &mut data[data_pos as usize..data_pos as usize + block_num_vals];
|
||||
let slope = if block_num_vals == 1 {
|
||||
0f32
|
||||
} else {
|
||||
((block_values[block_values.len() - 1] as f64 - block_values[0] as f64) / (block_num_vals - 1) as f64) as f32
|
||||
};
|
||||
let first_value = block_values[0];
|
||||
let mut positive_offset = 0;
|
||||
let mut max_delta = 0;
|
||||
for (pos, ¤t_value) in block_values[1..]
|
||||
.iter()
|
||||
.enumerate()
|
||||
{
|
||||
let computed_value = get_computed_value(first_value, pos as u64 + 1, slope);
|
||||
if computed_value > current_value {
|
||||
positive_offset = positive_offset.max(computed_value - current_value);
|
||||
} else {
|
||||
max_delta = max_delta.max(current_value - computed_value);
|
||||
}
|
||||
}
|
||||
let num_bits = compute_num_bits(max_delta + positive_offset);
|
||||
for (pos, current_value) in block_values.iter().enumerate()
|
||||
{
|
||||
let computed_value = get_computed_value(first_value, pos as u64, slope);
|
||||
let diff = (current_value + positive_offset) - computed_value;
|
||||
bit_packer.write(diff, num_bits, write)?;
|
||||
}
|
||||
bit_packer.flush(write)?;
|
||||
block_metadatas.push(BlockMetadata {
|
||||
first_value,
|
||||
positive_offset,
|
||||
slope,
|
||||
num_bits,
|
||||
});
|
||||
}
|
||||
bit_packer.close(write)?;
|
||||
|
||||
let footer = MultiLinearInterpolV2Footer {
|
||||
num_vals: stats.num_vals,
|
||||
min_value: stats.min_value,
|
||||
max_value: stats.max_value,
|
||||
block_metadatas,
|
||||
};
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> bool {
|
||||
if stats.num_vals < 10 * BLOCK_SIZE {
|
||||
return false;
|
||||
}
|
||||
// On serialization the offset is added to the actual value.
|
||||
// We need to make sure this won't run into overflow calculation issues.
|
||||
// For this we take the maximum theroretical offset and add this to the max value.
|
||||
// If this doesn't overflow the algortihm should be fine
|
||||
let theorethical_maximum_offset = stats.max_value - stats.min_value;
|
||||
if stats
|
||||
.max_value
|
||||
.checked_add(theorethical_maximum_offset)
|
||||
.is_none()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
/// Estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima are for the deviation of the calculated value and
|
||||
/// the offset is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
let first_val_in_first_block = fastfield_accessor.get_val(0);
|
||||
let last_elem_in_first_chunk = BLOCK_SIZE.min(stats.num_vals);
|
||||
let last_val_in_first_block =
|
||||
fastfield_accessor.get_val(last_elem_in_first_chunk as u64 - 1);
|
||||
let slope = ((last_val_in_first_block as f64 - first_val_in_first_block as f64) / (stats.num_vals - 1) as f64) as f32;
|
||||
|
||||
// let's sample at 0%, 5%, 10% .. 95%, 100%, but for the first block only
|
||||
let sample_positions = (0..20)
|
||||
.map(|pos| (last_elem_in_first_chunk as f32 / 100.0 * pos as f32 * 5.0) as usize)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let max_distance = sample_positions
|
||||
.iter()
|
||||
.map(|&pos| {
|
||||
let calculated_value =
|
||||
get_computed_value(first_val_in_first_block, pos as u64, slope);
|
||||
let actual_value = fastfield_accessor.get_val(pos as u64);
|
||||
distance(calculated_value, actual_value)
|
||||
})
|
||||
.max()
|
||||
.unwrap();
|
||||
|
||||
// Estimate one block and extrapolate the cost to all blocks.
|
||||
// the theory would be that we don't have the actual max_distance, but we are close within 50%
|
||||
// threshold.
|
||||
// It is multiplied by 2 because in a log case scenario the line would be as much above as
|
||||
// below. So the offset would = max_distance
|
||||
let relative_max_value = (max_distance as f32 * 1.5) * 2.0;
|
||||
|
||||
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
|
||||
// function metadata per block
|
||||
+ 21 * (stats.num_vals / BLOCK_SIZE);
|
||||
let num_bits_uncompressed = 64 * stats.num_vals;
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
}
|
||||
}
|
||||
|
||||
fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
|
||||
if x < y {
|
||||
y - x
|
||||
} else {
|
||||
x - y
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
|
||||
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
|
||||
crate::tests::create_and_validate::<
|
||||
MultiLinearInterpolV2FastFieldSerializer,
|
||||
MultiLinearInterpolV2FastFieldReader,
|
||||
>(data, name)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compression() {
|
||||
let data = (10..=6_000_u64).collect::<Vec<_>>();
|
||||
let (estimate, actual_compression) =
|
||||
create_and_validate(&data, "simple monotonically large");
|
||||
assert!(actual_compression < 0.2);
|
||||
assert!(estimate < 0.20);
|
||||
assert!(estimate > 0.15);
|
||||
assert!(actual_compression > 0.001);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_with_codec_data_sets() {
|
||||
let data_sets = get_codec_test_data_sets();
|
||||
for (mut data, name) in data_sets {
|
||||
create_and_validate(&data, name);
|
||||
data.reverse();
|
||||
create_and_validate(&data, name);
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn test_simple() {
|
||||
let data = (10..=20_u64).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "simple monotonically");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn border_cases_1() {
|
||||
let data = (0..1024).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "border case");
|
||||
}
|
||||
#[test]
|
||||
fn border_case_2() {
|
||||
let data = (0..1025).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "border case");
|
||||
}
|
||||
#[test]
|
||||
fn rand() {
|
||||
for _ in 0..10 {
|
||||
let mut data = (5_000..20_000)
|
||||
.map(|_| rand::random::<u32>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
let (estimate, actual_compression) = create_and_validate(&data, "random");
|
||||
dbg!(estimate);
|
||||
dbg!(actual_compression);
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data, "random");
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user