add contributing guidelines, add codec comparer binary

add contributing guidelines
add codec comparer binary to test codec compressions with different test data sets
This commit is contained in:
Pascal Seitz
2021-06-14 13:56:23 +02:00
parent 1d41b96d32
commit abb5624af2
6 changed files with 185 additions and 11 deletions

View File

@@ -11,7 +11,15 @@ description = "Fast field codecs used by tantivy"
[dependencies]
common = { path = "../common/" }
tantivy-bitpacker = { path = "../bitpacker/" }
prettytable-rs = {version="0.8.0", optional= true}
#prettytable-rs = {version="0.8.0" }
rand = "0.8.3"
[dev-dependencies]
more-asserts = "0.2.1"
rand = "0.8.3"
[features]
bin = ["prettytable-rs"]
#default = ["bin"]

View File

@@ -8,10 +8,55 @@ This crate contains various fast field codecs, used to compress/decompress fast
Contributing is pretty straightforward. Since the bitpacking is the simplest compressor, you can check it for reference.
A codec needs to implement 3 parts:
A codec needs to implement 2 traits:
A reader implementing `CodecReader` to read the codec.
A serializer implementing `FastFieldSerializerEstimate` for compression estimation.
`CodecId`, to identify the codec.
A reader implementing `FastFieldCodecReader` to read the codec.
A serializer implementing `FastFieldCodecSerializer` for compression estimation and codec name + id.
Once the traits are implemented, test and benchmark integration is pretty easy (see `test_with_codec_data_sets` and `bench.rs`).
Make sure to add the codec to `main.rs`, which tests the compression ratio and estimation against different data sets. You can run it with:
```
cargo run --features bin
```
Example Result
```
+----------------------------------+-------------------+------------------------+
| | Compression Ratio | Compression Estimation |
+----------------------------------+-------------------+------------------------+
| Autoincrement | | |
+----------------------------------+-------------------+------------------------+
| LinearInterpol | 0.000039572664 | 0.000004396963 |
+----------------------------------+-------------------+------------------------+
| MultiLinearInterpol | 0.1477348 | 0.17275847 |
+----------------------------------+-------------------+------------------------+
| Bitpacked | 0.28126493 | 0.28125 |
+----------------------------------+-------------------+------------------------+
| Monotonically increasing concave | | |
+----------------------------------+-------------------+------------------------+
| LinearInterpol | 0.25003937 | 0.26562938 |
+----------------------------------+-------------------+------------------------+
| MultiLinearInterpol | 0.190665 | 0.1883836 |
+----------------------------------+-------------------+------------------------+
| Bitpacked | 0.31251436 | 0.3125 |
+----------------------------------+-------------------+------------------------+
| Monotonically increasing convex | | |
+----------------------------------+-------------------+------------------------+
| LinearInterpol | 0.25003937 | 0.28125438 |
+----------------------------------+-------------------+------------------------+
| MultiLinearInterpol | 0.18676 | 0.2040086 |
+----------------------------------+-------------------+------------------------+
| Bitpacked | 0.31251436 | 0.3125 |
+----------------------------------+-------------------+------------------------+
| Almost monotonically increasing | | |
+----------------------------------+-------------------+------------------------+
| LinearInterpol | 0.14066513 | 0.1562544 |
+----------------------------------+-------------------+------------------------+
| MultiLinearInterpol | 0.16335973 | 0.17275847 |
+----------------------------------+-------------------+------------------------+
| Bitpacked | 0.28126493 | 0.28125 |
+----------------------------------+-------------------+------------------------+
```

View File

@@ -8,7 +8,7 @@ mod tests {
bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer},
linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer},
multilinearinterpol::{
MultiLinearInterpolFastFieldSerializer, MultiLinearinterpolFastFieldReader,
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
},
*,
};
@@ -92,7 +92,7 @@ mod tests {
#[bench]
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<MultiLinearInterpolFastFieldSerializer, MultiLinearinterpolFastFieldReader>(
bench_get::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>(
b, &data,
);
}

View File

@@ -0,0 +1,123 @@
#[macro_use]
extern crate prettytable;
use fastfield_codecs::{
linearinterpol::LinearInterpolFastFieldSerializer,
multilinearinterpol::MultiLinearInterpolFastFieldSerializer, FastFieldCodecSerializer,
FastFieldStats,
};
use prettytable::{Cell, Row, Table};
/// Compares the fast field codecs against a set of test data sets and prints
/// the outcome as a table.
///
/// For every data set a header row with its name is emitted, followed by one
/// row per codec showing the actual compression ratio and the estimated one.
/// The best (lowest) actual compression ratio among the enabled codecs of a
/// data set is highlighted.
fn main() {
    let mut table = Table::new();

    // Column headers; the first column holds the data set / codec name.
    table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);

    for (data, data_set_name) in get_codec_test_data_sets() {
        // Each result is `(estimation, actual_compression, codec_name)`.
        let results = vec![
            serialize_with_codec::<LinearInterpolFastFieldSerializer>(&data),
            serialize_with_codec::<MultiLinearInterpolFastFieldSerializer>(&data),
            serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedFastFieldSerializer>(
                &data,
            ),
        ];

        // Lowest *actual* compression ratio among the codecs that are not
        // disabled. An estimation of `f32::MAX` marks a disabled codec whose
        // ratio is reported as 0.0, so it has to be excluded from the search.
        // Comparing the whole tuples would order by estimation first and
        // therefore pick the best estimation instead of the best ratio.
        let best_compression_ratio = results
            .iter()
            .filter(|res| res.0 != f32::MAX)
            .map(|res| res.1)
            .min_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

        table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));

        for (est, comp, name) in results {
            let is_disabled = est == f32::MAX;
            let (est_cell, ratio_cell) = if is_disabled {
                ("Codec Disabled".to_string(), "".to_string())
            } else {
                (est.to_string(), comp.to_string())
            };
            // Only an enabled codec can be highlighted as the best one.
            let style = if !is_disabled && best_compression_ratio == Some(comp) {
                "Fb"
            } else {
                ""
            };
            table.add_row(Row::new(vec![
                Cell::new(name).style_spec("bFg"),
                Cell::new(&ratio_cell).style_spec(style),
                Cell::new(&est_cell).style_spec(""),
            ]));
        }
    }

    table.printstd();
}
/// Builds the data sets the codecs are compared on.
///
/// Returns a list of `(values, name)` pairs:
/// - the consecutive integers `1000..=200_000`,
/// - two running sums whose per-step increment is a truncated `log10`,
/// - the consecutive integers `1000..=200_000`, each offset by a random `u8`.
///
/// NOTE(review): in the "concave" set the increments grow with `num`, while in
/// the "convex" set they shrink — confirm the two labels are not swapped.
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
    let autoincrement: Vec<u64> = (1000..=200_000_u64).collect();

    // Running sum of floor(log10(2 * num)).
    let concave: Vec<u64> = (1..=200_000_u64)
        .scan(0_u64, |running_total, num| {
            let doubled = num as f32 + num as f32;
            *running_total += doubled.log10() as u64;
            Some(*running_total)
        })
        .collect();

    // Running sum of floor(log10(200_000 - num)); the final step takes the
    // logarithm of zero, which the float-to-integer cast turns into 0.
    let convex: Vec<u64> = (1..=200_000_u64)
        .scan(0_u64, |running_total, num| {
            let remaining = 200_000.0 - num as f32;
            *running_total += remaining.log10() as u64;
            Some(*running_total)
        })
        .collect();

    // Every value is shifted up by a random amount in `0..=255`.
    let almost_monotonic: Vec<u64> = (1000..=200_000_u64)
        .map(|num| num + rand::random::<u8>() as u64)
        .collect();

    vec![
        (autoincrement, "Autoincrement"),
        (concave, "Monotonically increasing concave"),
        (convex, "Monotonically increasing convex"),
        (almost_monotonic, "Almost monotonically increasing"),
    ]
}
/// Runs codec `S` on `data` and reports how well it compresses.
///
/// Returns `(estimation, actual_compression, codec_name)`, where
/// `actual_compression` is the serialized size in bytes divided by the size of
/// the input stored as 8 bytes per value.
///
/// When the codec's estimation is `f32::MAX` nothing is serialized and the
/// actual compression is reported as `0.0`.
///
/// # Panics
///
/// Panics if `S::create` returns an error.
pub fn serialize_with_codec<S: FastFieldCodecSerializer>(data: &[u64]) -> (f32, f32, &'static str) {
    let estimated_ratio = S::estimate(&data, stats_from_vec(&data));
    if estimated_ratio == f32::MAX {
        return (estimated_ratio, 0.0, S::NAME);
    }

    let mut serialized: Vec<u8> = Vec::new();
    let first_pass = data.iter().cloned();
    let second_pass = data.iter().cloned();
    S::create(
        &mut serialized,
        &data,
        stats_from_vec(&data),
        first_pass,
        second_pass,
    )
    .unwrap();

    let uncompressed_num_bytes = (data.len() * 8) as f32;
    let actual_ratio = serialized.len() as f32 / uncompressed_num_bytes;
    (estimated_ratio, actual_ratio, S::NAME)
}
/// Computes the `FastFieldStats` (minimum, maximum and number of values) of
/// `data`.
///
/// For an empty slice both the minimum and the maximum are reported as `0`.
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
    // Track the smallest and largest value in a single pass; `None` means no
    // value has been seen yet.
    let bounds = data.iter().cloned().fold(None, |seen, value| match seen {
        None => Some((value, value)),
        Some((lowest, highest)) => Some((value.min(lowest), value.max(highest))),
    });
    let (min_value, max_value) = bounds.unwrap_or((0, 0));

    FastFieldStats {
        min_value,
        max_value,
        num_vals: data.len() as u64,
    }
}

View File

@@ -12,8 +12,8 @@ use fastfield_codecs::bitpacked::BitpackedFastFieldReader as BitpackedReader;
use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldReader;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldReader;
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
use fastfield_codecs::multilinearinterpol::MultiLinearinterpolFastFieldReader;
use fastfield_codecs::FastFieldCodecReader;
use fastfield_codecs::FastFieldCodecSerializer;
use std::collections::HashMap;
@@ -71,7 +71,7 @@ pub enum DynamicFastFieldReader<Item: FastValue> {
/// Linear interpolated values + bitpacked
LinearInterpol(FastFieldReaderCodecWrapper<Item, LinearInterpolFastFieldReader>),
/// Blockwise linear interpolated values + bitpacked
MultiLinearInterpol(FastFieldReaderCodecWrapper<Item, MultiLinearinterpolFastFieldReader>),
MultiLinearInterpol(FastFieldReaderCodecWrapper<Item, MultiLinearInterpolFastFieldReader>),
}
impl<Item: FastValue> DynamicFastFieldReader<Item> {
@@ -96,7 +96,7 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
MultiLinearInterpolFastFieldSerializer::ID => {
DynamicFastFieldReader::MultiLinearInterpol(FastFieldReaderCodecWrapper::<
Item,
MultiLinearinterpolFastFieldReader,
MultiLinearInterpolFastFieldReader,
>::open_from_bytes(
bytes
)?)

View File

@@ -3,8 +3,6 @@ use crate::common::CompositeWrite;
use crate::common::CountingWriter;
use crate::directory::WritePtr;
use crate::schema::Field;
use fastfield_codecs::CodecId;
//pub use bitpacked::BitpackedFastFieldSerializer;
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializerLegacy;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;