diff --git a/fastfield_codecs/Cargo.toml b/fastfield_codecs/Cargo.toml index 0626d4789..7f4f453a2 100644 --- a/fastfield_codecs/Cargo.toml +++ b/fastfield_codecs/Cargo.toml @@ -11,7 +11,15 @@ description = "Fast field codecs used by tantivy" [dependencies] common = { path = "../common/" } tantivy-bitpacker = { path = "../bitpacker/" } +prettytable-rs = {version="0.8.0", optional= true} +#prettytable-rs = {version="0.8.0" } +rand = "0.8.3" [dev-dependencies] more-asserts = "0.2.1" rand = "0.8.3" + +[features] +bin = ["prettytable-rs"] +#default = ["bin"] + diff --git a/fastfield_codecs/README.md b/fastfield_codecs/README.md index 3dfe08a36..8f292daba 100644 --- a/fastfield_codecs/README.md +++ b/fastfield_codecs/README.md @@ -8,10 +8,55 @@ This crate contains various fast field codecs, used to compress/decompress fast Contributing is pretty straightforward. Since the bitpacking is the simplest compressor, you can check it for reference. -A codec needs to implement 3 parts: +A codec needs to implement 2 traits: -A reader implementing `CodecReader` to read the codec. -A serializer implementing `FastFieldSerializerEstimate` for compression estimation. -`CodecId`, to identify the codec. +A reader implementing `FastFieldCodecReader` to read the codec. +A serializer implementing `FastFieldCodecSerializer` for compression estimation and codec name + id. + +Once the traits are implemented, test and benchmark integration is pretty easy (see `test_with_codec_data_sets` and `bench.rs`). + +Make sure to add the codec to `main.rs`, which tests the compression ratio and estimation against different data sets. 
You can run it with: +``` +cargo run --features bin +``` +Example Result +``` ++----------------------------------+-------------------+------------------------+ +| | Compression Ratio | Compression Estimation | ++----------------------------------+-------------------+------------------------+ +| Autoincrement | | | ++----------------------------------+-------------------+------------------------+ +| LinearInterpol | 0.000039572664 | 0.000004396963 | ++----------------------------------+-------------------+------------------------+ +| MultiLinearInterpol | 0.1477348 | 0.17275847 | ++----------------------------------+-------------------+------------------------+ +| Bitpacked | 0.28126493 | 0.28125 | ++----------------------------------+-------------------+------------------------+ +| Monotonically increasing concave | | | ++----------------------------------+-------------------+------------------------+ +| LinearInterpol | 0.25003937 | 0.26562938 | ++----------------------------------+-------------------+------------------------+ +| MultiLinearInterpol | 0.190665 | 0.1883836 | ++----------------------------------+-------------------+------------------------+ +| Bitpacked | 0.31251436 | 0.3125 | ++----------------------------------+-------------------+------------------------+ +| Monotonically increasing convex | | | ++----------------------------------+-------------------+------------------------+ +| LinearInterpol | 0.25003937 | 0.28125438 | ++----------------------------------+-------------------+------------------------+ +| MultiLinearInterpol | 0.18676 | 0.2040086 | ++----------------------------------+-------------------+------------------------+ +| Bitpacked | 0.31251436 | 0.3125 | ++----------------------------------+-------------------+------------------------+ +| Almost monotonically increasing | | | ++----------------------------------+-------------------+------------------------+ +| LinearInterpol | 0.14066513 | 0.1562544 | 
++----------------------------------+-------------------+------------------------+ +| MultiLinearInterpol | 0.16335973 | 0.17275847 | ++----------------------------------+-------------------+------------------------+ +| Bitpacked | 0.28126493 | 0.28125 | ++----------------------------------+-------------------+------------------------+ + +``` diff --git a/fastfield_codecs/benches/bench.rs b/fastfield_codecs/benches/bench.rs index 68b9af9ac..0dfad1e7c 100644 --- a/fastfield_codecs/benches/bench.rs +++ b/fastfield_codecs/benches/bench.rs @@ -8,7 +8,7 @@ mod tests { bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer}, linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer}, multilinearinterpol::{ - MultiLinearInterpolFastFieldSerializer, MultiLinearinterpolFastFieldReader, + MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer, }, *, }; @@ -92,7 +92,7 @@ mod tests { #[bench] fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) { let data: Vec<_> = get_data(); - bench_get::( + bench_get::( b, &data, ); } diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs new file mode 100644 index 000000000..ae759468e --- /dev/null +++ b/fastfield_codecs/src/main.rs @@ -0,0 +1,123 @@ +#[macro_use] +extern crate prettytable; +use fastfield_codecs::{ + linearinterpol::LinearInterpolFastFieldSerializer, + multilinearinterpol::MultiLinearInterpolFastFieldSerializer, FastFieldCodecSerializer, + FastFieldStats, +}; +use prettytable::{Cell, Row, Table}; + +fn main() { + let mut table = Table::new(); + + // Add a row per time + table.add_row(row!["", "Compression Ratio", "Compression Estimation"]); + + for (data, data_set_name) in get_codec_test_data_sets() { + let mut results = vec![]; + let res = serialize_with_codec::(&data); + results.push(res); + let res = serialize_with_codec::(&data); + results.push(res); + let res = serialize_with_codec::( + &data, + ); + results.push(res); + + //let 
best_estimation_codec = results + //.iter() + //.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap()) + //.unwrap(); + let best_compression_ratio_codec = results + .iter() + .min_by(|res1, res2| res1.partial_cmp(&res2).unwrap()) + .cloned() + .unwrap(); + + table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")])); + for (est, comp, name) in results { + let (est_cell, ratio_cell) = if est == f32::MAX { + ("Codec Disabled".to_string(), "".to_string()) + } else { + (est.to_string(), comp.to_string()) + }; + let style = if comp == best_compression_ratio_codec.1 { + "Fb" + } else { + "" + }; + + table.add_row(Row::new(vec![ + Cell::new(&name.to_string()).style_spec("bFg"), + Cell::new(&ratio_cell).style_spec(style), + Cell::new(&est_cell).style_spec(""), + ])); + } + } + + table.printstd(); +} + +pub fn get_codec_test_data_sets() -> Vec<(Vec, &'static str)> { + let mut data_and_names = vec![]; + + let data = (1000..=200_000_u64).collect::>(); + data_and_names.push((data, "Autoincrement")); + + let mut current_cumulative = 0; + let data = (1..=200_000_u64) + .map(|num| { + let num = (num as f32 + num as f32).log10() as u64; + current_cumulative += num; + current_cumulative + }) + .collect::>(); + //let data = (1..=200000_u64).map(|num| num + num).collect::>(); + data_and_names.push((data, "Monotonically increasing concave")); + + let mut current_cumulative = 0; + let data = (1..=200_000_u64) + .map(|num| { + let num = (200_000.0 - num as f32).log10() as u64; + current_cumulative += num; + current_cumulative + }) + .collect::>(); + data_and_names.push((data, "Monotonically increasing convex")); + + let data = (1000..=200_000_u64) + .map(|num| num + rand::random::() as u64) + .collect::>(); + data_and_names.push((data, "Almost monotonically increasing")); + + data_and_names +} + +pub fn serialize_with_codec(data: &[u64]) -> (f32, f32, &'static str) { + let estimation = S::estimate(&data, stats_from_vec(&data)); + if estimation == f32::MAX { + return 
(estimation, 0.0, S::NAME); + } + let mut out = vec![]; + S::create( + &mut out, + &data, + stats_from_vec(&data), + data.iter().cloned(), + data.iter().cloned(), + ) + .unwrap(); + + let actual_compression = out.len() as f32 / (data.len() * 8) as f32; + return (estimation, actual_compression, S::NAME); +} + +pub fn stats_from_vec(data: &[u64]) -> FastFieldStats { + let min_value = data.iter().cloned().min().unwrap_or(0); + let max_value = data.iter().cloned().max().unwrap_or(0); + FastFieldStats { + min_value, + max_value, + num_vals: data.len() as u64, + } +} diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 525a8869f..70942d190 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -12,8 +12,8 @@ use fastfield_codecs::bitpacked::BitpackedFastFieldReader as BitpackedReader; use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer; use fastfield_codecs::linearinterpol::LinearInterpolFastFieldReader; use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer; +use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldReader; use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer; -use fastfield_codecs::multilinearinterpol::MultiLinearinterpolFastFieldReader; use fastfield_codecs::FastFieldCodecReader; use fastfield_codecs::FastFieldCodecSerializer; use std::collections::HashMap; @@ -71,7 +71,7 @@ pub enum DynamicFastFieldReader { /// Linear interpolated values + bitpacked LinearInterpol(FastFieldReaderCodecWrapper), /// Blockwise linear interpolated values + bitpacked - MultiLinearInterpol(FastFieldReaderCodecWrapper), + MultiLinearInterpol(FastFieldReaderCodecWrapper), } impl DynamicFastFieldReader { @@ -96,7 +96,7 @@ impl DynamicFastFieldReader { MultiLinearInterpolFastFieldSerializer::ID => { DynamicFastFieldReader::MultiLinearInterpol(FastFieldReaderCodecWrapper::< Item, - MultiLinearinterpolFastFieldReader, + MultiLinearInterpolFastFieldReader, >::open_from_bytes( bytes 
)?) diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs index e1644920b..6d4f0455b 100644 --- a/src/fastfield/serializer/mod.rs +++ b/src/fastfield/serializer/mod.rs @@ -3,8 +3,6 @@ use crate::common::CompositeWrite; use crate::common::CountingWriter; use crate::directory::WritePtr; use crate::schema::Field; -use fastfield_codecs::CodecId; -//pub use bitpacked::BitpackedFastFieldSerializer; pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer; pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializerLegacy; use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;