add is_applicable to fast field codecs

This commit is contained in:
Pascal Seitz
2021-06-14 16:16:25 +02:00
parent 0a534c6ee0
commit c889ae10e4
6 changed files with 80 additions and 53 deletions

View File

@@ -126,6 +126,12 @@ impl FastFieldCodecSerializer for BitpackedFastFieldSerializer {
Ok(())
}
/// Reports whether this codec can be used for the given fast field data.
///
/// Both arguments are ignored and the result is always `true`, i.e. this
/// codec is never disabled by the applicability check.
fn is_applicable(
_fastfield_accessor: &impl FastFieldDataAccess,
_stats: FastFieldStats,
) -> bool {
true
}
fn estimate(_fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
let amplitude = stats.max_value - stats.min_value;
let num_bits = compute_num_bits(amplitude);

View File

@@ -27,8 +27,10 @@ pub trait FastFieldCodecSerializer {
const NAME: &'static str;
const ID: u8;
/// Returns an estimate of the compression ratio. if the compressor is unable to handle the
/// data it needs to return f32::MAX.
/// Checks whether the codec is able to compress the data.
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> bool;
/// Returns an estimate of the compression ratio.
/// The baseline is uncompressed 64bit data.
///
/// It could make sense to also return a value representing
@@ -92,10 +94,10 @@ mod tests {
data: &[u64],
name: &str,
) -> (f32, f32) {
let estimation = S::estimate(&data, crate::tests::stats_from_vec(&data));
if estimation == f32::MAX {
return (estimation, 0.0);
if !S::is_applicable(&data, crate::tests::stats_from_vec(&data)) {
return (f32::MAX, 0.0);
}
let estimation = S::estimate(&data, crate::tests::stats_from_vec(&data));
let mut out = vec![];
S::serialize(
&mut out,

View File

@@ -166,12 +166,12 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
footer.serialize(write)?;
Ok(())
}
/// estimation for linear interpolation is hard because, you don't know
/// where the local maxima for the deviation of the calculated value are and
/// the offset to shift all values to >=0 is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
fn is_applicable(
_fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
) -> bool {
if stats.num_vals < 3 {
return f32::MAX; //disable compressor for this case
return false; //disable compressor for this case
}
// On serialisation the offset is added to the actual value.
// We need to make sure this won't run into overflow calculation issues.
@@ -183,9 +183,14 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
.checked_add(theorethical_maximum_offset)
.is_none()
{
return f32::MAX;
return false;
}
true
}
/// Estimation for linear interpolation is hard because you don't know
/// where the local maxima for the deviation of the calculated value are, and
/// the offset to shift all values to >=0 is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
let first_val = fastfield_accessor.get(0);
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
let slope = get_slope(first_val, last_val, stats.num_vals);

View File

@@ -35,8 +35,8 @@ fn main() {
.unwrap();
table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
for (est, comp, name) in results {
let (est_cell, ratio_cell) = if est == f32::MAX {
for (is_applicable, est, comp, name) in results {
let (est_cell, ratio_cell) = if !is_applicable {
("Codec Disabled".to_string(), "".to_string())
} else {
(est.to_string(), comp.to_string())
@@ -93,11 +93,14 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
data_and_names
}
pub fn serialize_with_codec<S: FastFieldCodecSerializer>(data: &[u64]) -> (f32, f32, &'static str) {
let estimation = S::estimate(&data, stats_from_vec(&data));
if estimation == f32::MAX {
return (estimation, 0.0, S::NAME);
pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
data: &[u64],
) -> (bool, f32, f32, &'static str) {
let is_applicable = S::is_applicable(&data, stats_from_vec(&data));
if !is_applicable {
return (false, 0.0, 0.0, S::NAME);
}
let estimation = S::estimate(&data, stats_from_vec(&data));
let mut out = vec![];
S::serialize(
&mut out,
@@ -109,7 +112,7 @@ pub fn serialize_with_codec<S: FastFieldCodecSerializer>(data: &[u64]) -> (f32,
.unwrap();
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
return (estimation, actual_compression, S::NAME);
return (true, estimation, actual_compression, S::NAME);
}
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {

View File

@@ -310,12 +310,12 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
Ok(())
}
/// estimation for linear interpolation is hard because, you don't know
/// where the local maxima are for the deviation of the calculated value and
/// the offset is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
fn is_applicable(
_fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
) -> bool {
if stats.num_vals < 5_000 {
return f32::MAX;
return false;
}
// On serialization the offset is added to the actual value.
// We need to make sure this won't run into overflow calculation issues.
@@ -327,9 +327,14 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
.checked_add(theorethical_maximum_offset)
.is_none()
{
return f32::MAX;
return false;
}
true
}
/// Estimation for linear interpolation is hard because you don't know
/// where the local maxima are for the deviation of the calculated value, and
/// the offset is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
let first_val_in_first_block = fastfield_accessor.get(0);
let last_elem_in_first_chunk = CHUNK_SIZE.min(stats.num_vals);
let last_val_in_first_block = fastfield_accessor.get(last_elem_in_first_chunk as u32 - 1);

View File

@@ -36,6 +36,24 @@ pub struct CompositeFastFieldSerializer {
composite_write: CompositeWrite<WritePtr>,
}
/// Appends the estimate of codec `T` to `estimations`, provided the codec is applicable.
///
/// If `T::is_applicable` rejects the data, `estimations` is left untouched; otherwise
/// the tuple `(T::estimate(..), T::NAME, T::ID)` is pushed.
///
/// `A` is an explicit type parameter rather than `&impl FastFieldDataAccess` so that callers
/// can name the codec via turbofish (`codec_estimation::<T, _>(..)`). Revisit this once
/// `explicit_generic_args_with_impl_trait` is merged and stabilized:
/// https://github.com/rust-lang/rust/pull/86176
fn codec_estimation<T: FastFieldCodecSerializer, A: FastFieldDataAccess>(
    stats: FastFieldStats,
    fastfield_accessor: &A,
    estimations: &mut Vec<(f32, &str, u8)>,
) {
    if T::is_applicable(fastfield_accessor, stats.clone()) {
        // `stats` is owned and this is its last use, so it is moved rather than cloned again.
        let ratio = T::estimate(fastfield_accessor, stats);
        estimations.push((ratio, T::NAME, T::ID));
    }
}
impl CompositeFastFieldSerializer {
/// Constructor
pub fn from_write(write: WritePtr) -> io::Result<CompositeFastFieldSerializer> {
@@ -57,33 +75,21 @@ impl CompositeFastFieldSerializer {
let mut estimations = vec![];
{
let (ratio, name, id) = (
BitpackedFastFieldSerializer::estimate(&fastfield_accessor, stats.clone()),
BitpackedFastFieldSerializer::NAME,
BitpackedFastFieldSerializer::ID,
);
estimations.push((ratio, name, id));
}
{
let (ratio, name, id) = (
LinearInterpolFastFieldSerializer::estimate(&fastfield_accessor, stats.clone()),
LinearInterpolFastFieldSerializer::NAME,
LinearInterpolFastFieldSerializer::ID,
);
estimations.push((ratio, name, id));
}
{
let (ratio, name, id) = (
MultiLinearInterpolFastFieldSerializer::estimate(
&fastfield_accessor,
stats.clone(),
),
MultiLinearInterpolFastFieldSerializer::NAME,
MultiLinearInterpolFastFieldSerializer::ID,
);
estimations.push((ratio, name, id));
}
codec_estimation::<BitpackedFastFieldSerializer, _>(
stats.clone(),
&fastfield_accessor,
&mut estimations,
);
codec_estimation::<LinearInterpolFastFieldSerializer, _>(
stats.clone(),
&fastfield_accessor,
&mut estimations,
);
codec_estimation::<MultiLinearInterpolFastFieldSerializer, _>(
stats.clone(),
&fastfield_accessor,
&mut estimations,
);
if let Some(broken_estimation) = estimations
.iter()
.find(|estimation| estimation.0 == f32::NAN)