mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-24 12:10:41 +00:00
add is_applicable to fast field codecs
This commit is contained in:
@@ -126,6 +126,12 @@ impl FastFieldCodecSerializer for BitpackedFastFieldSerializer {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
/// Reports whether this codec is able to compress the given fast field data.
///
/// Both arguments are unused here (hence the underscore-prefixed names): the
/// function returns `true` unconditionally, so this codec is never ruled out.
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
_stats: FastFieldStats,
|
||||
) -> bool {
|
||||
true
|
||||
}
|
||||
fn estimate(_fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
let amplitude = stats.max_value - stats.min_value;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
|
||||
@@ -27,8 +27,10 @@ pub trait FastFieldCodecSerializer {
|
||||
const NAME: &'static str;
|
||||
const ID: u8;
|
||||
|
||||
/// Returns an estimate of the compression ratio. if the compressor is unable to handle the
|
||||
/// data it needs to return f32::MAX.
|
||||
/// Check if the Codec is able to compress the data
|
||||
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> bool;
|
||||
|
||||
/// Returns an estimate of the compression ratio.
|
||||
/// The baseline is uncompressed 64bit data.
|
||||
///
|
||||
/// It could make sense to also return a value representing
|
||||
@@ -92,10 +94,10 @@ mod tests {
|
||||
data: &[u64],
|
||||
name: &str,
|
||||
) -> (f32, f32) {
|
||||
let estimation = S::estimate(&data, crate::tests::stats_from_vec(&data));
|
||||
if estimation == f32::MAX {
|
||||
return (estimation, 0.0);
|
||||
if !S::is_applicable(&data, crate::tests::stats_from_vec(&data)) {
|
||||
return (f32::MAX, 0.0);
|
||||
}
|
||||
let estimation = S::estimate(&data, crate::tests::stats_from_vec(&data));
|
||||
let mut out = vec![];
|
||||
S::serialize(
|
||||
&mut out,
|
||||
|
||||
@@ -166,12 +166,12 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima for the deviation of the calculated value are and
|
||||
/// the offset to shift all values to >=0 is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> bool {
|
||||
if stats.num_vals < 3 {
|
||||
return f32::MAX; //disable compressor for this case
|
||||
return false; //disable compressor for this case
|
||||
}
|
||||
// On serialisation the offset is added to the actual value.
|
||||
// We need to make sure this won't run into overflow calculation issues.
|
||||
@@ -183,9 +183,14 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
.checked_add(theorethical_maximum_offset)
|
||||
.is_none()
|
||||
{
|
||||
return f32::MAX;
|
||||
return false;
|
||||
}
|
||||
|
||||
true
|
||||
}
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima for the deviation of the calculated value are and
|
||||
/// the offset to shift all values to >=0 is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
let first_val = fastfield_accessor.get(0);
|
||||
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
|
||||
let slope = get_slope(first_val, last_val, stats.num_vals);
|
||||
|
||||
@@ -35,8 +35,8 @@ fn main() {
|
||||
.unwrap();
|
||||
|
||||
table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
|
||||
for (est, comp, name) in results {
|
||||
let (est_cell, ratio_cell) = if est == f32::MAX {
|
||||
for (is_applicable, est, comp, name) in results {
|
||||
let (est_cell, ratio_cell) = if !is_applicable {
|
||||
("Codec Disabled".to_string(), "".to_string())
|
||||
} else {
|
||||
(est.to_string(), comp.to_string())
|
||||
@@ -93,11 +93,14 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
data_and_names
|
||||
}
|
||||
|
||||
pub fn serialize_with_codec<S: FastFieldCodecSerializer>(data: &[u64]) -> (f32, f32, &'static str) {
|
||||
let estimation = S::estimate(&data, stats_from_vec(&data));
|
||||
if estimation == f32::MAX {
|
||||
return (estimation, 0.0, S::NAME);
|
||||
pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
|
||||
data: &[u64],
|
||||
) -> (bool, f32, f32, &'static str) {
|
||||
let is_applicable = S::is_applicable(&data, stats_from_vec(&data));
|
||||
if !is_applicable {
|
||||
return (false, 0.0, 0.0, S::NAME);
|
||||
}
|
||||
let estimation = S::estimate(&data, stats_from_vec(&data));
|
||||
let mut out = vec![];
|
||||
S::serialize(
|
||||
&mut out,
|
||||
@@ -109,7 +112,7 @@ pub fn serialize_with_codec<S: FastFieldCodecSerializer>(data: &[u64]) -> (f32,
|
||||
.unwrap();
|
||||
|
||||
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
|
||||
return (estimation, actual_compression, S::NAME);
|
||||
return (true, estimation, actual_compression, S::NAME);
|
||||
}
|
||||
|
||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||
|
||||
@@ -310,12 +310,12 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima are for the deviation of the calculated value and
|
||||
/// the offset is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> bool {
|
||||
if stats.num_vals < 5_000 {
|
||||
return f32::MAX;
|
||||
return false;
|
||||
}
|
||||
// On serialization the offset is added to the actual value.
|
||||
// We need to make sure this won't run into overflow calculation issues.
|
||||
@@ -327,9 +327,14 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
.checked_add(theorethical_maximum_offset)
|
||||
.is_none()
|
||||
{
|
||||
return f32::MAX;
|
||||
return false;
|
||||
}
|
||||
|
||||
true
|
||||
}
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima are for the deviation of the calculated value and
|
||||
/// the offset is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
let first_val_in_first_block = fastfield_accessor.get(0);
|
||||
let last_elem_in_first_chunk = CHUNK_SIZE.min(stats.num_vals);
|
||||
let last_val_in_first_block = fastfield_accessor.get(last_elem_in_first_chunk as u32 - 1);
|
||||
|
||||
@@ -36,6 +36,24 @@ pub struct CompositeFastFieldSerializer {
|
||||
composite_write: CompositeWrite<WritePtr>,
|
||||
}
|
||||
|
||||
// Once `explicit_generic_args_with_impl_trait` is merged and stabilized, use it here:
|
||||
// https://github.com/rust-lang/rust/pull/86176
|
||||
/// Records the compression estimate of codec `T` in `estimations`, but only
/// when the codec reports that it is applicable to the data.
///
/// * `stats` - statistics of the fast field; a clone is handed to each codec call.
/// * `fastfield_accessor` - accessor to the fast field values, forwarded to the codec.
/// * `estimations` - output list; receives one `(ratio, name, id)` tuple built from
///   `T::estimate`, `T::NAME` and `T::ID` if the codec is applicable, and is left
///   untouched otherwise.
fn codec_estimation<T: FastFieldCodecSerializer, A: FastFieldDataAccess>(
|
||||
stats: FastFieldStats,
|
||||
fastfield_accessor: &A,
|
||||
estimations: &mut Vec<(f32, &str, u8)>,
|
||||
) {
|
||||
// A codec that cannot handle this data contributes no estimate at all.
if !T::is_applicable(fastfield_accessor, stats.clone()) {
|
||||
return;
|
||||
}
|
||||
let (ratio, name, id) = (
|
||||
T::estimate(fastfield_accessor, stats.clone()),
|
||||
T::NAME,
|
||||
T::ID,
|
||||
);
|
||||
estimations.push((ratio, name, id));
|
||||
}
|
||||
|
||||
impl CompositeFastFieldSerializer {
|
||||
/// Constructor
|
||||
pub fn from_write(write: WritePtr) -> io::Result<CompositeFastFieldSerializer> {
|
||||
@@ -57,33 +75,21 @@ impl CompositeFastFieldSerializer {
|
||||
|
||||
let mut estimations = vec![];
|
||||
|
||||
{
|
||||
let (ratio, name, id) = (
|
||||
BitpackedFastFieldSerializer::estimate(&fastfield_accessor, stats.clone()),
|
||||
BitpackedFastFieldSerializer::NAME,
|
||||
BitpackedFastFieldSerializer::ID,
|
||||
);
|
||||
estimations.push((ratio, name, id));
|
||||
}
|
||||
{
|
||||
let (ratio, name, id) = (
|
||||
LinearInterpolFastFieldSerializer::estimate(&fastfield_accessor, stats.clone()),
|
||||
LinearInterpolFastFieldSerializer::NAME,
|
||||
LinearInterpolFastFieldSerializer::ID,
|
||||
);
|
||||
estimations.push((ratio, name, id));
|
||||
}
|
||||
{
|
||||
let (ratio, name, id) = (
|
||||
MultiLinearInterpolFastFieldSerializer::estimate(
|
||||
&fastfield_accessor,
|
||||
stats.clone(),
|
||||
),
|
||||
MultiLinearInterpolFastFieldSerializer::NAME,
|
||||
MultiLinearInterpolFastFieldSerializer::ID,
|
||||
);
|
||||
estimations.push((ratio, name, id));
|
||||
}
|
||||
codec_estimation::<BitpackedFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
codec_estimation::<LinearInterpolFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
codec_estimation::<MultiLinearInterpolFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
if let Some(broken_estimation) = estimations
|
||||
.iter()
|
||||
.find(|estimation| estimation.0 == f32::NAN)
|
||||
|
||||
Reference in New Issue
Block a user