mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-26 13:10:41 +00:00
Small refactoring estimate.
This commit is contained in:
@@ -141,21 +141,19 @@ impl FastFieldCodec for BitpackedCodec {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
fn is_applicable(_fastfield_accessor: &impl FastFieldDataAccess) -> bool {
|
||||
true
|
||||
}
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {
|
||||
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option<f32> {
|
||||
let amplitude = fastfield_accessor.max_value() - fastfield_accessor.min_value();
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let num_bits_uncompressed = 64;
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
Some(num_bits as f32 / num_bits_uncompressed as f32)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
use crate::tests::get_codec_test_datasets;
|
||||
|
||||
fn create_and_validate(data: &[u64], name: &str) {
|
||||
crate::tests::create_and_validate::<BitpackedCodec>(data, name);
|
||||
@@ -163,7 +161,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_with_codec_data_sets() {
|
||||
let data_sets = get_codec_test_data_sets();
|
||||
let data_sets = get_codec_test_datasets();
|
||||
for (mut data, name) in data_sets {
|
||||
create_and_validate(&data, name);
|
||||
data.reverse();
|
||||
|
||||
@@ -289,10 +289,14 @@ impl FastFieldCodec for BlockwiseLinearCodec {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool {
|
||||
if fastfield_accessor.num_vals() < 5_000 {
|
||||
return false;
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima are for the deviation of the calculated value and
|
||||
/// the offset is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option<f32> {
|
||||
if fastfield_accessor.num_vals() < 10 * CHUNK_SIZE {
|
||||
return None;
|
||||
}
|
||||
|
||||
// On serialization the offset is added to the actual value.
|
||||
// We need to make sure this won't run into overflow calculation issues.
|
||||
// For this we take the maximum theroretical offset and add this to the max value.
|
||||
@@ -304,14 +308,9 @@ impl FastFieldCodec for BlockwiseLinearCodec {
|
||||
.checked_add(theorethical_maximum_offset)
|
||||
.is_none()
|
||||
{
|
||||
return false;
|
||||
return None;
|
||||
}
|
||||
true
|
||||
}
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima are for the deviation of the calculated value and
|
||||
/// the offset is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {
|
||||
|
||||
let first_val_in_first_block = fastfield_accessor.get_val(0);
|
||||
let last_elem_in_first_chunk = CHUNK_SIZE.min(fastfield_accessor.num_vals());
|
||||
let last_val_in_first_block =
|
||||
@@ -350,7 +349,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
|
||||
// function metadata per block
|
||||
+ 29 * (fastfield_accessor.num_vals() / CHUNK_SIZE);
|
||||
let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
Some(num_bits as f32 / num_bits_uncompressed as f32)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -365,10 +364,10 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
use crate::tests::get_codec_test_datasets;
|
||||
|
||||
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
|
||||
crate::tests::create_and_validate::<BlockwiseLinearCodec, BlockwiseLinearReader>(data, name)
|
||||
fn create_and_validate(data: &[u64], name: &str) -> Option<(f32, f32)> {
|
||||
crate::tests::create_and_validate::<BlockwiseLinearCodec>(data, name)
|
||||
}
|
||||
|
||||
const HIGHEST_BIT: u64 = 1 << 63;
|
||||
@@ -382,7 +381,7 @@ mod tests {
|
||||
.map(i64_to_u64)
|
||||
.collect::<Vec<_>>();
|
||||
let (estimate, actual_compression) =
|
||||
create_and_validate(&data, "simple monotonically large i64");
|
||||
create_and_validate(&data, "simple monotonically large i64").unwrap();
|
||||
assert!(actual_compression < 0.2);
|
||||
assert!(estimate < 0.20);
|
||||
assert!(estimate > 0.15);
|
||||
@@ -393,7 +392,7 @@ mod tests {
|
||||
fn test_compression() {
|
||||
let data = (10..=6_000_u64).collect::<Vec<_>>();
|
||||
let (estimate, actual_compression) =
|
||||
create_and_validate(&data, "simple monotonically large");
|
||||
create_and_validate(&data, "simple monotonically large").unwrap();
|
||||
assert!(actual_compression < 0.2);
|
||||
assert!(estimate < 0.20);
|
||||
assert!(estimate > 0.15);
|
||||
@@ -402,7 +401,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_with_codec_data_sets() {
|
||||
let data_sets = get_codec_test_data_sets();
|
||||
let data_sets = get_codec_test_datasets();
|
||||
for (mut data, name) in data_sets {
|
||||
create_and_validate(&data, name);
|
||||
data.reverse();
|
||||
|
||||
@@ -82,15 +82,14 @@ pub trait FastFieldCodec {
|
||||
fastfield_accessor: &dyn FastFieldDataAccess,
|
||||
) -> io::Result<()>;
|
||||
|
||||
/// Check if the Codec is able to compress the data
|
||||
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool;
|
||||
|
||||
/// Returns an estimate of the compression ratio.
|
||||
/// If the codec is not applicable, returns `None`.
|
||||
///
|
||||
/// The baseline is uncompressed 64bit data.
|
||||
///
|
||||
/// It could make sense to also return a value representing
|
||||
/// computational complexity.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32;
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option<f32>;
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -152,11 +151,12 @@ mod tests {
|
||||
use crate::blockwise_linear::BlockwiseLinearCodec;
|
||||
use crate::linear::LinearCodec;
|
||||
|
||||
pub fn create_and_validate<Codec: FastFieldCodec>(data: &[u64], name: &str) -> (f32, f32) {
|
||||
if !Codec::is_applicable(&data) {
|
||||
return (f32::MAX, 0.0);
|
||||
}
|
||||
let estimation = Codec::estimate(&data);
|
||||
pub fn create_and_validate<Codec: FastFieldCodec>(
|
||||
data: &[u64],
|
||||
name: &str,
|
||||
) -> Option<(f32, f32)> {
|
||||
let estimation = Codec::estimate(&data)?;
|
||||
|
||||
let mut out: Vec<u8> = Vec::new();
|
||||
Codec::serialize(&mut out, &data).unwrap();
|
||||
|
||||
@@ -164,16 +164,15 @@ mod tests {
|
||||
|
||||
let reader = Codec::open_from_bytes(OwnedBytes::new(out)).unwrap();
|
||||
assert_eq!(reader.num_vals(), data.len() as u64);
|
||||
for (doc, orig_val) in data.iter().enumerate() {
|
||||
for (doc, orig_val) in data.iter().copied().enumerate() {
|
||||
let val = reader.get_val(doc as u64);
|
||||
if val != *orig_val {
|
||||
panic!(
|
||||
"val {val:?} does not match orig_val {orig_val:?}, in data set {name}, data \
|
||||
{data:?}",
|
||||
);
|
||||
}
|
||||
assert_eq!(
|
||||
val, orig_val,
|
||||
"val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data \
|
||||
`{data:?}`",
|
||||
);
|
||||
}
|
||||
(estimation, actual_compression)
|
||||
Some((estimation, actual_compression))
|
||||
}
|
||||
|
||||
proptest! {
|
||||
@@ -193,10 +192,10 @@ mod tests {
|
||||
|
||||
}
|
||||
|
||||
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
let mut data_and_names = vec![];
|
||||
|
||||
let data = (10..=20_u64).collect::<Vec<_>>();
|
||||
let data = (10..=10_000_u64).collect::<Vec<_>>();
|
||||
data_and_names.push((data, "simple monotonically increasing"));
|
||||
|
||||
data_and_names.push((
|
||||
@@ -211,12 +210,13 @@ mod tests {
|
||||
|
||||
fn test_codec<C: FastFieldCodec>() {
|
||||
let codec_name = format!("{:?}", C::CODEC_TYPE);
|
||||
for (data, dataset_name) in get_codec_test_data_sets() {
|
||||
let (estimate, actual) = crate::tests::create_and_validate::<C>(&data, dataset_name);
|
||||
let result = if estimate == f32::MAX {
|
||||
"Disabled".to_string()
|
||||
} else {
|
||||
for (data, dataset_name) in get_codec_test_datasets() {
|
||||
let estimate_actual_opt: Option<(f32, f32)> =
|
||||
crate::tests::create_and_validate::<C>(&data, dataset_name);
|
||||
let result = if let Some((estimate, actual)) = estimate_actual_opt {
|
||||
format!("Estimate `{estimate}` Actual `{actual}`")
|
||||
} else {
|
||||
"Disabled".to_string()
|
||||
};
|
||||
println!("Codec {codec_name}, DataSet {dataset_name}, {result}");
|
||||
}
|
||||
@@ -240,37 +240,37 @@ mod tests {
|
||||
fn estimation_good_interpolation_case() {
|
||||
let data = (10..=20000_u64).collect::<Vec<_>>();
|
||||
|
||||
let linear_interpol_estimation = LinearCodec::estimate(&data);
|
||||
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
|
||||
assert_le!(linear_interpol_estimation, 0.01);
|
||||
|
||||
let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data);
|
||||
let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data).unwrap();
|
||||
assert_le!(multi_linear_interpol_estimation, 0.2);
|
||||
assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);
|
||||
|
||||
let bitpacked_estimation = BitpackedCodec::estimate(&data);
|
||||
let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
|
||||
assert_le!(linear_interpol_estimation, bitpacked_estimation);
|
||||
}
|
||||
#[test]
|
||||
fn estimation_test_bad_interpolation_case() {
|
||||
let data = vec![200, 10, 10, 10, 10, 1000, 20];
|
||||
|
||||
let linear_interpol_estimation = LinearCodec::estimate(&data);
|
||||
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
|
||||
assert_le!(linear_interpol_estimation, 0.32);
|
||||
|
||||
let bitpacked_estimation = BitpackedCodec::estimate(&data);
|
||||
let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
|
||||
assert_le!(bitpacked_estimation, linear_interpol_estimation);
|
||||
}
|
||||
#[test]
|
||||
fn estimation_test_bad_interpolation_case_monotonically_increasing() {
|
||||
let mut data = (200..=20000_u64).collect::<Vec<_>>();
|
||||
let mut data: Vec<u64> = (200..=20000_u64).collect();
|
||||
data.push(1_000_000);
|
||||
|
||||
// in this case the linear interpolation can't in fact not be worse than bitpacking,
|
||||
// but the estimator adds some threshold, which leads to estimated worse behavior
|
||||
let linear_interpol_estimation = LinearCodec::estimate(&data);
|
||||
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
|
||||
assert_le!(linear_interpol_estimation, 0.35);
|
||||
|
||||
let bitpacked_estimation = BitpackedCodec::estimate(&data);
|
||||
let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
|
||||
assert_le!(bitpacked_estimation, 0.32);
|
||||
assert_le!(bitpacked_estimation, linear_interpol_estimation);
|
||||
}
|
||||
|
||||
@@ -192,10 +192,15 @@ impl FastFieldCodec for LinearCodec {
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool {
|
||||
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima for the deviation of the calculated value are and
|
||||
/// the offset to shift all values to >=0 is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> Option<f32> {
|
||||
if fastfield_accessor.num_vals() < 3 {
|
||||
return false; // disable compressor for this case
|
||||
return None; // disable compressor for this case
|
||||
}
|
||||
|
||||
// On serialisation the offset is added to the actual value.
|
||||
// We need to make sure this won't run into overflow calculation issues.
|
||||
// For this we take the maximum theroretical offset and add this to the max value.
|
||||
@@ -207,14 +212,9 @@ impl FastFieldCodec for LinearCodec {
|
||||
.checked_add(theorethical_maximum_offset)
|
||||
.is_none()
|
||||
{
|
||||
return false;
|
||||
return None;
|
||||
}
|
||||
true
|
||||
}
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima for the deviation of the calculated value are and
|
||||
/// the offset to shift all values to >=0 is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {
|
||||
|
||||
let first_val = fastfield_accessor.get_val(0);
|
||||
let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1);
|
||||
let slope = get_slope(first_val, last_val, fastfield_accessor.num_vals());
|
||||
@@ -246,7 +246,7 @@ impl FastFieldCodec for LinearCodec {
|
||||
* fastfield_accessor.num_vals()
|
||||
+ LinearFooter::SIZE_IN_BYTES as u64;
|
||||
let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
Some(num_bits as f32 / num_bits_uncompressed as f32)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -262,10 +262,10 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
use crate::tests::get_codec_test_datasets;
|
||||
|
||||
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
|
||||
crate::tests::create_and_validate::<LinearCodec, LinearReader>(data, name)
|
||||
fn create_and_validate(data: &[u64], name: &str) -> Option<(f32, f32)> {
|
||||
crate::tests::create_and_validate::<LinearCodec>(data, name)
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -292,15 +292,15 @@ mod tests {
|
||||
fn test_compression() {
|
||||
let data = (10..=6_000_u64).collect::<Vec<_>>();
|
||||
let (estimate, actual_compression) =
|
||||
create_and_validate(&data, "simple monotonically large");
|
||||
create_and_validate(&data, "simple monotonically large").unwrap();
|
||||
|
||||
assert!(actual_compression < 0.01);
|
||||
assert!(estimate < 0.01);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_with_codec_data_sets() {
|
||||
let data_sets = get_codec_test_data_sets();
|
||||
fn test_with_codec_datasets() {
|
||||
let data_sets = get_codec_test_datasets();
|
||||
for (mut data, name) in data_sets {
|
||||
create_and_validate(&data, name);
|
||||
data.reverse();
|
||||
@@ -337,9 +337,10 @@ mod tests {
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_rand() {
|
||||
for _ in 0..5000 {
|
||||
let mut data = (0..50).map(|_| rand::random::<u64>()).collect::<Vec<_>>();
|
||||
let mut data = (0..10_000)
|
||||
.map(|_| rand::random::<u64>())
|
||||
.collect::<Vec<_>>();
|
||||
create_and_validate(&data, "random");
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data, "random");
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#[macro_use]
|
||||
extern crate prettytable;
|
||||
use fastfield_codecs::bitpacked::BitpackedCodec;
|
||||
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
|
||||
use fastfield_codecs::linear::LinearCodec;
|
||||
use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldStats};
|
||||
@@ -12,37 +13,30 @@ fn main() {
|
||||
table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);
|
||||
|
||||
for (data, data_set_name) in get_codec_test_data_sets() {
|
||||
let mut results = vec![];
|
||||
let res = serialize_with_codec::<LinearCodec>(&data);
|
||||
results.push(res);
|
||||
let res = serialize_with_codec::<BlockwiseLinearCodec>(&data);
|
||||
results.push(res);
|
||||
let res = serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedCodec>(&data);
|
||||
results.push(res);
|
||||
|
||||
// let best_estimation_codec = results
|
||||
//.iter()
|
||||
//.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap())
|
||||
//.unwrap();
|
||||
let results: Vec<(f32, f32, FastFieldCodecType)> = [
|
||||
serialize_with_codec::<LinearCodec>(&data),
|
||||
serialize_with_codec::<BlockwiseLinearCodec>(&data),
|
||||
serialize_with_codec::<BlockwiseLinearCodec>(&data),
|
||||
serialize_with_codec::<BitpackedCodec>(&data),
|
||||
]
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.collect();
|
||||
let best_compression_ratio_codec = results
|
||||
.iter()
|
||||
.min_by(|res1, res2| res1.partial_cmp(res2).unwrap())
|
||||
.min_by(|&res1, &res2| res1.partial_cmp(res2).unwrap())
|
||||
.cloned()
|
||||
.unwrap();
|
||||
|
||||
table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
|
||||
for (is_applicable, est, comp, codec_type) in results {
|
||||
let (est_cell, ratio_cell) = if !is_applicable {
|
||||
("Codec Disabled".to_string(), "".to_string())
|
||||
} else {
|
||||
(est.to_string(), comp.to_string())
|
||||
};
|
||||
for (est, comp, codec_type) in results {
|
||||
let est_cell = est.to_string();
|
||||
let ratio_cell = comp.to_string();
|
||||
let style = if comp == best_compression_ratio_codec.1 {
|
||||
"Fb"
|
||||
} else {
|
||||
""
|
||||
};
|
||||
|
||||
table.add_row(Row::new(vec![
|
||||
Cell::new(&format!("{codec_type:?}")).style_spec("bFg"),
|
||||
Cell::new(&ratio_cell).style_spec(style),
|
||||
@@ -91,17 +85,12 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
|
||||
pub fn serialize_with_codec<C: FastFieldCodec>(
|
||||
data: &[u64],
|
||||
) -> (bool, f32, f32, FastFieldCodecType) {
|
||||
let is_applicable = C::is_applicable(&data);
|
||||
if !is_applicable {
|
||||
return (false, 0.0, 0.0, C::CODEC_TYPE);
|
||||
}
|
||||
let estimation = C::estimate(&data);
|
||||
let mut out = vec![];
|
||||
) -> Option<(f32, f32, FastFieldCodecType)> {
|
||||
let estimation = C::estimate(&data)?;
|
||||
let mut out = Vec::new();
|
||||
C::serialize(&mut out, &data).unwrap();
|
||||
|
||||
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
|
||||
(true, estimation, actual_compression, C::CODEC_TYPE)
|
||||
Some((estimation, actual_compression, C::CODEC_TYPE))
|
||||
}
|
||||
|
||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||
|
||||
@@ -68,11 +68,9 @@ fn codec_estimation<C: FastFieldCodec>(
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
estimations: &mut Vec<(f32, FastFieldCodecType)>,
|
||||
) {
|
||||
if !C::is_applicable(fastfield_accessor) {
|
||||
return;
|
||||
if let Some(ratio) = C::estimate(fastfield_accessor) {
|
||||
estimations.push((ratio, C::CODEC_TYPE));
|
||||
}
|
||||
let ratio = C::estimate(fastfield_accessor);
|
||||
estimations.push((ratio, C::CODEC_TYPE));
|
||||
}
|
||||
|
||||
impl CompositeFastFieldSerializer {
|
||||
|
||||
Reference in New Issue
Block a user