Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-06 01:02:55 +00:00)
add linear interpolation estimation
add estimation tests; add codec test data in tests
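The codec this commit estimates works by fitting a straight line from the first to the last value and bit-packing only each value's (offset-shifted) deviation from that line; the closer the data is to linear, the fewer bits per value are needed. A minimal standalone sketch of that idea (names and structure here are illustrative, not part of the commit):

// Illustrative sketch only: compute the deviations that a linear interpolation codec
// would bit-pack. Assumes at least two values.
fn line_deviations(data: &[u64]) -> (f64, Vec<i64>) {
    let first = data[0] as f64;
    let last = *data.last().unwrap() as f64;
    // Slope of the line through the first and last value.
    let slope = (last - first) / (data.len() as f64 - 1.0);
    // Per-value deviation from that line; these small numbers are what gets bit-packed.
    let deviations = data
        .iter()
        .enumerate()
        .map(|(pos, &val)| val as i64 - (first + pos as f64 * slope) as i64)
        .collect();
    (slope, deviations)
}

fn main() {
    // Perfectly linear data: every deviation is 0, so almost no bits per value are needed.
    let (slope, devs) = line_deviations(&(10..=20u64).collect::<Vec<_>>());
    println!("slope = {slope}, deviations = {devs:?}");
}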
@@ -11,4 +11,5 @@ common = { path = "../common/" }
tantivy-bitpacker = { path = "../bitpacker/" }

[dev-dependencies]
more-asserts = "0.2.1"
rand = "0.8.3"
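The new more-asserts dev-dependency provides ordering assertion macros such as assert_le!, which the estimation tests further down use. A minimal usage sketch (illustrative, outside this diff):

#[macro_use]
extern crate more_asserts;

fn main() {
    let estimated_ratio = 0.12_f32;
    // Fails with a descriptive message if the left-hand side is not <= the right-hand side.
    assert_le!(estimated_ratio, 0.5);
}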
@@ -131,8 +131,8 @@ impl<'a, W: 'a + Write> CodecId for BitpackedFastFieldSerializer<'_, W> {
#[cfg(test)]
mod tests {
    use super::*;

    fn create_and_validate(data: &[u64]) {
    use crate::tests::get_codec_test_data_sets;
    fn create_and_validate(data: &[u64], name: &str) {
        let mut out = vec![];
        BitpackedFastFieldSerializer::create(
            &mut out,
@@ -143,45 +143,37 @@ mod tests {
        .unwrap();

        let reader = BitpackedFastFieldReader::open_from_bytes(&out).unwrap();
        for (doc, val) in data.iter().enumerate() {
            assert_eq!(reader.get_u64(doc as u64, &out), *val);
        for (doc, orig_val) in data.iter().enumerate() {
            let val = reader.get_u64(doc as u64, &out);
            if val != *orig_val {
                panic!(
                    "val {:?} does not match orig_val {:?}, in data set {}",
                    val, orig_val, name
                );
            }
        }
    }

    #[test]
    fn bitpacked_fast_field_test_simple() {
        let data = (10..=20_u64).collect::<Vec<_>>();

        create_and_validate(&data);
    fn test_with_codec_data_sets() {
        let data_sets = get_codec_test_data_sets();
        for (mut data, name) in data_sets {
            create_and_validate(&data, name);
            data.reverse();
            create_and_validate(&data, name);
        }
    }

    #[test]
    fn bitpacked_fast_field_test_with_offset() {
        //let data = vec![5, 50, 95, 96, 97, 98, 99, 100];
        let mut data = vec![5, 6, 7, 8, 9, 10, 99, 100];
        create_and_validate(&data);

        data.reverse();
        create_and_validate(&data);
    }
    #[test]
    fn bitpacked_fast_field_test_no_structure() {
        let mut data = vec![5, 50, 3, 13, 1, 1000, 35];
        create_and_validate(&data);

        data.reverse();
        create_and_validate(&data);
    }
    #[test]
    fn bitpacked_fast_field_rand() {
        for _ in 0..500 {
            let mut data = (0..1 + rand::random::<u8>() as usize)
                .map(|_| rand::random::<i64>() as u64 / 2 as u64)
                .collect::<Vec<_>>();
            create_and_validate(&data);
            create_and_validate(&data, "rand");

            data.reverse();
            create_and_validate(&data);
            create_and_validate(&data, "rand");
        }
    }
}
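For the bitpacked codec the compressed size is driven by the amplitude max_value - min_value: every value is stored with just enough bits to cover that range. A rough standalone sketch of that bit-width computation (an approximation of what tantivy_bitpacker::compute_num_bits returns; the exact behavior is an assumption here):

// Bits needed to represent any value in [0, amplitude].
fn num_bits_for(amplitude: u64) -> u8 {
    (64 - amplitude.leading_zeros()) as u8
}

fn main() {
    // Data in 10..=20 has amplitude 10, so 4 bits per value instead of 64.
    assert_eq!(num_bits_for(20 - 10), 4);
    // A single huge outlier blows up the amplitude and with it the bits per value.
    assert_eq!(num_bits_for(1_000_000), 20);
}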
@@ -1,20 +1,10 @@
#[cfg(test)]
#[macro_use]
extern crate more_asserts;

pub mod bitpacked;
pub mod linearinterpol;

#[cfg(test)]
mod tests {
    use super::*;
    pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
        let min_value = data.iter().cloned().min().unwrap_or(0);
        let max_value = data.iter().cloned().max().unwrap_or(0);
        FastFieldStats {
            min_value,
            max_value,
            num_vals: data.len() as u64,
        }
    }
}

/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
pub trait FastFieldDataAccess: Clone {
    /// Return the value associated to the given document.
@@ -31,6 +21,10 @@ pub trait FastFieldDataAccess: Clone {
/// of fast field compressions, to decide which one to choose.
pub trait FastFieldSerializerEstimate {
    /// returns an estimate of the compression ratio.
    /// The baseline is uncompressed 64bit data.
    ///
    /// It could make sense to also return a value representing
    /// computational complexity.
    fn estimate(
        fastfield_accessor: &impl FastFieldDataAccess,
        stats: FastFieldStats,
@@ -64,3 +58,78 @@ impl FastFieldDataAccess for Vec<u64> {
        self[doc as usize]
    }
}
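FastFieldSerializerEstimate exists so a caller can ask every available codec for an estimated compression ratio and pick the cheapest one before serializing. A hedged sketch of that selection step, using the (f32, &'static str) ratio-and-name shape returned by estimate() in the linearinterpol implementation below (the helper and the example names are made up for illustration):

// Hypothetical helper: pick the codec with the smallest estimated ratio.
// Each entry is the (ratio, name) pair a codec's estimate() would return.
fn pick_codec(estimates: &[(f32, &'static str)]) -> (f32, &'static str) {
    estimates
        .iter()
        .copied()
        .min_by(|a, b| a.0.partial_cmp(&b.0).unwrap())
        .expect("at least one codec estimate")
}

fn main() {
    let estimates = vec![(0.25, "Bitpacked"), (0.08, "LinearInterpol")];
    assert_eq!(pick_codec(&estimates).1, "LinearInterpol");
}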
#[cfg(test)]
mod tests {
    use crate::{
        bitpacked::BitpackedFastFieldSerializer, linearinterpol::LinearInterpolFastFieldSerializer,
    };

    pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
        let mut data_and_names = vec![];

        let data = (10..=20_u64).collect::<Vec<_>>();
        data_and_names.push((data, "simple monotonically increasing"));

        data_and_names.push((
            vec![5, 6, 7, 8, 9, 10, 99, 100],
            "offset in linear interpol",
        ));
        data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small"));
        data_and_names.push((vec![10], "single value"));

        data_and_names
    }

    use super::*;
    pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
        let min_value = data.iter().cloned().min().unwrap_or(0);
        let max_value = data.iter().cloned().max().unwrap_or(0);
        FastFieldStats {
            min_value,
            max_value,
            num_vals: data.len() as u64,
        }
    }

    #[test]
    fn estimation_good_interpolation_case() {
        let data = (10..=20_u64).collect::<Vec<_>>();

        let linear_interpol_estimation =
            LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
        assert_le!(linear_interpol_estimation.0, 0.1);

        let bitpacked_estimation =
            BitpackedFastFieldSerializer::<Vec<u8>>::estimate(&data, stats_from_vec(&data));
        assert_le!(linear_interpol_estimation.0, bitpacked_estimation.0);
    }
    #[test]
    fn estimation_test_bad_interpolation_case() {
        let data = vec![200, 10, 10, 10, 10, 1000, 20];

        let linear_interpol_estimation =
            LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
        assert_le!(linear_interpol_estimation.0, 0.3);

        let bitpacked_estimation =
            BitpackedFastFieldSerializer::<Vec<u8>>::estimate(&data, stats_from_vec(&data));
        assert_le!(bitpacked_estimation.0, linear_interpol_estimation.0);
    }
    #[test]
    fn estimation_test_bad_interpolation_case_monotonically_increasing() {
        let mut data = (200..=20000_u64).collect::<Vec<_>>();
        data.push(1_000_000);

        // in this case the linear interpolation can't actually be worse than bitpacking,
        // but the estimator adds some threshold, which leads to an estimated worse behavior
        let linear_interpol_estimation =
            LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
        assert_le!(linear_interpol_estimation.0, 0.35);

        let bitpacked_estimation =
            BitpackedFastFieldSerializer::<Vec<u8>>::estimate(&data, stats_from_vec(&data));
        assert_le!(bitpacked_estimation.0, 0.32);
        assert_le!(bitpacked_estimation.0, linear_interpol_estimation.0);
    }
}
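A back-of-the-envelope check on the 0.1 bound in estimation_good_interpolation_case: 10..=20 lies exactly on the line through its first and last value, so the sampled deviations are all zero and only the footer contributes to the estimate. The footer size below is an assumption, since LinearInterpolFooter::SIZE_IN_BYTES is not shown in this diff; the formula mirrors the estimate() implementation further down:

fn main() {
    // Perfectly linear data (10..=20): all sampled deviations are 0, so 0 bits per value.
    let num_vals = 11u64;
    let bits_per_value = 0u64;
    // ASSUMPTION: footer size in bytes; the real constant may differ.
    let footer_size_in_bytes = 48u64;
    // Mirrors the estimate() formula, which adds the footer term to the bit count as-is.
    let estimated_bits = bits_per_value * num_vals + footer_size_in_bytes;
    let ratio = estimated_bits as f32 / (64 * num_vals) as f32;
    assert!(ratio <= 0.1); // consistent with estimation_good_interpolation_case
}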
@@ -3,6 +3,7 @@ use crate::FastFieldDataAccess;
use crate::FastFieldSerializerEstimate;
use crate::FastFieldStats;
use std::io::{self, Read, Write};
use std::ops::Sub;
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitPacker;

@@ -58,11 +59,6 @@ impl LinearinterpolFastFieldReader {
    pub fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
        let (_data, mut footer) = bytes.split_at(bytes.len() - LinearInterpolFooter::SIZE_IN_BYTES);
        let footer = LinearInterpolFooter::deserialize(&mut footer)?;
        //let rel_max_value = u64::deserialize(&mut footer)?;
        //let offset = u64::deserialize(&mut footer)?;
        //let first_value = u64::deserialize(&mut footer)?;
        //let last_value = u64::deserialize(&mut footer)?;
        //let num_vals = u64::deserialize(&mut footer)?;
        let slope = (footer.last_val as f64 - footer.first_val as f64)
            / (footer.num_vals as u64 - 1) as f64;

@@ -75,7 +71,7 @@ impl LinearinterpolFastFieldReader {
        })
    }
    pub fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
        let calculated_value = self.footer.first_val + (doc as f64 * self.slope) as u64;
        let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
        (calculated_value + self.bit_unpacker.get(doc, &data)) - self.footer.offset
    }
}
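Decoding in get_u64 reverses the encoding: reconstruct the point on the line for the document position, add the bit-packed delta, and subtract the global offset. A standalone arithmetic sketch of that reconstruction (plain integers, no tantivy types):

// Sketch of the decode path: value = line(pos) + packed_delta - offset.
fn decode(first_val: u64, slope: f64, offset: u64, pos: u64, packed_delta: u64) -> u64 {
    let on_the_line = first_val + (pos as f64 * slope) as u64;
    (on_the_line + packed_delta) - offset
}

fn main() {
    // Data [5, 6, 7, 8, 9, 10, 99, 100]: line from 5 to 100 over 7 steps.
    let slope = (100.0 - 5.0) / 7.0;
    // At pos 6 the line predicts 86. With a global offset of 62 (some values fall below
    // the line) the encoder stored the delta 75, and decoding recovers the original 99.
    assert_eq!(decode(5, slope, 62, 6, 75), 99);
}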
@@ -94,47 +90,43 @@ impl LinearInterpolFastFieldSerializer {
        stats: FastFieldStats,
        data_iter: impl Iterator<Item = u64>,
        data_iter1: impl Iterator<Item = u64>,
        data_iter2: impl Iterator<Item = u64>,
    ) -> io::Result<()> {
        assert!(stats.min_value <= stats.max_value);

        let first_val = fastfield_accessor.get(0);
        let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
        let slope = get_slope(first_val, last_val, stats.num_vals);
        // todo walk over data just once and calculate offset on the fly
        // todo walk over data just once and calculate offset and max on the fly
        // offset to ensure all values are positive
        let offset = data_iter1
            .enumerate()
            .map(|(pos, val)| {
                let calculated_value = first_val + (pos as f64 * slope) as u64;
                val as i64 - calculated_value as i64
            })
            .min()
            .unwrap()
            .abs() as u64;
        let mut offset = 0;
        let mut rel_positive_max = 0;
        for (pos, actual_value) in data_iter1.enumerate() {
            let calculated_value = get_calculated_value(first_val, pos as u64, slope);
            if calculated_value > actual_value {
                // negative value, we need to apply an offset
                // we ignore negative values in the max value calculation, because negative values
                // will be offset to 0
                offset = offset.max(calculated_value - actual_value);
            } else {
                // positive value, no offset required
                rel_positive_max = rel_positive_max.max(actual_value - calculated_value);
            }
        }

        // calc new max
        let rel_max = data_iter2
            .enumerate()
            .map(|(pos, val)| {
                let calculated_value = first_val + (pos as f64 * slope) as u64;
                (val + offset) - calculated_value
            })
            .max()
            .unwrap();
        // rel_positive_max will be adjusted by offset
        let relative_max_value = rel_positive_max + offset;

        let amplitude = rel_max;
        let num_bits = compute_num_bits(amplitude);
        let num_bits = compute_num_bits(relative_max_value);
        let mut bit_packer = BitPacker::new();
        for (pos, val) in data_iter.enumerate() {
            let calculated_value = first_val + (pos as f64 * slope) as u64;
            let calculated_value = get_calculated_value(first_val, pos as u64, slope);
            let diff = (val + offset) - calculated_value;
            bit_packer.write(diff, num_bits, write)?;
        }
        bit_packer.close(write)?;

        let footer = LinearInterpolFooter {
            relative_max_value: amplitude,
            relative_max_value,
            offset,
            first_val,
            last_val,
@@ -148,20 +140,66 @@ fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f64 {
    (last_val as f64 - first_val as f64) / (num_vals as u64 - 1) as f64
}

fn get_calculated_value(first_val: u64, pos: u64, slope: f64) -> u64 {
    first_val + (pos as f64 * slope) as u64
}
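To make the offset / relative_max_value bookkeeping concrete, here is the same scan as a standalone function, run on the "offset in linear interpol" data set from the tests (a sketch mirroring the loop above, not the actual serializer):

// Derive the offset and the relative max value in one pass, as the serializer does.
fn offset_and_rel_max(data: &[u64]) -> (u64, u64) {
    let first_val = data[0] as f64;
    let slope = (*data.last().unwrap() as f64 - first_val) / (data.len() as f64 - 1.0);
    let (mut offset, mut rel_positive_max) = (0u64, 0u64);
    for (pos, &actual) in data.iter().enumerate() {
        let calculated = (first_val + pos as f64 * slope) as u64;
        if calculated > actual {
            // value below the line: remember how far, so every delta can be shifted positive
            offset = offset.max(calculated - actual);
        } else {
            // value above the line: candidate for the largest positive delta
            rel_positive_max = rel_positive_max.max(actual - calculated);
        }
    }
    (offset, rel_positive_max + offset)
}

fn main() {
    // For [5, 6, 7, 8, 9, 10, 99, 100] the middle values sit below the 5..100 line,
    // giving offset 62, while 99 sits 13 above it, giving a relative max of 75.
    assert_eq!(offset_and_rel_max(&[5, 6, 7, 8, 9, 10, 99, 100]), (62, 75));
}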
impl FastFieldSerializerEstimate for LinearInterpolFastFieldSerializer {
    /// Estimation for linear interpolation is hard because we don't know where
    /// the local maxima of the deviation from the calculated value are, and the
    /// offset is also unknown.
    fn estimate(
        _fastfield_accessor: &impl FastFieldDataAccess,
        fastfield_accessor: &impl FastFieldDataAccess,
        stats: FastFieldStats,
    ) -> (f32, &'static str) {
        let amplitude = stats.max_value - stats.min_value;
        let num_bits = compute_num_bits(amplitude);
        let num_bits_uncompressed = 64;
        let first_val = fastfield_accessor.get(0);
        let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
        let slope = get_slope(first_val, last_val, stats.num_vals);

        // let's sample at 10%, 20%, 25%, 50%, 75%, 90%
        let num_vals = stats.num_vals as f32 / 100.0;
        let sample_positions = [
            (num_vals * 10.0) as usize,
            (num_vals * 20.0) as usize,
            (num_vals * 25.0) as usize,
            (num_vals * 50.0) as usize,
            (num_vals * 75.0) as usize,
            (num_vals * 90.0) as usize,
        ];

        let max_distance = sample_positions
            .iter()
            .map(|pos| {
                let calculated_value = get_calculated_value(first_val, *pos as u64, slope);
                let actual_value = fastfield_accessor.get(*pos as u32);
                distance(calculated_value, actual_value)
            })
            .max()
            .unwrap();

        // The assumption is that the samples don't hit the actual max_distance,
        // but get within a 50% threshold of it.
        // It is multiplied by 2 because the line could deviate as much above as below
        // the data, in which case the offset would equal max_distance.
        let relative_max_value = (max_distance as f32 * 1.5) * 2.0;

        let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
            + LinearInterpolFooter::SIZE_IN_BYTES as u64;
        let num_bits_uncompressed = 64 * stats.num_vals;
        let ratio = num_bits as f32 / num_bits_uncompressed as f32;
        let name = Self::NAME;
        (ratio, name)
    }
}
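In short, the estimate is: sampled max deviation, scaled by 1.5 for sampling error and by 2 for a possible offset, determines the bits per value, which is then compared against 64-bit uncompressed storage. A small standalone version of that arithmetic (the footer size is an assumption, and here it is converted to bits rather than added as bytes):

fn bits_needed(value: u64) -> u32 {
    64 - value.leading_zeros()
}

fn main() {
    // Suppose sampling finds a max deviation of 100 across 10_000 values.
    let (max_distance, num_vals) = (100u64, 10_000u64);
    let relative_max_value = (max_distance as f32 * 1.5) * 2.0; // 300.0
    let bits_per_value = bits_needed(relative_max_value as u64) as u64; // 9 bits for 300
    let footer_bits = 48 * 8u64; // ASSUMPTION: ~48-byte footer, negligible at this scale
    let estimated_bits = bits_per_value * num_vals + footer_bits;
    let ratio = estimated_bits as f32 / (64 * num_vals) as f32;
    // Roughly 9/64, so about 0.14 of the uncompressed size.
    assert!(ratio > 0.13 && ratio < 0.15);
}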
fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
    if x < y {
        y - x
    } else {
        x - y
    }
}

impl CodecId for LinearInterpolFastFieldSerializer {
    const NAME: &'static str = "LinearInterpol";
    const ID: u8 = 2;
@@ -170,8 +208,9 @@ impl CodecId for LinearInterpolFastFieldSerializer {
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tests::get_codec_test_data_sets;

    fn create_and_validate(data: &[u64]) -> (u64, u64) {
    fn create_and_validate(data: &[u64], name: &str) -> (u64, u64) {
        let mut out = vec![];
        LinearInterpolFastFieldSerializer::create(
            &mut out,
@@ -179,54 +218,52 @@ mod tests {
            crate::tests::stats_from_vec(&data),
            data.iter().cloned(),
            data.iter().cloned(),
            data.iter().cloned(),
        )
        .unwrap();

        let reader = LinearinterpolFastFieldReader::open_from_bytes(&out).unwrap();
        for (doc, val) in data.iter().enumerate() {
            assert_eq!(reader.get_u64(doc as u64, &out), *val);
        for (doc, orig_val) in data.iter().enumerate() {
            //assert_eq!(reader.get_u64(doc as u64, &out), *val);
            let val = reader.get_u64(doc as u64, &out);
            if val != *orig_val {
                panic!(
                    "val {:?} does not match orig_val {:?}, in data set {}",
                    val, orig_val, name
                );
            }
        }
        (reader.footer.relative_max_value, reader.footer.offset)
    }

    #[test]
    fn test_with_codec_data_sets() {
        let data_sets = get_codec_test_data_sets();
        for (mut data, name) in data_sets {
            create_and_validate(&data, name);
            data.reverse();
            create_and_validate(&data, name);
        }
    }
    #[test]
    fn linear_interpol_fast_field_test_simple() {
        let data = (10..=20_u64).collect::<Vec<_>>();

        let (rel_max_value, offset) = create_and_validate(&data);
        let (rel_max_value, offset) = create_and_validate(&data, "simple monotonically");

        assert_eq!(offset, 0);
        assert_eq!(rel_max_value, 0);
    }

    #[test]
    fn linear_interpol_fast_field_test_with_offset() {
        //let data = vec![5, 50, 95, 96, 97, 98, 99, 100];
        let mut data = vec![5, 6, 7, 8, 9, 10, 99, 100];
        create_and_validate(&data);

        data.reverse();
        create_and_validate(&data);
    }
    #[test]
    fn linear_interpol_fast_field_test_no_structure() {
        let mut data = vec![5, 50, 3, 13, 1, 1000, 35];
        create_and_validate(&data);

        data.reverse();
        create_and_validate(&data);
    }
    #[test]
    fn linear_interpol_fast_field_rand() {
        for _ in 0..500 {
            let mut data = (0..1 + rand::random::<u8>() as usize)
                .map(|_| rand::random::<i64>() as u64 / 2 as u64)
                .collect::<Vec<_>>();
            create_and_validate(&data);
            create_and_validate(&data, "random");

            data.reverse();
            create_and_validate(&data);
            create_and_validate(&data, "random");
        }
    }
}
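Why linear_interpol_fast_field_test_simple expects offset 0 and relative max 0: every value in 10..=20 sits exactly on the line from 10 to 20, so nothing falls below the line (no offset) and nothing rises above it (no relative max). A standalone check of that claim:

fn main() {
    let data: Vec<u64> = (10..=20).collect();
    let slope = (20.0 - 10.0) / (data.len() as f64 - 1.0); // 1.0
    // Every value matches the interpolated line exactly, so all deviations are zero.
    assert!(data
        .iter()
        .enumerate()
        .all(|(pos, &val)| val == 10 + (pos as f64 * slope) as u64));
}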