//! The MultiLinearInterpol compressor uses linear interpolation to estimate the values and
//! stores the offset from that estimate, working in blocks of 512 elements.
//!
//! With a CHUNK_SIZE of 512 and 29 bytes of metadata per block, the metadata overhead is
//! 232 / 512 = 0.45 bits per element. The additional space required per element in a block is
//! determined by the maximum deviation from the linear interpolation estimate.
//!
//! E.g. if the maximum deviation of an element is 12, all elements cost 4 bits.
//!
//! Size per block:
//! Num Elements * Bits for Maximum Deviation from Interpolation + 29 Bytes Metadata
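//!
//! As a worked example (hypothetical numbers, not measured data): for a full block of 512
//! elements whose maximum deviation from the interpolation line is 12, each element is stored
//! with `compute_num_bits(12) = 4` bits, so the block takes 512 * 4 bits + 29 bytes =
//! 256 + 29 = 285 bytes.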
use std::io::{self, Read, Write};
use std::ops::Sub;

use common::{BinarySerializable, CountingWriter, DeserializeFrom};
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};

const CHUNK_SIZE: u64 = 512;

/// Depending on the field type, a different
/// fast field is required.
#[derive(Clone)]
pub struct MultiLinearInterpolFastFieldReader {
    pub footer: MultiLinearInterpolFooter,
}

#[derive(Clone, Debug, Default)]
struct Function {
    // The offset in the data is required, because we have different bit_widths per block
    data_start_offset: u64,
    // start_pos in the block will be CHUNK_SIZE * BLOCK_NUM
    start_pos: u64,
    // only used during serialization, 0 after deserialization
    end_pos: u64,
    // only used during serialization, 0 after deserialization
    value_start_pos: u64,
    // only used during serialization, 0 after deserialization
    value_end_pos: u64,
    slope: f32,
    // The offset so that all values are positive when writing them
    positive_val_offset: u64,
    num_bits: u8,
    bit_unpacker: BitUnpacker,
}
impl Function {
    fn calc_slope(&mut self) {
        let num_vals = self.end_pos - self.start_pos;
        self.slope = get_slope(self.value_start_pos, self.value_end_pos, num_vals);
    }
    // Split the interpolation into two functions: shrink self to the first half and return the
    // second half.
    fn split(&mut self, split_pos: u64, split_pos_value: u64) -> Function {
        let mut new_function = Function {
            start_pos: split_pos,
            end_pos: self.end_pos,
            value_start_pos: split_pos_value,
            value_end_pos: self.value_end_pos,
            ..Default::default()
        };
        new_function.calc_slope();
        self.end_pos = split_pos;
        self.value_end_pos = split_pos_value;
        self.calc_slope();
        new_function
    }
}
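// Per-block metadata serialized below: 3 x u64 + 1 x f32 + 1 x u8 = 8 + 8 + 8 + 4 + 1 = 29 bytes,
// which is the 29 bytes of overhead per block mentioned in the module docs.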
impl BinarySerializable for Function {
    fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
        self.data_start_offset.serialize(write)?;
        self.value_start_pos.serialize(write)?;
        self.positive_val_offset.serialize(write)?;
        self.slope.serialize(write)?;
        self.num_bits.serialize(write)?;
        Ok(())
    }

    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Function> {
        let data_start_offset = u64::deserialize(reader)?;
        let value_start_pos = u64::deserialize(reader)?;
        let offset = u64::deserialize(reader)?;
        let slope = f32::deserialize(reader)?;
        let num_bits = u8::deserialize(reader)?;
        let interpolation = Function {
            data_start_offset,
            value_start_pos,
            positive_val_offset: offset,
            num_bits,
            bit_unpacker: BitUnpacker::new(num_bits),
            slope,
            ..Default::default()
        };

        Ok(interpolation)
    }
}
#[derive(Clone, Debug)]
pub struct MultiLinearInterpolFooter {
    pub num_vals: u64,
    pub min_value: u64,
    pub max_value: u64,
    interpolations: Vec<Function>,
}
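// The footer is written as (num_vals, min_value, max_value, interpolations) followed by its own
// byte length as a u32, so a reader can locate the footer from the end of the data.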
impl BinarySerializable for MultiLinearInterpolFooter {
    fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
        let mut out = vec![];
        self.num_vals.serialize(&mut out)?;
        self.min_value.serialize(&mut out)?;
        self.max_value.serialize(&mut out)?;
        self.interpolations.serialize(&mut out)?;
        write.write_all(&out)?;
        (out.len() as u32).serialize(write)?;
        Ok(())
    }

    fn deserialize<R: Read>(reader: &mut R) -> io::Result<MultiLinearInterpolFooter> {
        let mut footer = MultiLinearInterpolFooter {
            num_vals: u64::deserialize(reader)?,
            min_value: u64::deserialize(reader)?,
            max_value: u64::deserialize(reader)?,
            interpolations: Vec::<Function>::deserialize(reader)?,
        };
        for (num, interpol) in footer.interpolations.iter_mut().enumerate() {
            interpol.start_pos = CHUNK_SIZE * num as u64;
        }
        Ok(footer)
    }
}
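// E.g. with CHUNK_SIZE = 512, doc 1000 falls into block 1000 / 512 = 1.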
#[inline]
fn get_interpolation_position(doc: u64) -> usize {
    let index = doc / CHUNK_SIZE;
    index as usize
}

#[inline]
fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Function {
    &interpolations[get_interpolation_position(doc)]
}
impl FastFieldCodecReader for MultiLinearInterpolFastFieldReader {
    /// Opens a fast field given a file.
    fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
        let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;

        let (_data, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
        let footer = MultiLinearInterpolFooter::deserialize(&mut footer)?;

        Ok(MultiLinearInterpolFastFieldReader { footer })
    }
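    // Decode path: find the block for `doc`, evaluate the interpolation at the position inside
    // the block, add the bit-packed delta and subtract the block's positive_val_offset that was
    // added during serialization to keep all deltas positive.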
    #[inline]
    fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
        let interpolation = get_interpolation_function(doc, &self.footer.interpolations);
        let doc = doc - interpolation.start_pos;
        let calculated_value =
            get_calculated_value(interpolation.value_start_pos, doc, interpolation.slope);
        let diff = interpolation
            .bit_unpacker
            .get(doc, &data[interpolation.data_start_offset as usize..]);
        (calculated_value + diff) - interpolation.positive_val_offset
    }

    #[inline]
    fn min_value(&self) -> u64 {
        self.footer.min_value
    }
    #[inline]
    fn max_value(&self) -> u64 {
        self.footer.max_value
    }
}
#[inline]
fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
    ((last_val as f64 - first_val as f64) / (num_vals as u64 - 1) as f64) as f32
}
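// For example (hypothetical values): with first_val = 100 and slope = 2.5, the interpolated
// estimate at pos = 4 is 100 + (4.0 * 2.5) as i64 = 110.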
#[inline]
fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
    (first_val as i64 + (pos as f32 * slope) as i64) as u64
}
/// Same as LinearInterpolFastFieldSerializer, but working on chunks of CHUNK_SIZE elements.
pub struct MultiLinearInterpolFastFieldSerializer {}

impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
    const NAME: &'static str = "MultiLinearInterpol";
    const ID: u8 = 3;
    /// Serializes the fast field values.
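    ///
    /// The values are cached and processed in two passes: the first pass computes each block's
    /// positive offset and bit width, the second pass bit-packs each value's delta from the
    /// interpolation. The footer with the per-block metadata is appended at the end.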
    fn serialize(
        write: &mut impl Write,
        fastfield_accessor: &dyn FastFieldDataAccess,
        stats: FastFieldStats,
        data_iter: impl Iterator<Item = u64>,
        _data_iter1: impl Iterator<Item = u64>,
    ) -> io::Result<()> {
        assert!(stats.min_value <= stats.max_value);

        let first_val = fastfield_accessor.get_val(0);
        let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);

        let mut first_function = Function {
            end_pos: stats.num_vals,
            value_start_pos: first_val,
            value_end_pos: last_val,
            ..Default::default()
        };
        first_function.calc_slope();
        let mut interpolations = vec![first_function];

        // Since we potentially apply multiple passes over the data, the data is cached.
        // Multiple iterations can be expensive (a merge with index sorting can add a lot of
        // overhead per iteration).
        let data = data_iter.collect::<Vec<_>>();

        // Split the data into chunks of CHUNK_SIZE elements.
        for data_pos in (0..data.len() as u64).step_by(CHUNK_SIZE as usize).skip(1) {
            let new_fun = {
                let current_interpolation = interpolations.last_mut().unwrap();
                current_interpolation.split(data_pos, data[data_pos as usize])
            };
            interpolations.push(new_fun);
        }
        // First pass: calculate the offset and the maximum positive delta (-> num_bits) for
        // each function.
        for interpolation in &mut interpolations {
            let mut offset = 0;
            let mut rel_positive_max = 0;
            for (pos, actual_value) in data
                [interpolation.start_pos as usize..interpolation.end_pos as usize]
                .iter()
                .cloned()
                .enumerate()
            {
                let calculated_value = get_calculated_value(
                    interpolation.value_start_pos,
                    pos as u64,
                    interpolation.slope,
                );
                if calculated_value > actual_value {
                    // negative delta, we need to apply an offset
                    // we ignore negative deltas in the max value calculation, because negative
                    // deltas will be offset to 0
                    offset = offset.max(calculated_value - actual_value);
                } else {
                    // positive delta, no offset required
                    rel_positive_max = rel_positive_max.max(actual_value - calculated_value);
                }
            }

            interpolation.positive_val_offset = offset;
            interpolation.num_bits = compute_num_bits(rel_positive_max + offset);
        }
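        // Second pass: for each block, record where its bit-packed data starts and write each
        // value's delta from the interpolation, shifted by the block's positive_val_offset.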
        let mut bit_packer = BitPacker::new();

        let write = &mut CountingWriter::wrap(write);
        for interpolation in &mut interpolations {
            interpolation.data_start_offset = write.written_bytes();
            let num_bits = interpolation.num_bits;
            for (pos, actual_value) in data
                [interpolation.start_pos as usize..interpolation.end_pos as usize]
                .iter()
                .cloned()
                .enumerate()
            {
                let calculated_value = get_calculated_value(
                    interpolation.value_start_pos,
                    pos as u64,
                    interpolation.slope,
                );
                let diff = (actual_value + interpolation.positive_val_offset) - calculated_value;
                bit_packer.write(diff, num_bits, write)?;
            }
            bit_packer.flush(write)?;
        }
        bit_packer.close(write)?;

        let footer = MultiLinearInterpolFooter {
            num_vals: stats.num_vals,
            min_value: stats.min_value,
            max_value: stats.max_value,
            interpolations,
        };
        footer.serialize(write)?;
        Ok(())
    }
    fn is_applicable(
        _fastfield_accessor: &impl FastFieldDataAccess,
        stats: FastFieldStats,
    ) -> bool {
        if stats.num_vals < 5_000 {
            return false;
        }
        // On serialization the offset is added to the actual value.
        // We need to make sure this won't run into overflow calculation issues.
        // For this we take the maximum theoretical offset and add it to the max value.
        // If this doesn't overflow, the algorithm should be fine.
        let theorethical_maximum_offset = stats.max_value - stats.min_value;
        if stats
            .max_value
            .checked_add(theorethical_maximum_offset)
            .is_none()
        {
            return false;
        }
        true
    }
    /// Estimation for linear interpolation is hard because you don't know
    /// where the local maxima of the deviation from the calculated value are,
    /// and the offset is also unknown.
    fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
        let first_val_in_first_block = fastfield_accessor.get_val(0);
        let last_elem_in_first_chunk = CHUNK_SIZE.min(stats.num_vals);
        let last_val_in_first_block =
            fastfield_accessor.get_val(last_elem_in_first_chunk as u64 - 1);
        let slope = get_slope(
            first_val_in_first_block,
            last_val_in_first_block,
            stats.num_vals,
        );

        // let's sample at 0%, 5%, 10% .. 95%, but in the first block only
        let sample_positions = (0..20)
            .map(|pos| (last_elem_in_first_chunk as f32 / 100.0 * pos as f32 * 5.0) as usize)
            .collect::<Vec<_>>();

        let max_distance = sample_positions
            .iter()
            .map(|pos| {
                let calculated_value =
                    get_calculated_value(first_val_in_first_block, *pos as u64, slope);
                let actual_value = fastfield_accessor.get_val(*pos as u64);
                distance(calculated_value, actual_value)
            })
            .max()
            .unwrap();

        // Estimate one block and extrapolate the cost to all blocks.
        // The assumption is that we don't have the actual max_distance, but we are within a
        // 50% threshold of it.
        // It is multiplied by 2 because, in the worst case, the line deviates as much above as
        // below the values, so the offset would equal max_distance.
        let relative_max_value = (max_distance as f32 * 1.5) * 2.0;

        let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
            // function metadata per block
            + 29 * (stats.num_vals / CHUNK_SIZE);
        let num_bits_uncompressed = 64 * stats.num_vals;
        num_bits as f32 / num_bits_uncompressed as f32
    }
}
fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
    if x < y {
        y - x
    } else {
        x - y
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tests::get_codec_test_data_sets;

    fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
        crate::tests::create_and_validate::<
            MultiLinearInterpolFastFieldSerializer,
            MultiLinearInterpolFastFieldReader,
        >(data, name)
    }

    #[test]
    fn test_compression() {
        let data = (10..=6_000_u64).collect::<Vec<_>>();
        let (estimate, actual_compression) =
            create_and_validate(&data, "simple monotonically large");
        assert!(actual_compression < 0.2);
        assert!(estimate < 0.20);
        assert!(estimate > 0.15);
        assert!(actual_compression > 0.01);
    }

    #[test]
    fn test_with_codec_data_sets() {
        let data_sets = get_codec_test_data_sets();
        for (mut data, name) in data_sets {
            create_and_validate(&data, name);
            data.reverse();
            create_and_validate(&data, name);
        }
    }
    #[test]
    fn test_simple() {
        let data = (10..=20_u64).collect::<Vec<_>>();
        create_and_validate(&data, "simple monotonically");
    }

    #[test]
    fn border_cases_1() {
        let data = (0..1024).collect::<Vec<_>>();
        create_and_validate(&data, "border case");
    }
    #[test]
    fn border_case_2() {
        let data = (0..1025).collect::<Vec<_>>();
        create_and_validate(&data, "border case");
    }
    #[test]
    fn rand() {
        for _ in 0..10 {
            let mut data = (5_000..20_000)
                .map(|_| rand::random::<u32>() as u64)
                .collect::<Vec<_>>();
            let _ = create_and_validate(&data, "random");
            data.reverse();
            create_and_validate(&data, "random");
        }
    }
}