mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-08 18:12:55 +00:00
streamline traits and tests
This commit is contained in:
@@ -6,7 +6,7 @@ extern crate test;
|
||||
mod tests {
|
||||
use fastfield_codecs::{
|
||||
bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer},
|
||||
linearinterpol::{LinearInterpolFastFieldSerializer, LinearinterpolFastFieldReader},
|
||||
linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer},
|
||||
multilinearinterpol::{
|
||||
MultiLinearInterpolFastFieldSerializer, MultiLinearinterpolFastFieldReader,
|
||||
},
|
||||
@@ -26,112 +26,75 @@ mod tests {
|
||||
data
|
||||
}
|
||||
|
||||
fn value_iter() -> impl Iterator<Item = u64> {
|
||||
0..20_000
|
||||
}
|
||||
fn bench_get<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
|
||||
b: &mut Bencher,
|
||||
data: &[u64],
|
||||
) {
|
||||
let mut bytes = vec![];
|
||||
S::create(
|
||||
&mut bytes,
|
||||
&data,
|
||||
stats_from_vec(&data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
let reader = R::open_from_bytes(&bytes).unwrap();
|
||||
b.iter(|| {
|
||||
for pos in value_iter() {
|
||||
reader.get_u64(pos as u64, &bytes);
|
||||
}
|
||||
});
|
||||
}
|
||||
fn bench_create<S: FastFieldCodecSerializer>(b: &mut Bencher, data: &[u64]) {
|
||||
let mut bytes = vec![];
|
||||
b.iter(|| {
|
||||
S::create(
|
||||
&mut bytes,
|
||||
&data,
|
||||
stats_from_vec(&data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
use test::Bencher;
|
||||
#[bench]
|
||||
fn bench_fastfield_bitpack_create(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
b.iter(|| {
|
||||
let mut out = vec![];
|
||||
BitpackedFastFieldSerializer::create(
|
||||
&mut out,
|
||||
&data,
|
||||
stats_from_vec(&data),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
out
|
||||
});
|
||||
bench_create::<BitpackedFastFieldSerializer>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
b.iter(|| {
|
||||
let mut out = vec![];
|
||||
LinearInterpolFastFieldSerializer::create(
|
||||
&mut out,
|
||||
&data,
|
||||
stats_from_vec(&data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
out
|
||||
});
|
||||
bench_create::<LinearInterpolFastFieldSerializer>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
b.iter(|| {
|
||||
let mut out = vec![];
|
||||
MultiLinearInterpolFastFieldSerializer::create(
|
||||
&mut out,
|
||||
&data,
|
||||
stats_from_vec(&data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
out
|
||||
});
|
||||
}
|
||||
fn value_iter() -> impl Iterator<Item = u64> {
|
||||
0..20_000
|
||||
bench_create::<MultiLinearInterpolFastFieldSerializer>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_bitpack_get(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
let mut bytes = vec![];
|
||||
BitpackedFastFieldSerializer::create(
|
||||
&mut bytes,
|
||||
&data,
|
||||
stats_from_vec(&data),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
let reader = BitpackedFastFieldReader::open_from_bytes(&bytes).unwrap();
|
||||
b.iter(|| {
|
||||
for pos in value_iter() {
|
||||
reader.get_u64(pos as u64, &bytes);
|
||||
}
|
||||
});
|
||||
bench_get::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
let mut bytes = vec![];
|
||||
LinearInterpolFastFieldSerializer::create(
|
||||
&mut bytes,
|
||||
&data,
|
||||
stats_from_vec(&data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
let reader = LinearinterpolFastFieldReader::open_from_bytes(&bytes).unwrap();
|
||||
b.iter(|| {
|
||||
for pos in value_iter() {
|
||||
reader.get_u64(pos as u64, &bytes);
|
||||
}
|
||||
});
|
||||
bench_get::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
let mut bytes = vec![];
|
||||
MultiLinearInterpolFastFieldSerializer::create(
|
||||
&mut bytes,
|
||||
&data,
|
||||
stats_from_vec(&data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
let reader = MultiLinearinterpolFastFieldReader::open_from_bytes(&bytes).unwrap();
|
||||
b.iter(|| {
|
||||
for pos in value_iter() {
|
||||
reader.get_u64(pos as u64, &bytes);
|
||||
}
|
||||
});
|
||||
bench_get::<MultiLinearInterpolFastFieldSerializer, MultiLinearinterpolFastFieldReader>(
|
||||
b, &data,
|
||||
);
|
||||
}
|
||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||
let min_value = data.iter().cloned().min().unwrap_or(0);
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use crate::CodecId;
|
||||
use crate::CodecReader;
|
||||
use crate::FastFieldCodecReader;
|
||||
use crate::FastFieldCodecSerializer;
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldSerializerEstimate;
|
||||
use crate::FastFieldStats;
|
||||
use common::BinarySerializable;
|
||||
use std::io::{self, Write};
|
||||
@@ -19,7 +19,7 @@ pub struct BitpackedFastFieldReader {
|
||||
pub max_value_u64: u64,
|
||||
}
|
||||
|
||||
impl<'data> CodecReader for BitpackedFastFieldReader {
|
||||
impl<'data> FastFieldCodecReader for BitpackedFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let (_data, mut footer) = bytes.split_at(bytes.len() - 16);
|
||||
@@ -47,7 +47,7 @@ impl<'data> CodecReader for BitpackedFastFieldReader {
|
||||
self.max_value_u64
|
||||
}
|
||||
}
|
||||
pub struct BitpackedFastFieldSerializer<'a, W: 'a + Write> {
|
||||
pub struct BitpackedFastFieldSerializerLegacy<'a, W: 'a + Write> {
|
||||
bit_packer: BitPacker,
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
@@ -55,7 +55,7 @@ pub struct BitpackedFastFieldSerializer<'a, W: 'a + Write> {
|
||||
num_bits: u8,
|
||||
}
|
||||
|
||||
impl<'a, W: Write> BitpackedFastFieldSerializer<'a, W> {
|
||||
impl<'a, W: Write> BitpackedFastFieldSerializerLegacy<'a, W> {
|
||||
/// Creates a new fast field serializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
@@ -68,12 +68,12 @@ impl<'a, W: Write> BitpackedFastFieldSerializer<'a, W> {
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<BitpackedFastFieldSerializer<'a, W>> {
|
||||
) -> io::Result<BitpackedFastFieldSerializerLegacy<'a, W>> {
|
||||
assert!(min_value <= max_value);
|
||||
let amplitude = max_value - min_value;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_packer = BitPacker::new();
|
||||
Ok(BitpackedFastFieldSerializer {
|
||||
Ok(BitpackedFastFieldSerializerLegacy {
|
||||
bit_packer,
|
||||
write,
|
||||
min_value,
|
||||
@@ -81,29 +81,6 @@ impl<'a, W: Write> BitpackedFastFieldSerializer<'a, W> {
|
||||
num_bits,
|
||||
})
|
||||
}
|
||||
/// Creates a new fast field serializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
/// `(val - min_value)`.
|
||||
///
|
||||
/// It requires a `min_value` and a `max_value` to compute
|
||||
/// compute the minimum number of bits required to encode
|
||||
/// values.
|
||||
pub fn create(
|
||||
write: &'a mut W,
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
let mut serializer = Self::open(write, stats.min_value, stats.max_value)?;
|
||||
|
||||
for val in data_iter {
|
||||
serializer.add_val(val)?;
|
||||
}
|
||||
serializer.close_field()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
/// Pushes a new value to the currently open u64 fast field.
|
||||
#[inline]
|
||||
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
|
||||
@@ -120,7 +97,34 @@ impl<'a, W: Write> BitpackedFastFieldSerializer<'a, W> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, W: 'a + Write> FastFieldSerializerEstimate for BitpackedFastFieldSerializer<'a, W> {
|
||||
pub struct BitpackedFastFieldSerializer {}
|
||||
|
||||
impl FastFieldCodecSerializer for BitpackedFastFieldSerializer {
|
||||
/// Creates a new fast field serializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
/// `(val - min_value)`.
|
||||
///
|
||||
/// It requires a `min_value` and a `max_value` to compute
|
||||
/// compute the minimum number of bits required to encode
|
||||
/// values.
|
||||
fn create(
|
||||
write: &mut impl Write,
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
_data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
let mut serializer =
|
||||
BitpackedFastFieldSerializerLegacy::open(write, stats.min_value, stats.max_value)?;
|
||||
|
||||
for val in data_iter {
|
||||
serializer.add_val(val)?;
|
||||
}
|
||||
serializer.close_field()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
fn estimate(_fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
let amplitude = stats.max_value - stats.min_value;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
@@ -128,7 +132,7 @@ impl<'a, W: 'a + Write> FastFieldSerializerEstimate for BitpackedFastFieldSerial
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
}
|
||||
}
|
||||
impl<'a, W: 'a + Write> CodecId for BitpackedFastFieldSerializer<'_, W> {
|
||||
impl CodecId for BitpackedFastFieldSerializer {
|
||||
const NAME: &'static str = "Bitpacked";
|
||||
const ID: u8 = 1;
|
||||
}
|
||||
@@ -137,26 +141,11 @@ impl<'a, W: 'a + Write> CodecId for BitpackedFastFieldSerializer<'_, W> {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
fn create_and_validate(data: &[u64], name: &str) {
|
||||
let mut out = vec![];
|
||||
BitpackedFastFieldSerializer::create(
|
||||
&mut out,
|
||||
&data,
|
||||
crate::tests::stats_from_vec(&data),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let reader = BitpackedFastFieldReader::open_from_bytes(&out).unwrap();
|
||||
for (doc, orig_val) in data.iter().enumerate() {
|
||||
let val = reader.get_u64(doc as u64, &out);
|
||||
if val != *orig_val {
|
||||
panic!(
|
||||
"val {:?} does not match orig_val {:?}, in data set {}",
|
||||
val, orig_val, name
|
||||
);
|
||||
}
|
||||
}
|
||||
fn create_and_validate(data: &[u64], name: &str) {
|
||||
crate::tests::create_and_validate::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(
|
||||
&data, name,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -2,11 +2,14 @@
|
||||
#[macro_use]
|
||||
extern crate more_asserts;
|
||||
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
pub mod bitpacked;
|
||||
pub mod linearinterpol;
|
||||
pub mod multilinearinterpol;
|
||||
|
||||
pub trait CodecReader: Sized {
|
||||
pub trait FastFieldCodecReader: Sized {
|
||||
/// reads the metadata and returns the CodecReader
|
||||
fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self>;
|
||||
|
||||
@@ -16,6 +19,35 @@ pub trait CodecReader: Sized {
|
||||
fn max_value(&self) -> u64;
|
||||
}
|
||||
|
||||
/// The FastFieldSerializerEstimate trait is required on all variants
|
||||
/// of fast field compressions, to decide which one to choose.
|
||||
pub trait FastFieldCodecSerializer {
|
||||
/// returns an estimate of the compression ratio. if the compressor is unable to handle the
|
||||
/// data it needs to return f32::MAX.
|
||||
/// The baseline is uncompressed 64bit data.
|
||||
///
|
||||
/// It could make sense to also return a value representing
|
||||
/// computational complexity.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32;
|
||||
|
||||
fn create(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()>;
|
||||
}
|
||||
|
||||
/// `CodecId` is required by each Codec.
|
||||
///
|
||||
/// It needs to provide a unique name and id, which is
|
||||
/// used for debugging and de/serialization.
|
||||
pub trait CodecId {
|
||||
const NAME: &'static str;
|
||||
const ID: u8;
|
||||
}
|
||||
|
||||
/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
|
||||
pub trait FastFieldDataAccess: Clone {
|
||||
/// Return the value associated to the given document.
|
||||
@@ -28,27 +60,6 @@ pub trait FastFieldDataAccess: Clone {
|
||||
fn get(&self, doc: u32) -> u64;
|
||||
}
|
||||
|
||||
/// The FastFieldSerializerEstimate trait is required on all variants
|
||||
/// of fast field compressions, to decide which one to choose.
|
||||
pub trait FastFieldSerializerEstimate {
|
||||
/// returns an estimate of the compression ratio. if the compressor is unable to handle the
|
||||
/// data it needs to return f32::MAX.
|
||||
/// The baseline is uncompressed 64bit data.
|
||||
///
|
||||
/// It could make sense to also return a value representing
|
||||
/// computational complexity.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32;
|
||||
}
|
||||
|
||||
/// `CodecId` is required by each Codec.
|
||||
///
|
||||
/// It needs to provide a unique name and id, which is
|
||||
/// used for debugging and de/serialization.
|
||||
pub trait CodecId {
|
||||
const NAME: &'static str;
|
||||
const ID: u8;
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FastFieldStats {
|
||||
pub min_value: u64,
|
||||
@@ -72,8 +83,37 @@ impl FastFieldDataAccess for Vec<u64> {
|
||||
mod tests {
|
||||
use crate::{
|
||||
bitpacked::BitpackedFastFieldSerializer, linearinterpol::LinearInterpolFastFieldSerializer,
|
||||
multilinearinterpol::MultiLinearInterpolFastFieldSerializer,
|
||||
};
|
||||
|
||||
pub fn create_and_validate<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
|
||||
data: &[u64],
|
||||
name: &str,
|
||||
) {
|
||||
if S::estimate(&data, crate::tests::stats_from_vec(&data)) == f32::MAX {
|
||||
return;
|
||||
}
|
||||
let mut out = vec![];
|
||||
S::create(
|
||||
&mut out,
|
||||
&data,
|
||||
crate::tests::stats_from_vec(&data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let reader = R::open_from_bytes(&out).unwrap();
|
||||
for (doc, orig_val) in data.iter().enumerate() {
|
||||
let val = reader.get_u64(doc as u64, &out);
|
||||
if val != *orig_val {
|
||||
panic!(
|
||||
"val {:?} does not match orig_val {:?}, in data set {}, data {:?}",
|
||||
val, orig_val, name, data
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
let mut data_and_names = vec![];
|
||||
|
||||
@@ -103,14 +143,19 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn estimation_good_interpolation_case() {
|
||||
let data = (10..=200_u64).collect::<Vec<_>>();
|
||||
let data = (10..=20000_u64).collect::<Vec<_>>();
|
||||
|
||||
let linear_interpol_estimation =
|
||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(linear_interpol_estimation, 0.1);
|
||||
assert_le!(linear_interpol_estimation, 0.01);
|
||||
|
||||
let multi_linear_interpol_estimation =
|
||||
MultiLinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(multi_linear_interpol_estimation, 0.2);
|
||||
assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::<Vec<u8>>::estimate(&data, stats_from_vec(&data));
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(linear_interpol_estimation, bitpacked_estimation);
|
||||
}
|
||||
#[test]
|
||||
@@ -122,7 +167,7 @@ mod tests {
|
||||
assert_le!(linear_interpol_estimation, 0.32);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::<Vec<u8>>::estimate(&data, stats_from_vec(&data));
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(bitpacked_estimation, linear_interpol_estimation);
|
||||
}
|
||||
#[test]
|
||||
@@ -137,7 +182,7 @@ mod tests {
|
||||
assert_le!(linear_interpol_estimation, 0.35);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::<Vec<u8>>::estimate(&data, stats_from_vec(&data));
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(bitpacked_estimation, 0.32);
|
||||
assert_le!(bitpacked_estimation, linear_interpol_estimation);
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use crate::CodecId;
|
||||
use crate::CodecReader;
|
||||
use crate::FastFieldCodecReader;
|
||||
use crate::FastFieldCodecSerializer;
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldSerializerEstimate;
|
||||
use crate::FastFieldStats;
|
||||
use std::io::{self, Read, Write};
|
||||
use std::ops::Sub;
|
||||
@@ -15,7 +15,7 @@ use tantivy_bitpacker::BitUnpacker;
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct LinearinterpolFastFieldReader {
|
||||
pub struct LinearInterpolFastFieldReader {
|
||||
bit_unpacker: BitUnpacker,
|
||||
pub footer: LinearInterpolFooter,
|
||||
pub slope: f32,
|
||||
@@ -61,7 +61,7 @@ impl FixedSize for LinearInterpolFooter {
|
||||
const SIZE_IN_BYTES: usize = 56;
|
||||
}
|
||||
|
||||
impl CodecReader for LinearinterpolFastFieldReader {
|
||||
impl FastFieldCodecReader for LinearInterpolFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let (_data, mut footer) = bytes.split_at(bytes.len() - LinearInterpolFooter::SIZE_IN_BYTES);
|
||||
@@ -70,7 +70,7 @@ impl CodecReader for LinearinterpolFastFieldReader {
|
||||
|
||||
let num_bits = compute_num_bits(footer.relative_max_value);
|
||||
let bit_unpacker = BitUnpacker::new(num_bits);
|
||||
Ok(LinearinterpolFastFieldReader {
|
||||
Ok(LinearInterpolFastFieldReader {
|
||||
bit_unpacker,
|
||||
footer,
|
||||
slope,
|
||||
@@ -96,9 +96,24 @@ impl CodecReader for LinearinterpolFastFieldReader {
|
||||
/// and stores the difference bitpacked.
|
||||
pub struct LinearInterpolFastFieldSerializer {}
|
||||
|
||||
impl LinearInterpolFastFieldSerializer {
|
||||
#[inline]
|
||||
fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
|
||||
if num_vals <= 1 {
|
||||
return 0.0;
|
||||
}
|
||||
// We calculate the slope with f64 high precision and use the result in lower precision f32
|
||||
// This is done in order to handle estimations for very large values like i64::MAX
|
||||
((last_val as f64 - first_val as f64) / (num_vals as u64 - 1) as f64) as f32
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
||||
first_val + (pos as f32 * slope) as u64
|
||||
}
|
||||
|
||||
impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
/// Creates a new fast field serializer.
|
||||
pub fn create(
|
||||
fn create(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
@@ -150,24 +165,6 @@ impl LinearInterpolFastFieldSerializer {
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
|
||||
if num_vals <= 1 {
|
||||
return 0.0;
|
||||
}
|
||||
// We calculate the slope with f64 high precision and use the result in lower precision f32
|
||||
// This is done in order to handle estimations for very large values like i64::MAX
|
||||
((last_val as f64 - first_val as f64) / (num_vals as u64 - 1) as f64) as f32
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
||||
first_val + (pos as f32 * slope) as u64
|
||||
}
|
||||
|
||||
impl FastFieldSerializerEstimate for LinearInterpolFastFieldSerializer {
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima for the deviation of the calculated value are and
|
||||
/// the offset to shift all values to >=0 is also unknown.
|
||||
@@ -241,33 +238,11 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
|
||||
fn create_and_validate(data: &[u64], name: &str) -> (u64, u64) {
|
||||
if LinearInterpolFastFieldSerializer::estimate(&data, crate::tests::stats_from_vec(&data))
|
||||
== f32::MAX
|
||||
{
|
||||
return (0, 0);
|
||||
}
|
||||
let mut out = vec![];
|
||||
LinearInterpolFastFieldSerializer::create(
|
||||
&mut out,
|
||||
&data,
|
||||
crate::tests::stats_from_vec(&data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let reader = LinearinterpolFastFieldReader::open_from_bytes(&out).unwrap();
|
||||
for (doc, orig_val) in data.iter().enumerate() {
|
||||
let val = reader.get_u64(doc as u64, &out);
|
||||
if val != *orig_val {
|
||||
panic!(
|
||||
"val {:?} does not match orig_val {:?}, in data set {}",
|
||||
val, orig_val, name
|
||||
);
|
||||
}
|
||||
}
|
||||
(reader.footer.relative_max_value, reader.footer.offset)
|
||||
fn create_and_validate(data: &[u64], name: &str) {
|
||||
crate::tests::create_and_validate::<
|
||||
LinearInterpolFastFieldSerializer,
|
||||
LinearInterpolFastFieldReader,
|
||||
>(&data, name);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -303,10 +278,7 @@ mod tests {
|
||||
fn linear_interpol_fast_field_test_simple() {
|
||||
let data = (10..=20_u64).collect::<Vec<_>>();
|
||||
|
||||
let (rel_max_value, offset) = create_and_validate(&data, "simple monotonically");
|
||||
|
||||
assert_eq!(offset, 0);
|
||||
assert_eq!(rel_max_value, 0);
|
||||
create_and_validate(&data, "simple monotonically");
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use crate::CodecId;
|
||||
use crate::CodecReader;
|
||||
use crate::FastFieldCodecReader;
|
||||
use crate::FastFieldCodecSerializer;
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldSerializerEstimate;
|
||||
use crate::FastFieldStats;
|
||||
use std::io::{self, Read, Write};
|
||||
use std::ops::Sub;
|
||||
@@ -164,7 +164,7 @@ fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Functio
|
||||
&interpolations[get_interpolation_position(doc)]
|
||||
}
|
||||
|
||||
impl CodecReader for MultiLinearinterpolFastFieldReader {
|
||||
impl FastFieldCodecReader for MultiLinearinterpolFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
|
||||
@@ -197,12 +197,22 @@ impl CodecReader for MultiLinearinterpolFastFieldReader {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
|
||||
((last_val as f64 - first_val as f64) / (num_vals as u64 - 1) as f64) as f32
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
||||
(first_val as i64 + (pos as f32 * slope) as i64) as u64
|
||||
}
|
||||
|
||||
/// Same as LinearInterpolFastFieldSerializer, but working on chunks of CHUNK_SIZE elements.
|
||||
pub struct MultiLinearInterpolFastFieldSerializer {}
|
||||
|
||||
impl MultiLinearInterpolFastFieldSerializer {
|
||||
impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
/// Creates a new fast field serializer.
|
||||
pub fn create(
|
||||
fn create(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
@@ -298,17 +308,7 @@ impl MultiLinearInterpolFastFieldSerializer {
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
#[inline]
|
||||
fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
|
||||
((last_val as f64 - first_val as f64) / (num_vals as u64 - 1) as f64) as f32
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
||||
(first_val as i64 + (pos as f32 * slope) as i64) as u64
|
||||
}
|
||||
impl FastFieldSerializerEstimate for MultiLinearInterpolFastFieldSerializer {
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima are for the deviation of the calculated value and
|
||||
/// the offset is also unknown.
|
||||
@@ -389,33 +389,10 @@ mod tests {
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
|
||||
fn create_and_validate(data: &[u64], name: &str) {
|
||||
if MultiLinearInterpolFastFieldSerializer::estimate(
|
||||
&data,
|
||||
crate::tests::stats_from_vec(&data),
|
||||
) == f32::MAX
|
||||
{
|
||||
return;
|
||||
}
|
||||
let mut out = vec![];
|
||||
MultiLinearInterpolFastFieldSerializer::create(
|
||||
&mut out,
|
||||
&data,
|
||||
crate::tests::stats_from_vec(&data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let reader = MultiLinearinterpolFastFieldReader::open_from_bytes(&out).unwrap();
|
||||
for (doc, orig_val) in data.iter().enumerate() {
|
||||
let val = reader.get_u64(doc as u64, &out);
|
||||
if val != *orig_val {
|
||||
panic!(
|
||||
"val {:?} does not match orig_val {:?}, in data set {}, data {:?}",
|
||||
val, orig_val, name, data
|
||||
);
|
||||
}
|
||||
}
|
||||
crate::tests::create_and_validate::<
|
||||
MultiLinearInterpolFastFieldSerializer,
|
||||
MultiLinearinterpolFastFieldReader,
|
||||
>(&data, name);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::fastfield::serializer::BitpackedFastFieldSerializer;
|
||||
use crate::fastfield::serializer::BitpackedFastFieldSerializerLegacy;
|
||||
use crate::fastfield::CompositeFastFieldSerializer;
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::schema::{Document, Field};
|
||||
@@ -154,7 +154,7 @@ impl MultiValuedFastFieldWriter {
|
||||
}
|
||||
{
|
||||
// writing the values themselves.
|
||||
let mut value_serializer: BitpackedFastFieldSerializer<'_, _>;
|
||||
let mut value_serializer: BitpackedFastFieldSerializerLegacy<'_, _>;
|
||||
match mapping_opt {
|
||||
Some(mapping) => {
|
||||
value_serializer = serializer.new_u64_fast_field_with_idx(
|
||||
|
||||
@@ -10,12 +10,12 @@ use crate::schema::FAST;
|
||||
use crate::DocId;
|
||||
use fastfield_codecs::bitpacked::BitpackedFastFieldReader as BitpackedReader;
|
||||
use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldReader;
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::linearinterpol::LinearinterpolFastFieldReader;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearinterpolFastFieldReader;
|
||||
use fastfield_codecs::CodecId;
|
||||
use fastfield_codecs::CodecReader;
|
||||
use fastfield_codecs::FastFieldCodecReader;
|
||||
use std::collections::HashMap;
|
||||
use std::marker::PhantomData;
|
||||
use std::path::Path;
|
||||
@@ -69,7 +69,7 @@ pub enum DynamicFastFieldReader<Item: FastValue> {
|
||||
/// Bitpacked compressed fastfield data.
|
||||
Bitpacked(FastFieldReaderCodecWrapper<Item, BitpackedReader>),
|
||||
/// Linear interpolated values + bitpacked
|
||||
LinearInterpol(FastFieldReaderCodecWrapper<Item, LinearinterpolFastFieldReader>),
|
||||
LinearInterpol(FastFieldReaderCodecWrapper<Item, LinearInterpolFastFieldReader>),
|
||||
/// Blockwise linear interpolated values + bitpacked
|
||||
MultiLinearInterpol(FastFieldReaderCodecWrapper<Item, MultiLinearinterpolFastFieldReader>),
|
||||
}
|
||||
@@ -81,7 +81,7 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
|
||||
let id = bytes.read_u8();
|
||||
|
||||
let reader = match id {
|
||||
BitpackedFastFieldSerializer::<Vec<u8>>::ID => {
|
||||
BitpackedFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::Bitpacked(FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
BitpackedReader,
|
||||
@@ -90,7 +90,7 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
|
||||
LinearInterpolFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::LinearInterpol(FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
LinearinterpolFastFieldReader,
|
||||
LinearInterpolFastFieldReader,
|
||||
>::open_from_bytes(bytes)?)
|
||||
}
|
||||
MultiLinearInterpolFastFieldSerializer::ID => {
|
||||
@@ -154,12 +154,12 @@ pub struct FastFieldReaderCodecWrapper<Item: FastValue, CodecReader> {
|
||||
_phantom: PhantomData<Item>,
|
||||
}
|
||||
|
||||
impl<Item: FastValue, C: CodecReader> FastFieldReaderCodecWrapper<Item, C> {
|
||||
impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item, C> {
|
||||
/// Opens a fast field given a file.
|
||||
pub fn open(file: FileSlice) -> crate::Result<Self> {
|
||||
let mut bytes = file.read_bytes()?;
|
||||
let id = u8::deserialize(&mut bytes)?;
|
||||
assert_eq!(BitpackedFastFieldSerializer::<Vec<u8>>::ID, id);
|
||||
assert_eq!(BitpackedFastFieldSerializer::ID, id);
|
||||
Self::open_from_bytes(bytes)
|
||||
}
|
||||
/// Opens a fast field given the bytes.
|
||||
@@ -194,7 +194,7 @@ impl<Item: FastValue, C: CodecReader> FastFieldReaderCodecWrapper<Item, C> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<Item: FastValue, C: CodecReader + Clone> FastFieldReader<Item>
|
||||
impl<Item: FastValue, C: FastFieldCodecReader + Clone> FastFieldReader<Item>
|
||||
for FastFieldReaderCodecWrapper<Item, C>
|
||||
{
|
||||
/// Return the value associated to the given document.
|
||||
|
||||
@@ -6,10 +6,11 @@ use crate::schema::Field;
|
||||
use fastfield_codecs::CodecId;
|
||||
//pub use bitpacked::BitpackedFastFieldSerializer;
|
||||
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
|
||||
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializerLegacy;
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
|
||||
pub use fastfield_codecs::FastFieldCodecSerializer;
|
||||
pub use fastfield_codecs::FastFieldDataAccess;
|
||||
pub use fastfield_codecs::FastFieldSerializerEstimate;
|
||||
pub use fastfield_codecs::FastFieldStats;
|
||||
use std::io::{self, Write};
|
||||
|
||||
@@ -60,12 +61,9 @@ impl CompositeFastFieldSerializer {
|
||||
|
||||
{
|
||||
let (ratio, name, id) = (
|
||||
BitpackedFastFieldSerializer::<Vec<u8>>::estimate(
|
||||
&fastfield_accessor,
|
||||
stats.clone(),
|
||||
),
|
||||
BitpackedFastFieldSerializer::<Vec<u8>>::NAME,
|
||||
BitpackedFastFieldSerializer::<Vec<u8>>::ID,
|
||||
BitpackedFastFieldSerializer::estimate(&fastfield_accessor, stats.clone()),
|
||||
BitpackedFastFieldSerializer::NAME,
|
||||
BitpackedFastFieldSerializer::ID,
|
||||
);
|
||||
estimations.push((ratio, name, id));
|
||||
}
|
||||
@@ -107,12 +105,13 @@ impl CompositeFastFieldSerializer {
|
||||
); // todo print actual field name
|
||||
id.serialize(field_write)?;
|
||||
match name {
|
||||
BitpackedFastFieldSerializer::<Vec<u8>>::NAME => {
|
||||
BitpackedFastFieldSerializer::NAME => {
|
||||
BitpackedFastFieldSerializer::create(
|
||||
field_write,
|
||||
&fastfield_accessor,
|
||||
stats,
|
||||
data_iter_1,
|
||||
data_iter_2,
|
||||
)?;
|
||||
}
|
||||
LinearInterpolFastFieldSerializer::NAME => {
|
||||
@@ -147,7 +146,7 @@ impl CompositeFastFieldSerializer {
|
||||
field: Field,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<BitpackedFastFieldSerializer<'_, CountingWriter<WritePtr>>> {
|
||||
) -> io::Result<BitpackedFastFieldSerializerLegacy<'_, CountingWriter<WritePtr>>> {
|
||||
self.new_u64_fast_field_with_idx(field, min_value, max_value, 0)
|
||||
}
|
||||
|
||||
@@ -158,12 +157,12 @@ impl CompositeFastFieldSerializer {
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
idx: usize,
|
||||
) -> io::Result<BitpackedFastFieldSerializer<'_, CountingWriter<WritePtr>>> {
|
||||
) -> io::Result<BitpackedFastFieldSerializerLegacy<'_, CountingWriter<WritePtr>>> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
// Prepend codec id to field data for compatibility with DynamicFastFieldReader.
|
||||
let id = BitpackedFastFieldSerializer::<Vec<u8>>::ID;
|
||||
let id = BitpackedFastFieldSerializer::ID;
|
||||
id.serialize(field_write)?;
|
||||
BitpackedFastFieldSerializer::open(field_write, min_value, max_value)
|
||||
BitpackedFastFieldSerializerLegacy::open(field_write, min_value, max_value)
|
||||
}
|
||||
|
||||
/// Start serializing a new [u8] fast field
|
||||
|
||||
Reference in New Issue
Block a user