mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-09 02:22:54 +00:00
Merge pull request #1475 from quickwit-oss/extend_ff_access
move fastfield stats to trait
This commit is contained in:
@@ -45,7 +45,7 @@ mod tests {
|
||||
fn bench_create<S: FastFieldCodecSerializer>(b: &mut Bencher, data: &[u64]) {
|
||||
let mut bytes = vec![];
|
||||
b.iter(|| {
|
||||
S::serialize(&mut bytes, &data, stats_from_vec(data)).unwrap();
|
||||
S::serialize(&mut bytes, &data).unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ use common::BinarySerializable;
|
||||
use ownedbytes::OwnedBytes;
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
|
||||
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess};
|
||||
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
@@ -112,10 +112,12 @@ impl FastFieldCodecSerializer for BitpackedFastFieldSerializer {
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &dyn FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> io::Result<()> {
|
||||
let mut serializer =
|
||||
BitpackedFastFieldSerializerLegacy::open(write, stats.min_value, stats.max_value)?;
|
||||
let mut serializer = BitpackedFastFieldSerializerLegacy::open(
|
||||
write,
|
||||
fastfield_accessor.min_value(),
|
||||
fastfield_accessor.max_value(),
|
||||
)?;
|
||||
|
||||
for val in fastfield_accessor.iter() {
|
||||
serializer.add_val(val)?;
|
||||
@@ -124,14 +126,11 @@ impl FastFieldCodecSerializer for BitpackedFastFieldSerializer {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
_stats: FastFieldStats,
|
||||
) -> bool {
|
||||
fn is_applicable(_fastfield_accessor: &impl FastFieldDataAccess) -> bool {
|
||||
true
|
||||
}
|
||||
fn estimate(_fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
let amplitude = stats.max_value - stats.min_value;
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {
|
||||
let amplitude = fastfield_accessor.max_value() - fastfield_accessor.min_value();
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let num_bits_uncompressed = 64;
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
|
||||
@@ -28,14 +28,14 @@ pub trait FastFieldCodecSerializer {
|
||||
const ID: u8;
|
||||
|
||||
/// Check if the Codec is able to compress the data
|
||||
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> bool;
|
||||
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool;
|
||||
|
||||
/// Returns an estimate of the compression ratio.
|
||||
/// The baseline is uncompressed 64bit data.
|
||||
///
|
||||
/// It could make sense to also return a value representing
|
||||
/// computational complexity.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32;
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32;
|
||||
|
||||
/// Serializes the data using the serializer into write.
|
||||
///
|
||||
@@ -44,7 +44,6 @@ pub trait FastFieldCodecSerializer {
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &dyn FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> io::Result<()>;
|
||||
}
|
||||
|
||||
@@ -62,6 +61,15 @@ pub trait FastFieldDataAccess {
|
||||
|
||||
/// Returns a iterator over the data
|
||||
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = u64> + 'a>;
|
||||
|
||||
/// min value of the data
|
||||
fn min_value(&self) -> u64;
|
||||
|
||||
/// max value of the data
|
||||
fn max_value(&self) -> u64;
|
||||
|
||||
/// num vals
|
||||
fn num_vals(&self) -> u64;
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -80,6 +88,18 @@ impl<'a> FastFieldDataAccess for &'a [u64] {
|
||||
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
|
||||
Box::new((self as &[u64]).iter().cloned())
|
||||
}
|
||||
|
||||
fn min_value(&self) -> u64 {
|
||||
self.iter().min().unwrap_or(0)
|
||||
}
|
||||
|
||||
fn max_value(&self) -> u64 {
|
||||
self.iter().max().unwrap_or(0)
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u64 {
|
||||
self.len() as u64
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldDataAccess for Vec<u64> {
|
||||
@@ -89,6 +109,17 @@ impl FastFieldDataAccess for Vec<u64> {
|
||||
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
|
||||
Box::new((self as &[u64]).iter().cloned())
|
||||
}
|
||||
fn min_value(&self) -> u64 {
|
||||
self.iter().min().unwrap_or(0)
|
||||
}
|
||||
|
||||
fn max_value(&self) -> u64 {
|
||||
self.iter().max().unwrap_or(0)
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u64 {
|
||||
self.len() as u64
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -103,12 +134,12 @@ mod tests {
|
||||
data: &[u64],
|
||||
name: &str,
|
||||
) -> (f32, f32) {
|
||||
if !S::is_applicable(&data, crate::tests::stats_from_vec(data)) {
|
||||
if !S::is_applicable(&data) {
|
||||
return (f32::MAX, 0.0);
|
||||
}
|
||||
let estimation = S::estimate(&data, crate::tests::stats_from_vec(data));
|
||||
let estimation = S::estimate(&data);
|
||||
let mut out: Vec<u8> = Vec::new();
|
||||
S::serialize(&mut out, &data, crate::tests::stats_from_vec(data)).unwrap();
|
||||
S::serialize(&mut out, &data).unwrap();
|
||||
|
||||
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
|
||||
|
||||
@@ -184,29 +215,25 @@ mod tests {
|
||||
fn estimation_good_interpolation_case() {
|
||||
let data = (10..=20000_u64).collect::<Vec<_>>();
|
||||
|
||||
let linear_interpol_estimation =
|
||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
let linear_interpol_estimation = LinearInterpolFastFieldSerializer::estimate(&data);
|
||||
assert_le!(linear_interpol_estimation, 0.01);
|
||||
|
||||
let multi_linear_interpol_estimation =
|
||||
MultiLinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
MultiLinearInterpolFastFieldSerializer::estimate(&data);
|
||||
assert_le!(multi_linear_interpol_estimation, 0.2);
|
||||
assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
let bitpacked_estimation = BitpackedFastFieldSerializer::estimate(&data);
|
||||
assert_le!(linear_interpol_estimation, bitpacked_estimation);
|
||||
}
|
||||
#[test]
|
||||
fn estimation_test_bad_interpolation_case() {
|
||||
let data = vec![200, 10, 10, 10, 10, 1000, 20];
|
||||
|
||||
let linear_interpol_estimation =
|
||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
let linear_interpol_estimation = LinearInterpolFastFieldSerializer::estimate(&data);
|
||||
assert_le!(linear_interpol_estimation, 0.32);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
let bitpacked_estimation = BitpackedFastFieldSerializer::estimate(&data);
|
||||
assert_le!(bitpacked_estimation, linear_interpol_estimation);
|
||||
}
|
||||
#[test]
|
||||
@@ -216,12 +243,10 @@ mod tests {
|
||||
|
||||
// in this case the linear interpolation can't in fact not be worse than bitpacking,
|
||||
// but the estimator adds some threshold, which leads to estimated worse behavior
|
||||
let linear_interpol_estimation =
|
||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
let linear_interpol_estimation = LinearInterpolFastFieldSerializer::estimate(&data);
|
||||
assert_le!(linear_interpol_estimation, 0.35);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
let bitpacked_estimation = BitpackedFastFieldSerializer::estimate(&data);
|
||||
assert_le!(bitpacked_estimation, 0.32);
|
||||
assert_le!(bitpacked_estimation, linear_interpol_estimation);
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ use common::{BinarySerializable, FixedSize};
|
||||
use ownedbytes::OwnedBytes;
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
|
||||
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess};
|
||||
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
@@ -139,13 +139,12 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &dyn FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> io::Result<()> {
|
||||
assert!(stats.min_value <= stats.max_value);
|
||||
assert!(fastfield_accessor.min_value() <= fastfield_accessor.max_value());
|
||||
|
||||
let first_val = fastfield_accessor.get_val(0);
|
||||
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
|
||||
let slope = get_slope(first_val, last_val, stats.num_vals);
|
||||
let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1);
|
||||
let slope = get_slope(first_val, last_val, fastfield_accessor.num_vals());
|
||||
// calculate offset to ensure all values are positive
|
||||
let mut offset = 0;
|
||||
let mut rel_positive_max = 0;
|
||||
@@ -179,27 +178,25 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
offset,
|
||||
first_val,
|
||||
last_val,
|
||||
num_vals: stats.num_vals,
|
||||
min_value: stats.min_value,
|
||||
max_value: stats.max_value,
|
||||
num_vals: fastfield_accessor.num_vals(),
|
||||
min_value: fastfield_accessor.min_value(),
|
||||
max_value: fastfield_accessor.max_value(),
|
||||
};
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> bool {
|
||||
if stats.num_vals < 3 {
|
||||
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool {
|
||||
if fastfield_accessor.num_vals() < 3 {
|
||||
return false; // disable compressor for this case
|
||||
}
|
||||
// On serialisation the offset is added to the actual value.
|
||||
// We need to make sure this won't run into overflow calculation issues.
|
||||
// For this we take the maximum theroretical offset and add this to the max value.
|
||||
// If this doesn't overflow the algorithm should be fine
|
||||
let theorethical_maximum_offset = stats.max_value - stats.min_value;
|
||||
if stats
|
||||
.max_value
|
||||
let theorethical_maximum_offset =
|
||||
fastfield_accessor.max_value() - fastfield_accessor.min_value();
|
||||
if fastfield_accessor
|
||||
.max_value()
|
||||
.checked_add(theorethical_maximum_offset)
|
||||
.is_none()
|
||||
{
|
||||
@@ -210,13 +207,13 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima for the deviation of the calculated value are and
|
||||
/// the offset to shift all values to >=0 is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {
|
||||
let first_val = fastfield_accessor.get_val(0);
|
||||
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
|
||||
let slope = get_slope(first_val, last_val, stats.num_vals);
|
||||
let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1);
|
||||
let slope = get_slope(first_val, last_val, fastfield_accessor.num_vals());
|
||||
|
||||
// let's sample at 0%, 5%, 10% .. 95%, 100%
|
||||
let num_vals = stats.num_vals as f32 / 100.0;
|
||||
let num_vals = fastfield_accessor.num_vals() as f32 / 100.0;
|
||||
let sample_positions = (0..20)
|
||||
.map(|pos| (num_vals * pos as f32 * 5.0) as usize)
|
||||
.collect::<Vec<_>>();
|
||||
@@ -238,9 +235,10 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
//
|
||||
let relative_max_value = (max_distance as f32 * 1.5) * 2.0;
|
||||
|
||||
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
|
||||
let num_bits = compute_num_bits(relative_max_value as u64) as u64
|
||||
* fastfield_accessor.num_vals()
|
||||
+ LinearInterpolFooter::SIZE_IN_BYTES as u64;
|
||||
let num_bits_uncompressed = 64 * stats.num_vals;
|
||||
let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
}
|
||||
}
|
||||
|
||||
@@ -94,13 +94,13 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
|
||||
data: &[u64],
|
||||
) -> (bool, f32, f32, &'static str) {
|
||||
let is_applicable = S::is_applicable(&data, stats_from_vec(data));
|
||||
let is_applicable = S::is_applicable(&data);
|
||||
if !is_applicable {
|
||||
return (false, 0.0, 0.0, S::NAME);
|
||||
}
|
||||
let estimation = S::estimate(&data, stats_from_vec(data));
|
||||
let estimation = S::estimate(&data);
|
||||
let mut out = vec![];
|
||||
S::serialize(&mut out, &data, stats_from_vec(data)).unwrap();
|
||||
S::serialize(&mut out, &data).unwrap();
|
||||
|
||||
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
|
||||
(true, estimation, actual_compression, S::NAME)
|
||||
|
||||
@@ -18,7 +18,7 @@ use ownedbytes::OwnedBytes;
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
|
||||
use crate::linearinterpol::{get_calculated_value, get_slope};
|
||||
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess};
|
||||
|
||||
const CHUNK_SIZE: u64 = 512;
|
||||
|
||||
@@ -188,15 +188,14 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &dyn FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> io::Result<()> {
|
||||
assert!(stats.min_value <= stats.max_value);
|
||||
assert!(fastfield_accessor.min_value() <= fastfield_accessor.max_value());
|
||||
|
||||
let first_val = fastfield_accessor.get_val(0);
|
||||
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
|
||||
let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1);
|
||||
|
||||
let mut first_function = Function {
|
||||
end_pos: stats.num_vals,
|
||||
end_pos: fastfield_accessor.num_vals(),
|
||||
value_start_pos: first_val,
|
||||
value_end_pos: last_val,
|
||||
..Default::default()
|
||||
@@ -271,29 +270,27 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
bit_packer.close(write)?;
|
||||
|
||||
let footer = MultiLinearInterpolFooter {
|
||||
num_vals: stats.num_vals,
|
||||
min_value: stats.min_value,
|
||||
max_value: stats.max_value,
|
||||
num_vals: fastfield_accessor.num_vals(),
|
||||
min_value: fastfield_accessor.min_value(),
|
||||
max_value: fastfield_accessor.max_value(),
|
||||
interpolations,
|
||||
};
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> bool {
|
||||
if stats.num_vals < 5_000 {
|
||||
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool {
|
||||
if fastfield_accessor.num_vals() < 5_000 {
|
||||
return false;
|
||||
}
|
||||
// On serialization the offset is added to the actual value.
|
||||
// We need to make sure this won't run into overflow calculation issues.
|
||||
// For this we take the maximum theroretical offset and add this to the max value.
|
||||
// If this doesn't overflow the algorithm should be fine
|
||||
let theorethical_maximum_offset = stats.max_value - stats.min_value;
|
||||
if stats
|
||||
.max_value
|
||||
let theorethical_maximum_offset =
|
||||
fastfield_accessor.max_value() - fastfield_accessor.min_value();
|
||||
if fastfield_accessor
|
||||
.max_value()
|
||||
.checked_add(theorethical_maximum_offset)
|
||||
.is_none()
|
||||
{
|
||||
@@ -304,15 +301,15 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima are for the deviation of the calculated value and
|
||||
/// the offset is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32 {
|
||||
let first_val_in_first_block = fastfield_accessor.get_val(0);
|
||||
let last_elem_in_first_chunk = CHUNK_SIZE.min(stats.num_vals);
|
||||
let last_elem_in_first_chunk = CHUNK_SIZE.min(fastfield_accessor.num_vals());
|
||||
let last_val_in_first_block =
|
||||
fastfield_accessor.get_val(last_elem_in_first_chunk as u64 - 1);
|
||||
let slope = get_slope(
|
||||
first_val_in_first_block,
|
||||
last_val_in_first_block,
|
||||
stats.num_vals,
|
||||
fastfield_accessor.num_vals(),
|
||||
);
|
||||
|
||||
// let's sample at 0%, 5%, 10% .. 95%, 100%, but for the first block only
|
||||
@@ -339,10 +336,10 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
//
|
||||
let relative_max_value = (max_distance as f32 * 1.5) * 2.0;
|
||||
|
||||
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
|
||||
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * fastfield_accessor.num_vals() as u64
|
||||
// function metadata per block
|
||||
+ 29 * (stats.num_vals / CHUNK_SIZE);
|
||||
let num_bits_uncompressed = 64 * stats.num_vals;
|
||||
+ 29 * (fastfield_accessor.num_vals() / CHUNK_SIZE);
|
||||
let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
}
|
||||
}
|
||||
|
||||
@@ -66,14 +66,13 @@ impl From<FastFieldCodecName> for FastFieldCodecEnableCheck {
|
||||
// use this, when this is merged and stabilized explicit_generic_args_with_impl_trait
|
||||
// https://github.com/rust-lang/rust/pull/86176
|
||||
fn codec_estimation<T: FastFieldCodecSerializer, A: FastFieldDataAccess>(
|
||||
stats: FastFieldStats,
|
||||
fastfield_accessor: &A,
|
||||
estimations: &mut Vec<(f32, &str, u8)>,
|
||||
) {
|
||||
if !T::is_applicable(fastfield_accessor, stats.clone()) {
|
||||
if !T::is_applicable(fastfield_accessor) {
|
||||
return;
|
||||
}
|
||||
let (ratio, name, id) = (T::estimate(fastfield_accessor, stats), T::NAME, T::ID);
|
||||
let (ratio, name, id) = (T::estimate(fastfield_accessor), T::NAME, T::ID);
|
||||
estimations.push((ratio, name, id));
|
||||
}
|
||||
|
||||
@@ -101,10 +100,9 @@ impl CompositeFastFieldSerializer {
|
||||
pub fn create_auto_detect_u64_fast_field(
|
||||
&mut self,
|
||||
field: Field,
|
||||
stats: FastFieldStats,
|
||||
fastfield_accessor: impl FastFieldDataAccess,
|
||||
) -> io::Result<()> {
|
||||
self.create_auto_detect_u64_fast_field_with_idx(field, stats, fastfield_accessor, 0)
|
||||
self.create_auto_detect_u64_fast_field_with_idx(field, fastfield_accessor, 0)
|
||||
}
|
||||
|
||||
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
|
||||
@@ -120,12 +118,12 @@ impl CompositeFastFieldSerializer {
|
||||
pub fn create_auto_detect_u64_fast_field_with_idx(
|
||||
&mut self,
|
||||
field: Field,
|
||||
stats: FastFieldStats,
|
||||
fastfield_accessor: impl FastFieldDataAccess,
|
||||
idx: usize,
|
||||
) -> io::Result<()> {
|
||||
let min_value = fastfield_accessor.min_value();
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
let gcd = find_gcd(fastfield_accessor.iter().map(|val| val - stats.min_value))
|
||||
let gcd = find_gcd(fastfield_accessor.iter().map(|val| val - min_value))
|
||||
.map(NonZeroU64::get)
|
||||
.unwrap_or(GCD_DEFAULT);
|
||||
|
||||
@@ -134,7 +132,6 @@ impl CompositeFastFieldSerializer {
|
||||
self.codec_enable_checker.clone(),
|
||||
field,
|
||||
field_write,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
);
|
||||
}
|
||||
@@ -142,42 +139,54 @@ impl CompositeFastFieldSerializer {
|
||||
Self::write_header(field_write, GCD_CODEC_ID)?;
|
||||
struct GCDWrappedFFAccess<T: FastFieldDataAccess> {
|
||||
fastfield_accessor: T,
|
||||
min_value: u64,
|
||||
base_value: u64,
|
||||
max_value: u64,
|
||||
num_vals: u64,
|
||||
gcd: u64,
|
||||
}
|
||||
impl<T: FastFieldDataAccess> FastFieldDataAccess for GCDWrappedFFAccess<T> {
|
||||
fn get_val(&self, position: u64) -> u64 {
|
||||
(self.fastfield_accessor.get_val(position) - self.min_value) / self.gcd
|
||||
(self.fastfield_accessor.get_val(position) - self.base_value) / self.gcd
|
||||
}
|
||||
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
|
||||
Box::new(
|
||||
self.fastfield_accessor
|
||||
.iter()
|
||||
.map(|val| (val - self.min_value) / self.gcd),
|
||||
.map(|val| (val - self.base_value) / self.gcd),
|
||||
)
|
||||
}
|
||||
fn min_value(&self) -> u64 {
|
||||
0
|
||||
}
|
||||
|
||||
fn max_value(&self) -> u64 {
|
||||
self.max_value
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u64 {
|
||||
self.num_vals
|
||||
}
|
||||
}
|
||||
|
||||
let num_vals = fastfield_accessor.num_vals();
|
||||
let base_value = fastfield_accessor.min_value();
|
||||
let max_value = (fastfield_accessor.max_value() - fastfield_accessor.min_value()) / gcd;
|
||||
|
||||
let fastfield_accessor = GCDWrappedFFAccess {
|
||||
fastfield_accessor,
|
||||
min_value: stats.min_value,
|
||||
base_value,
|
||||
max_value,
|
||||
num_vals,
|
||||
gcd,
|
||||
};
|
||||
|
||||
let min_value = stats.min_value;
|
||||
let stats = FastFieldStats {
|
||||
min_value: 0,
|
||||
max_value: (stats.max_value - stats.min_value) / gcd,
|
||||
num_vals: stats.num_vals,
|
||||
};
|
||||
Self::create_auto_detect_u64_fast_field_with_idx_gcd(
|
||||
self.codec_enable_checker.clone(),
|
||||
field,
|
||||
field_write,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
)?;
|
||||
write_gcd_header(field_write, min_value, gcd)?;
|
||||
write_gcd_header(field_write, base_value, gcd)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -187,28 +196,24 @@ impl CompositeFastFieldSerializer {
|
||||
codec_enable_checker: FastFieldCodecEnableCheck,
|
||||
field: Field,
|
||||
field_write: &mut CountingWriter<W>,
|
||||
stats: FastFieldStats,
|
||||
fastfield_accessor: impl FastFieldDataAccess,
|
||||
) -> io::Result<()> {
|
||||
let mut estimations = vec![];
|
||||
|
||||
if codec_enable_checker.is_enabled(FastFieldCodecName::Bitpacked) {
|
||||
codec_estimation::<BitpackedFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
}
|
||||
if codec_enable_checker.is_enabled(FastFieldCodecName::LinearInterpol) {
|
||||
codec_estimation::<LinearInterpolFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
}
|
||||
if codec_enable_checker.is_enabled(FastFieldCodecName::BlockwiseLinearInterpol) {
|
||||
codec_estimation::<MultiLinearInterpolFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
@@ -233,20 +238,15 @@ impl CompositeFastFieldSerializer {
|
||||
Self::write_header(field_write, id)?;
|
||||
match name {
|
||||
BitpackedFastFieldSerializer::NAME => {
|
||||
BitpackedFastFieldSerializer::serialize(field_write, &fastfield_accessor, stats)?;
|
||||
BitpackedFastFieldSerializer::serialize(field_write, &fastfield_accessor)?;
|
||||
}
|
||||
LinearInterpolFastFieldSerializer::NAME => {
|
||||
LinearInterpolFastFieldSerializer::serialize(
|
||||
field_write,
|
||||
&fastfield_accessor,
|
||||
stats,
|
||||
)?;
|
||||
LinearInterpolFastFieldSerializer::serialize(field_write, &fastfield_accessor)?;
|
||||
}
|
||||
MultiLinearInterpolFastFieldSerializer::NAME => {
|
||||
MultiLinearInterpolFastFieldSerializer::serialize(
|
||||
field_write,
|
||||
&fastfield_accessor,
|
||||
stats,
|
||||
)?;
|
||||
}
|
||||
_ => {
|
||||
|
||||
@@ -359,17 +359,19 @@ impl IntFastFieldWriter {
|
||||
(self.val_min, self.val_max)
|
||||
};
|
||||
|
||||
let fastfield_accessor = WriterFastFieldAccessProvider {
|
||||
doc_id_map,
|
||||
vals: &self.vals,
|
||||
};
|
||||
let stats = FastFieldStats {
|
||||
min_value: min,
|
||||
max_value: max,
|
||||
num_vals: self.val_count as u64,
|
||||
};
|
||||
|
||||
serializer.create_auto_detect_u64_fast_field(self.field, stats, fastfield_accessor)?;
|
||||
let fastfield_accessor = WriterFastFieldAccessProvider {
|
||||
doc_id_map,
|
||||
vals: &self.vals,
|
||||
stats,
|
||||
};
|
||||
|
||||
serializer.create_auto_detect_u64_fast_field(self.field, fastfield_accessor)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -379,6 +381,7 @@ impl IntFastFieldWriter {
|
||||
struct WriterFastFieldAccessProvider<'map, 'bitp> {
|
||||
doc_id_map: Option<&'map DocIdMapping>,
|
||||
vals: &'bitp BlockedBitpacker,
|
||||
stats: FastFieldStats,
|
||||
}
|
||||
impl<'map, 'bitp> FastFieldDataAccess for WriterFastFieldAccessProvider<'map, 'bitp> {
|
||||
/// Return the value associated to the given doc.
|
||||
@@ -411,4 +414,16 @@ impl<'map, 'bitp> FastFieldDataAccess for WriterFastFieldAccessProvider<'map, 'b
|
||||
Box::new(self.vals.iter())
|
||||
}
|
||||
}
|
||||
|
||||
fn min_value(&self) -> u64 {
|
||||
self.stats.min_value
|
||||
}
|
||||
|
||||
fn max_value(&self) -> u64 {
|
||||
self.stats.max_value
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u64 {
|
||||
self.stats.num_vals
|
||||
}
|
||||
}
|
||||
|
||||
@@ -374,6 +374,7 @@ impl IndexMerger {
|
||||
struct SortedDocIdFieldAccessProvider<'a> {
|
||||
doc_id_mapping: &'a SegmentDocIdMapping,
|
||||
fast_field_readers: &'a Vec<DynamicFastFieldReader<u64>>,
|
||||
stats: FastFieldStats,
|
||||
}
|
||||
impl<'a> FastFieldDataAccess for SortedDocIdFieldAccessProvider<'a> {
|
||||
fn get_val(&self, doc: u64) -> u64 {
|
||||
@@ -395,16 +396,24 @@ impl IndexMerger {
|
||||
}),
|
||||
)
|
||||
}
|
||||
fn min_value(&self) -> u64 {
|
||||
self.stats.min_value
|
||||
}
|
||||
|
||||
fn max_value(&self) -> u64 {
|
||||
self.stats.max_value
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u64 {
|
||||
self.stats.num_vals
|
||||
}
|
||||
}
|
||||
let fastfield_accessor = SortedDocIdFieldAccessProvider {
|
||||
doc_id_mapping,
|
||||
fast_field_readers: &fast_field_readers,
|
||||
};
|
||||
fast_field_serializer.create_auto_detect_u64_fast_field(
|
||||
field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
)?;
|
||||
};
|
||||
fast_field_serializer.create_auto_detect_u64_fast_field(field, fastfield_accessor)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -564,7 +573,37 @@ impl IndexMerger {
|
||||
}
|
||||
offsets.push(offset);
|
||||
|
||||
fast_field_serializer.create_auto_detect_u64_fast_field(field, stats, &offsets[..])?;
|
||||
#[derive(Clone)]
|
||||
struct FieldIndexAccessProvider<'a> {
|
||||
offsets: &'a [u64],
|
||||
stats: FastFieldStats,
|
||||
}
|
||||
impl<'a> FastFieldDataAccess for FieldIndexAccessProvider<'a> {
|
||||
fn get_val(&self, doc: u64) -> u64 {
|
||||
self.offsets[doc as usize]
|
||||
}
|
||||
|
||||
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
|
||||
Box::new(self.offsets.iter().cloned())
|
||||
}
|
||||
fn min_value(&self) -> u64 {
|
||||
self.stats.min_value
|
||||
}
|
||||
|
||||
fn max_value(&self) -> u64 {
|
||||
self.stats.max_value
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u64 {
|
||||
self.stats.num_vals
|
||||
}
|
||||
}
|
||||
let fastfield_accessor = FieldIndexAccessProvider {
|
||||
offsets: &offsets,
|
||||
stats,
|
||||
};
|
||||
|
||||
fast_field_serializer.create_auto_detect_u64_fast_field(field, fastfield_accessor)?;
|
||||
Ok(offsets)
|
||||
}
|
||||
/// Returns the fastfield index (index for the data, not the data).
|
||||
@@ -737,6 +776,7 @@ impl IndexMerger {
|
||||
doc_id_mapping: &'a SegmentDocIdMapping,
|
||||
fast_field_readers: &'a Vec<MultiValuedFastFieldReader<u64>>,
|
||||
offsets: Vec<u64>,
|
||||
stats: FastFieldStats,
|
||||
}
|
||||
impl<'a> FastFieldDataAccess for SortedDocIdMultiValueAccessProvider<'a> {
|
||||
fn get_val(&self, pos: u64) -> u64 {
|
||||
@@ -777,15 +817,26 @@ impl IndexMerger {
|
||||
}),
|
||||
)
|
||||
}
|
||||
fn min_value(&self) -> u64 {
|
||||
self.stats.min_value
|
||||
}
|
||||
|
||||
fn max_value(&self) -> u64 {
|
||||
self.stats.max_value
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u64 {
|
||||
self.stats.num_vals
|
||||
}
|
||||
}
|
||||
let fastfield_accessor = SortedDocIdMultiValueAccessProvider {
|
||||
doc_id_mapping,
|
||||
fast_field_readers: &ff_readers,
|
||||
offsets,
|
||||
stats,
|
||||
};
|
||||
fast_field_serializer.create_auto_detect_u64_fast_field_with_idx(
|
||||
field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
1,
|
||||
)?;
|
||||
|
||||
Reference in New Issue
Block a user