mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-03 07:42:54 +00:00
Compare commits
1 Commits
float
...
fix_estima
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ba3215b469 |
@@ -3,6 +3,7 @@ use std::io::{self, Write};
|
|||||||
use ownedbytes::OwnedBytes;
|
use ownedbytes::OwnedBytes;
|
||||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||||
|
|
||||||
|
use crate::column::EstimateColumn;
|
||||||
use crate::serialize::NormalizedHeader;
|
use crate::serialize::NormalizedHeader;
|
||||||
use crate::{Column, FastFieldCodec, FastFieldCodecType};
|
use crate::{Column, FastFieldCodec, FastFieldCodecType};
|
||||||
|
|
||||||
@@ -75,7 +76,7 @@ impl FastFieldCodec for BitpackedCodec {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn estimate(column: &dyn Column) -> Option<f32> {
|
fn estimate(column: &EstimateColumn) -> Option<f32> {
|
||||||
let num_bits = compute_num_bits(column.max_value());
|
let num_bits = compute_num_bits(column.max_value());
|
||||||
let num_bits_uncompressed = 64;
|
let num_bits_uncompressed = 64;
|
||||||
Some(num_bits as f32 / num_bits_uncompressed as f32)
|
Some(num_bits as f32 / num_bits_uncompressed as f32)
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ use common::{BinarySerializable, CountingWriter, DeserializeFrom};
|
|||||||
use ownedbytes::OwnedBytes;
|
use ownedbytes::OwnedBytes;
|
||||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||||
|
|
||||||
|
use crate::column::EstimateColumn;
|
||||||
use crate::line::Line;
|
use crate::line::Line;
|
||||||
use crate::serialize::NormalizedHeader;
|
use crate::serialize::NormalizedHeader;
|
||||||
use crate::{Column, FastFieldCodec, FastFieldCodecType, VecColumn};
|
use crate::{Column, FastFieldCodec, FastFieldCodecType, VecColumn};
|
||||||
@@ -71,7 +72,7 @@ impl FastFieldCodec for BlockwiseLinearCodec {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Estimate first_chunk and extrapolate
|
// Estimate first_chunk and extrapolate
|
||||||
fn estimate(column: &dyn crate::Column) -> Option<f32> {
|
fn estimate(column: &EstimateColumn) -> Option<f32> {
|
||||||
if column.num_vals() < 10 * CHUNK_SIZE as u64 {
|
if column.num_vals() < 10 * CHUNK_SIZE as u64 {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -137,6 +137,57 @@ where V: AsRef<[T]> + ?Sized
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Creates a view over a Column with a limited number of vals. Stats like min max are unchanged
|
||||||
|
pub struct EstimateColumn<'a> {
|
||||||
|
column: &'a dyn Column,
|
||||||
|
num_vals: u64,
|
||||||
|
}
|
||||||
|
impl<'a> EstimateColumn<'a> {
|
||||||
|
pub(crate) fn new(column: &'a dyn Column) -> Self {
|
||||||
|
let limit_num_vals = column.num_vals().min(100_000);
|
||||||
|
Self {
|
||||||
|
column,
|
||||||
|
num_vals: limit_num_vals,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Column for EstimateColumn<'a> {
|
||||||
|
fn get_val(&self, idx: u64) -> u64 {
|
||||||
|
(*self.column).get_val(idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn min_value(&self) -> u64 {
|
||||||
|
(*self.column).min_value()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn max_value(&self) -> u64 {
|
||||||
|
(*self.column).max_value()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn num_vals(&self) -> u64 {
|
||||||
|
self.num_vals
|
||||||
|
}
|
||||||
|
|
||||||
|
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
|
||||||
|
Box::new((*self.column).iter().take(self.num_vals as usize))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_range(&self, start: u64, output: &mut [u64]) {
|
||||||
|
(*self.column).get_range(start, output)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> From<&'a dyn Column> for EstimateColumn<'a> {
|
||||||
|
fn from(column: &'a dyn Column) -> Self {
|
||||||
|
let limit_num_vals = column.num_vals().min(100_000);
|
||||||
|
Self {
|
||||||
|
column,
|
||||||
|
num_vals: limit_num_vals,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
struct MonotonicMappingColumn<C, T, Input> {
|
struct MonotonicMappingColumn<C, T, Input> {
|
||||||
from_column: C,
|
from_column: C,
|
||||||
monotonic_mapping: T,
|
monotonic_mapping: T,
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ use std::io;
|
|||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use column::EstimateColumn;
|
||||||
use common::BinarySerializable;
|
use common::BinarySerializable;
|
||||||
use compact_space::CompactSpaceDecompressor;
|
use compact_space::CompactSpaceDecompressor;
|
||||||
use ownedbytes::OwnedBytes;
|
use ownedbytes::OwnedBytes;
|
||||||
@@ -132,7 +133,7 @@ trait FastFieldCodec: 'static {
|
|||||||
///
|
///
|
||||||
/// It could make sense to also return a value representing
|
/// It could make sense to also return a value representing
|
||||||
/// computational complexity.
|
/// computational complexity.
|
||||||
fn estimate(column: &dyn Column) -> Option<f32>;
|
fn estimate(column: &EstimateColumn) -> Option<f32>;
|
||||||
}
|
}
|
||||||
|
|
||||||
pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [
|
pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [
|
||||||
@@ -149,6 +150,7 @@ mod tests {
|
|||||||
|
|
||||||
use crate::bitpacked::BitpackedCodec;
|
use crate::bitpacked::BitpackedCodec;
|
||||||
use crate::blockwise_linear::BlockwiseLinearCodec;
|
use crate::blockwise_linear::BlockwiseLinearCodec;
|
||||||
|
use crate::column::EstimateColumn;
|
||||||
use crate::linear::LinearCodec;
|
use crate::linear::LinearCodec;
|
||||||
use crate::serialize::Header;
|
use crate::serialize::Header;
|
||||||
|
|
||||||
@@ -159,7 +161,9 @@ mod tests {
|
|||||||
let col = &VecColumn::from(data);
|
let col = &VecColumn::from(data);
|
||||||
let header = Header::compute_header(col, &[Codec::CODEC_TYPE])?;
|
let header = Header::compute_header(col, &[Codec::CODEC_TYPE])?;
|
||||||
let normalized_col = header.normalize_column(col);
|
let normalized_col = header.normalize_column(col);
|
||||||
let estimation = Codec::estimate(&normalized_col)?;
|
|
||||||
|
let limited_column = EstimateColumn::new(&normalized_col);
|
||||||
|
let estimation = Codec::estimate(&limited_column)?;
|
||||||
|
|
||||||
let mut out = Vec::new();
|
let mut out = Vec::new();
|
||||||
let col = VecColumn::from(data);
|
let col = VecColumn::from(data);
|
||||||
@@ -280,14 +284,16 @@ mod tests {
|
|||||||
let data = (10..=20000_u64).collect::<Vec<_>>();
|
let data = (10..=20000_u64).collect::<Vec<_>>();
|
||||||
let data: VecColumn = data.as_slice().into();
|
let data: VecColumn = data.as_slice().into();
|
||||||
|
|
||||||
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
|
let linear_interpol_estimation =
|
||||||
|
LinearCodec::estimate(&EstimateColumn::new(&data)).unwrap();
|
||||||
assert_le!(linear_interpol_estimation, 0.01);
|
assert_le!(linear_interpol_estimation, 0.01);
|
||||||
|
|
||||||
let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data).unwrap();
|
let multi_linear_interpol_estimation =
|
||||||
|
BlockwiseLinearCodec::estimate(&EstimateColumn::new(&data)).unwrap();
|
||||||
assert_le!(multi_linear_interpol_estimation, 0.2);
|
assert_le!(multi_linear_interpol_estimation, 0.2);
|
||||||
assert_lt!(linear_interpol_estimation, multi_linear_interpol_estimation);
|
assert_lt!(linear_interpol_estimation, multi_linear_interpol_estimation);
|
||||||
|
|
||||||
let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
|
let bitpacked_estimation = BitpackedCodec::estimate(&EstimateColumn::new(&data)).unwrap();
|
||||||
assert_lt!(linear_interpol_estimation, bitpacked_estimation);
|
assert_lt!(linear_interpol_estimation, bitpacked_estimation);
|
||||||
}
|
}
|
||||||
#[test]
|
#[test]
|
||||||
@@ -295,18 +301,20 @@ mod tests {
|
|||||||
let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20];
|
let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20];
|
||||||
|
|
||||||
let data: VecColumn = data.into();
|
let data: VecColumn = data.into();
|
||||||
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
|
let linear_interpol_estimation =
|
||||||
|
LinearCodec::estimate(&EstimateColumn::new(&data)).unwrap();
|
||||||
assert_le!(linear_interpol_estimation, 0.34);
|
assert_le!(linear_interpol_estimation, 0.34);
|
||||||
|
|
||||||
let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
|
let bitpacked_estimation = BitpackedCodec::estimate(&EstimateColumn::new(&data)).unwrap();
|
||||||
assert_lt!(bitpacked_estimation, linear_interpol_estimation);
|
assert_lt!(bitpacked_estimation, linear_interpol_estimation);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn estimation_prefer_bitpacked() {
|
fn estimation_prefer_bitpacked() {
|
||||||
let data = VecColumn::from(&[10, 10, 10, 10]);
|
let data = VecColumn::from(&[10, 10, 10, 10]);
|
||||||
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
|
let linear_interpol_estimation =
|
||||||
let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
|
LinearCodec::estimate(&EstimateColumn::new(&data)).unwrap();
|
||||||
|
let bitpacked_estimation = BitpackedCodec::estimate(&EstimateColumn::new(&data)).unwrap();
|
||||||
assert_lt!(bitpacked_estimation, linear_interpol_estimation);
|
assert_lt!(bitpacked_estimation, linear_interpol_estimation);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -318,10 +326,11 @@ mod tests {
|
|||||||
|
|
||||||
// in this case the linear interpolation can't in fact not be worse than bitpacking,
|
// in this case the linear interpolation can't in fact not be worse than bitpacking,
|
||||||
// but the estimator adds some threshold, which leads to estimated worse behavior
|
// but the estimator adds some threshold, which leads to estimated worse behavior
|
||||||
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
|
let linear_interpol_estimation =
|
||||||
|
LinearCodec::estimate(&EstimateColumn::new(&data)).unwrap();
|
||||||
assert_le!(linear_interpol_estimation, 0.35);
|
assert_le!(linear_interpol_estimation, 0.35);
|
||||||
|
|
||||||
let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
|
let bitpacked_estimation = BitpackedCodec::estimate(&EstimateColumn::new(&data)).unwrap();
|
||||||
assert_le!(bitpacked_estimation, 0.32);
|
assert_le!(bitpacked_estimation, 0.32);
|
||||||
assert_le!(bitpacked_estimation, linear_interpol_estimation);
|
assert_le!(bitpacked_estimation, linear_interpol_estimation);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -67,19 +67,11 @@ impl Line {
|
|||||||
self.intercept.wrapping_add(linear_part)
|
self.intercept.wrapping_add(linear_part)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Same as train, but the intercept is only estimated from provided sample positions
|
|
||||||
pub fn estimate(ys: &dyn Column, sample_positions: &[u64]) -> Self {
|
|
||||||
Self::train_from(
|
|
||||||
ys,
|
|
||||||
sample_positions
|
|
||||||
.iter()
|
|
||||||
.cloned()
|
|
||||||
.map(|pos| (pos, ys.get_val(pos))),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Intercept is only computed from provided positions
|
// Intercept is only computed from provided positions
|
||||||
fn train_from(ys: &dyn Column, positions_and_values: impl Iterator<Item = (u64, u64)>) -> Self {
|
pub fn train_from(
|
||||||
|
ys: &dyn Column,
|
||||||
|
positions_and_values: impl Iterator<Item = (u64, u64)>,
|
||||||
|
) -> Self {
|
||||||
let num_vals = if let Some(num_vals) = NonZeroU64::new(ys.num_vals() - 1) {
|
let num_vals = if let Some(num_vals) = NonZeroU64::new(ys.num_vals() - 1) {
|
||||||
num_vals
|
num_vals
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ use common::BinarySerializable;
|
|||||||
use ownedbytes::OwnedBytes;
|
use ownedbytes::OwnedBytes;
|
||||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||||
|
|
||||||
|
use crate::column::EstimateColumn;
|
||||||
use crate::line::Line;
|
use crate::line::Line;
|
||||||
use crate::serialize::NormalizedHeader;
|
use crate::serialize::NormalizedHeader;
|
||||||
use crate::{Column, FastFieldCodec, FastFieldCodecType};
|
use crate::{Column, FastFieldCodec, FastFieldCodecType};
|
||||||
@@ -121,23 +122,23 @@ impl FastFieldCodec for LinearCodec {
|
|||||||
/// where the local maxima for the deviation of the calculated value are and
|
/// where the local maxima for the deviation of the calculated value are and
|
||||||
/// the offset to shift all values to >=0 is also unknown.
|
/// the offset to shift all values to >=0 is also unknown.
|
||||||
#[allow(clippy::question_mark)]
|
#[allow(clippy::question_mark)]
|
||||||
fn estimate(column: &dyn Column) -> Option<f32> {
|
fn estimate(column: &EstimateColumn) -> Option<f32> {
|
||||||
if column.num_vals() < 3 {
|
if column.num_vals() < 3 {
|
||||||
return None; // disable compressor for this case
|
return None; // disable compressor for this case
|
||||||
}
|
}
|
||||||
|
|
||||||
// let's sample at 0%, 5%, 10% .. 95%, 100%
|
// let's sample at 0%, 5%, 10% .. 95%, 100%
|
||||||
let num_vals = column.num_vals() as f32 / 100.0;
|
let num_vals = column.num_vals() as f32 / 100.0;
|
||||||
let sample_positions = (0..20)
|
let sample_positions_and_values = (0..20)
|
||||||
.map(|pos| (num_vals * pos as f32 * 5.0) as u64)
|
.map(|pos| (num_vals * pos as f32 * 5.0) as u64)
|
||||||
|
.map(|pos| (pos, column.get_val(pos)))
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
let line = Line::estimate(column, &sample_positions);
|
let line = { Line::train_from(column, sample_positions_and_values.iter().cloned()) };
|
||||||
|
|
||||||
let estimated_bit_width = sample_positions
|
let estimated_bit_width = sample_positions_and_values
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|pos| {
|
.map(|(pos, actual_value)| {
|
||||||
let actual_value = column.get_val(pos);
|
|
||||||
let interpolated_val = line.eval(pos as u64);
|
let interpolated_val = line.eval(pos as u64);
|
||||||
actual_value.wrapping_sub(interpolated_val)
|
actual_value.wrapping_sub(interpolated_val)
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ use ownedbytes::OwnedBytes;
|
|||||||
|
|
||||||
use crate::bitpacked::BitpackedCodec;
|
use crate::bitpacked::BitpackedCodec;
|
||||||
use crate::blockwise_linear::BlockwiseLinearCodec;
|
use crate::blockwise_linear::BlockwiseLinearCodec;
|
||||||
|
use crate::column::EstimateColumn;
|
||||||
use crate::compact_space::CompactSpaceCompressor;
|
use crate::compact_space::CompactSpaceCompressor;
|
||||||
use crate::linear::LinearCodec;
|
use crate::linear::LinearCodec;
|
||||||
use crate::{
|
use crate::{
|
||||||
@@ -125,23 +126,6 @@ impl BinarySerializable for Header {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn estimate<T: MonotonicallyMappableToU64>(
|
|
||||||
typed_column: impl Column<T>,
|
|
||||||
codec_type: FastFieldCodecType,
|
|
||||||
) -> Option<f32> {
|
|
||||||
let column = monotonic_map_column(typed_column, T::to_u64);
|
|
||||||
let min_value = column.min_value();
|
|
||||||
let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value))
|
|
||||||
.filter(|gcd| gcd.get() > 1u64);
|
|
||||||
let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64));
|
|
||||||
let normalized_column = monotonic_map_column(&column, |val| divider.divide(val - min_value));
|
|
||||||
match codec_type {
|
|
||||||
FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&normalized_column),
|
|
||||||
FastFieldCodecType::Linear => LinearCodec::estimate(&normalized_column),
|
|
||||||
FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&normalized_column),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn serialize_u128(
|
pub fn serialize_u128(
|
||||||
typed_column: impl Column<u128>,
|
typed_column: impl Column<u128>,
|
||||||
output: &mut impl io::Write,
|
output: &mut impl io::Write,
|
||||||
@@ -177,10 +161,29 @@ pub fn serialize<T: MonotonicallyMappableToU64>(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn estimate<T: MonotonicallyMappableToU64>(
|
||||||
|
typed_column: impl Column<T>,
|
||||||
|
codec_type: FastFieldCodecType,
|
||||||
|
) -> Option<f32> {
|
||||||
|
let column = monotonic_map_column(typed_column, T::to_u64);
|
||||||
|
let min_value = column.min_value();
|
||||||
|
let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value))
|
||||||
|
.filter(|gcd| gcd.get() > 1u64);
|
||||||
|
let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64));
|
||||||
|
let normalized_column = monotonic_map_column(&column, |val| divider.divide(val - min_value));
|
||||||
|
let estimate_column = EstimateColumn::new(&normalized_column);
|
||||||
|
match codec_type {
|
||||||
|
FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&estimate_column),
|
||||||
|
FastFieldCodecType::Linear => LinearCodec::estimate(&estimate_column),
|
||||||
|
FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&estimate_column),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn detect_codec(
|
fn detect_codec(
|
||||||
column: impl Column<u64>,
|
column: impl Column<u64>,
|
||||||
codecs: &[FastFieldCodecType],
|
codecs: &[FastFieldCodecType],
|
||||||
) -> Option<FastFieldCodecType> {
|
) -> Option<FastFieldCodecType> {
|
||||||
|
let column: EstimateColumn = EstimateColumn::new(&column);
|
||||||
let mut estimations = Vec::new();
|
let mut estimations = Vec::new();
|
||||||
for &codec in codecs {
|
for &codec in codecs {
|
||||||
let estimation_opt = match codec {
|
let estimation_opt = match codec {
|
||||||
|
|||||||
Reference in New Issue
Block a user