mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-05 18:10:42 +00:00
CR
This commit is contained in:
@@ -22,6 +22,7 @@ impl Column for BitpackedReader {
|
||||
}
|
||||
#[inline]
|
||||
fn min_value(&self) -> u64 {
|
||||
// The BitpackedReader assumes a normalized vector.
|
||||
0
|
||||
}
|
||||
#[inline]
|
||||
@@ -58,19 +59,24 @@ impl FastFieldCodec for BitpackedCodec {
|
||||
|
||||
/// Serializes data with the BitpackedFastFieldSerializer.
|
||||
///
|
||||
/// The bitpacker assumes that the column has been normalized.
|
||||
/// i.e. It has already been shifted by its minimum value, so that its
|
||||
/// current minimum value is 0.
|
||||
///
|
||||
/// Ideally, we made a shift upstream on the column so that `col.min_value() == 0`.
|
||||
fn serialize(col: &dyn Column, write: &mut impl Write) -> io::Result<()> {
|
||||
let num_bits = compute_num_bits(col.max_value());
|
||||
fn serialize(column: &dyn Column, write: &mut impl Write) -> io::Result<()> {
|
||||
assert_eq!(column.min_value(), 0u64);
|
||||
let num_bits = compute_num_bits(column.max_value());
|
||||
let mut bit_packer = BitPacker::new();
|
||||
for val in col.iter() {
|
||||
for val in column.iter() {
|
||||
bit_packer.write(val, num_bits, write)?;
|
||||
}
|
||||
bit_packer.close(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn estimate(col: &impl Column) -> Option<f32> {
|
||||
let num_bits = compute_num_bits(col.max_value());
|
||||
fn estimate(column: &impl Column) -> Option<f32> {
|
||||
let num_bits = compute_num_bits(column.max_value());
|
||||
let num_bits_uncompressed = 64;
|
||||
Some(num_bits as f32 / num_bits_uncompressed as f32)
|
||||
}
|
||||
|
||||
@@ -71,14 +71,11 @@ impl FastFieldCodec for BlockwiseLinearCodec {
|
||||
}
|
||||
|
||||
// Estimate first_chunk and extrapolate
|
||||
fn estimate(fastfield_accessor: &impl crate::Column) -> Option<f32> {
|
||||
if fastfield_accessor.num_vals() < 10 * CHUNK_SIZE as u64 {
|
||||
fn estimate(column: &impl crate::Column) -> Option<f32> {
|
||||
if column.num_vals() < 10 * CHUNK_SIZE as u64 {
|
||||
return None;
|
||||
}
|
||||
let mut first_chunk: Vec<u64> = fastfield_accessor
|
||||
.iter()
|
||||
.take(CHUNK_SIZE as usize)
|
||||
.collect();
|
||||
let mut first_chunk: Vec<u64> = column.iter().take(CHUNK_SIZE as usize).collect();
|
||||
let line = Line::train(&VecColumn::from(&first_chunk));
|
||||
for (i, buffer_val) in first_chunk.iter_mut().enumerate() {
|
||||
let interpolated_val = line.eval(i as u64);
|
||||
@@ -96,24 +93,23 @@ impl FastFieldCodec for BlockwiseLinearCodec {
|
||||
Block::default().serialize(&mut out).unwrap();
|
||||
out.len()
|
||||
};
|
||||
let num_bits = estimated_bit_width as u64 * fastfield_accessor.num_vals() as u64
|
||||
let num_bits = estimated_bit_width as u64 * column.num_vals() as u64
|
||||
// function metadata per block
|
||||
+ metadata_per_block as u64 * (fastfield_accessor.num_vals() / CHUNK_SIZE as u64);
|
||||
let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
|
||||
+ metadata_per_block as u64 * (column.num_vals() / CHUNK_SIZE as u64);
|
||||
let num_bits_uncompressed = 64 * column.num_vals();
|
||||
Some(num_bits as f32 / num_bits_uncompressed as f32)
|
||||
}
|
||||
|
||||
fn serialize(
|
||||
fastfield_accessor: &dyn crate::Column,
|
||||
wrt: &mut impl io::Write,
|
||||
) -> io::Result<()> {
|
||||
fn serialize(column: &dyn crate::Column, wrt: &mut impl io::Write) -> io::Result<()> {
|
||||
// The BitpackedReader assumes a normalized vector.
|
||||
assert_eq!(column.min_value(), 0);
|
||||
let mut buffer = Vec::with_capacity(CHUNK_SIZE);
|
||||
let num_vals = fastfield_accessor.num_vals();
|
||||
let num_vals = column.num_vals();
|
||||
|
||||
let num_blocks = compute_num_blocks(num_vals);
|
||||
let mut blocks = Vec::with_capacity(num_blocks);
|
||||
|
||||
let mut vals = fastfield_accessor.iter();
|
||||
let mut vals = column.iter();
|
||||
|
||||
let mut bit_packer = BitPacker::new();
|
||||
|
||||
@@ -176,6 +172,7 @@ impl Column for BlockwiseLinearReader {
|
||||
}
|
||||
|
||||
fn min_value(&self) -> u64 {
|
||||
// The BlockwiseLinearReader assumes a normalized vector.
|
||||
0u64
|
||||
}
|
||||
|
||||
|
||||
@@ -34,16 +34,18 @@ pub trait Column<T = u64> {
|
||||
|
||||
/// Returns the minimum value for this fast field.
|
||||
///
|
||||
/// The min value does not take in account of possible
|
||||
/// deleted document, and should be considered as a lower bound
|
||||
/// of the actual minimum value.
|
||||
/// This min_value may not be exact.
|
||||
/// For instance, the min value does not take in account of possible
|
||||
/// deleted document. All values are however guaranteed to be higher than
|
||||
/// `.min_value()`.
|
||||
fn min_value(&self) -> T;
|
||||
|
||||
/// Returns the maximum value for this fast field.
|
||||
///
|
||||
/// The max value does not take in account of possible
|
||||
/// deleted document, and should be considered as an upper bound
|
||||
/// of the actual maximum value
|
||||
/// This max_value may not be exact.
|
||||
/// For instance, the max value does not take in account of possible
|
||||
/// deleted document. All values are however guaranteed to be higher than
|
||||
/// `.max_value()`.
|
||||
fn max_value(&self) -> T;
|
||||
|
||||
fn num_vals(&self) -> u64;
|
||||
|
||||
@@ -13,17 +13,17 @@ use std::io::Write;
|
||||
use common::BinarySerializable;
|
||||
use ownedbytes::OwnedBytes;
|
||||
|
||||
pub mod bitpacked;
|
||||
pub mod blockwise_linear;
|
||||
mod bitpacked;
|
||||
mod blockwise_linear;
|
||||
pub(crate) mod line;
|
||||
pub mod linear;
|
||||
mod linear;
|
||||
|
||||
mod column;
|
||||
mod gcd;
|
||||
mod serialize;
|
||||
|
||||
pub use self::column::{monotonic_map_column, Column, VecColumn};
|
||||
pub use self::serialize::{open, serialize, serialize_and_load, NormalizedHeader};
|
||||
pub use self::serialize::{estimate, open, serialize, serialize_and_load, NormalizedHeader};
|
||||
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
|
||||
#[repr(u8)]
|
||||
@@ -124,7 +124,7 @@ impl MonotonicallyMappableToU64 for f64 {
|
||||
|
||||
/// The FastFieldSerializerEstimate trait is required on all variants
|
||||
/// of fast field compressions, to decide which one to choose.
|
||||
pub trait FastFieldCodec: 'static {
|
||||
trait FastFieldCodec: 'static {
|
||||
/// A codex needs to provide a unique name and id, which is
|
||||
/// used for debugging and de/serialization.
|
||||
const CODEC_TYPE: FastFieldCodecType;
|
||||
|
||||
@@ -27,6 +27,7 @@ impl Column for LinearReader {
|
||||
|
||||
#[inline]
|
||||
fn min_value(&self) -> u64 {
|
||||
// The LinearReader assumes a normalized vector.
|
||||
0u64
|
||||
}
|
||||
|
||||
@@ -84,11 +85,11 @@ impl FastFieldCodec for LinearCodec {
|
||||
}
|
||||
|
||||
/// Creates a new fast field serializer.
|
||||
fn serialize(fastfield_accessor: &dyn Column, write: &mut impl Write) -> io::Result<()> {
|
||||
assert!(fastfield_accessor.min_value() <= fastfield_accessor.max_value());
|
||||
let line = Line::train(fastfield_accessor);
|
||||
fn serialize(column: &dyn Column, write: &mut impl Write) -> io::Result<()> {
|
||||
assert_eq!(column.min_value(), 0);
|
||||
let line = Line::train(column);
|
||||
|
||||
let max_offset_from_line = fastfield_accessor
|
||||
let max_offset_from_line = column
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(pos, actual_value)| {
|
||||
@@ -106,7 +107,7 @@ impl FastFieldCodec for LinearCodec {
|
||||
linear_params.serialize(write)?;
|
||||
|
||||
let mut bit_packer = BitPacker::new();
|
||||
for (pos, actual_value) in fastfield_accessor.iter().enumerate() {
|
||||
for (pos, actual_value) in column.iter().enumerate() {
|
||||
let calculated_value = line.eval(pos as u64);
|
||||
let offset = actual_value.wrapping_sub(calculated_value);
|
||||
bit_packer.write(offset, num_bits, write)?;
|
||||
@@ -120,23 +121,23 @@ impl FastFieldCodec for LinearCodec {
|
||||
/// where the local maxima for the deviation of the calculated value are and
|
||||
/// the offset to shift all values to >=0 is also unknown.
|
||||
#[allow(clippy::question_mark)]
|
||||
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
|
||||
if fastfield_accessor.num_vals() < 3 {
|
||||
fn estimate(column: &impl Column) -> Option<f32> {
|
||||
if column.num_vals() < 3 {
|
||||
return None; // disable compressor for this case
|
||||
}
|
||||
|
||||
// let's sample at 0%, 5%, 10% .. 95%, 100%
|
||||
let num_vals = fastfield_accessor.num_vals() as f32 / 100.0;
|
||||
let num_vals = column.num_vals() as f32 / 100.0;
|
||||
let sample_positions = (0..20)
|
||||
.map(|pos| (num_vals * pos as f32 * 5.0) as u64)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let line = Line::estimate(fastfield_accessor, &sample_positions);
|
||||
let line = Line::estimate(column, &sample_positions);
|
||||
|
||||
let estimated_bit_width = sample_positions
|
||||
.into_iter()
|
||||
.map(|pos| {
|
||||
let actual_value = fastfield_accessor.get_val(pos);
|
||||
let actual_value = column.get_val(pos);
|
||||
let interpolated_val = line.eval(pos as u64);
|
||||
actual_value.wrapping_sub(interpolated_val)
|
||||
})
|
||||
@@ -145,8 +146,8 @@ impl FastFieldCodec for LinearCodec {
|
||||
.max()
|
||||
.unwrap_or(0);
|
||||
|
||||
let num_bits = (estimated_bit_width as u64 * fastfield_accessor.num_vals() as u64) + 64;
|
||||
let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
|
||||
let num_bits = (estimated_bit_width as u64 * column.num_vals() as u64) + 64;
|
||||
let num_bits_uncompressed = 64 * column.num_vals();
|
||||
Some(num_bits as f32 / num_bits_uncompressed as f32)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,35 +1,8 @@
|
||||
#[macro_use]
|
||||
extern crate prettytable;
|
||||
use fastfield_codecs::bitpacked::BitpackedCodec;
|
||||
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
|
||||
use fastfield_codecs::linear::LinearCodec;
|
||||
use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType, FastFieldStats};
|
||||
use fastfield_codecs::{Column, FastFieldCodecType, FastFieldStats, VecColumn};
|
||||
use prettytable::{Cell, Row, Table};
|
||||
|
||||
struct Data<'a>(&'a [u64]);
|
||||
|
||||
impl<'a> Column for Data<'a> {
|
||||
fn get_val(&self, position: u64) -> u64 {
|
||||
self.0[position as usize]
|
||||
}
|
||||
|
||||
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
|
||||
Box::new(self.0.iter().cloned())
|
||||
}
|
||||
|
||||
fn min_value(&self) -> u64 {
|
||||
*self.0.iter().min().unwrap_or(&0)
|
||||
}
|
||||
|
||||
fn max_value(&self) -> u64 {
|
||||
*self.0.iter().max().unwrap_or(&0)
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u64 {
|
||||
self.0.len() as u64
|
||||
}
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let mut table = Table::new();
|
||||
|
||||
@@ -38,10 +11,9 @@ fn main() {
|
||||
|
||||
for (data, data_set_name) in get_codec_test_data_sets() {
|
||||
let results: Vec<(f32, f32, FastFieldCodecType)> = [
|
||||
serialize_with_codec::<LinearCodec>(&data),
|
||||
serialize_with_codec::<BlockwiseLinearCodec>(&data),
|
||||
serialize_with_codec::<BlockwiseLinearCodec>(&data),
|
||||
serialize_with_codec::<BitpackedCodec>(&data),
|
||||
serialize_with_codec(&data, FastFieldCodecType::Bitpacked),
|
||||
serialize_with_codec(&data, FastFieldCodecType::Linear),
|
||||
serialize_with_codec(&data, FastFieldCodecType::BlockwiseLinear),
|
||||
]
|
||||
.into_iter()
|
||||
.flatten()
|
||||
@@ -107,15 +79,16 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
data_and_names
|
||||
}
|
||||
|
||||
pub fn serialize_with_codec<C: FastFieldCodec>(
|
||||
pub fn serialize_with_codec(
|
||||
data: &[u64],
|
||||
codec_type: FastFieldCodecType,
|
||||
) -> Option<(f32, f32, FastFieldCodecType)> {
|
||||
let data = Data(data);
|
||||
let estimation = C::estimate(&data)?;
|
||||
let col = VecColumn::from(data);
|
||||
let estimation = fastfield_codecs::estimate(&col, codec_type)?;
|
||||
let mut out = Vec::new();
|
||||
C::serialize(&data, &mut out).unwrap();
|
||||
let actual_compression = out.len() as f32 / (data.num_vals() * 8) as f32;
|
||||
Some((estimation, actual_compression, C::CODEC_TYPE))
|
||||
fastfield_codecs::serialize(&col, &mut out, &[codec_type]).ok()?;
|
||||
let actual_compression = out.len() as f32 / (col.num_vals() * 8) as f32;
|
||||
Some((estimation, actual_compression, codec_type))
|
||||
}
|
||||
|
||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||
|
||||
@@ -34,17 +34,11 @@ use crate::{
|
||||
VecColumn, ALL_CODEC_TYPES,
|
||||
};
|
||||
|
||||
// use this, when this is merged and stabilized explicit_generic_args_with_impl_trait
|
||||
// https://github.com/rust-lang/rust/pull/86176
|
||||
fn codec_estimation<C: FastFieldCodec, D: Column>(
|
||||
fastfield_accessor: &D,
|
||||
estimations: &mut Vec<(f32, FastFieldCodecType)>,
|
||||
) {
|
||||
if let Some(ratio) = C::estimate(fastfield_accessor) {
|
||||
estimations.push((ratio, C::CODEC_TYPE));
|
||||
}
|
||||
}
|
||||
|
||||
/// The normalized header gives some parameters after applying the following
|
||||
/// normalization of the vector:
|
||||
/// val -> (val - min_value) / gcd
|
||||
///
|
||||
/// By design, after normalization, `min_value = 0` and `gcd = 1`.
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
pub struct NormalizedHeader {
|
||||
pub num_vals: u64,
|
||||
@@ -160,6 +154,23 @@ fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64>(
|
||||
}
|
||||
}
|
||||
|
||||
pub fn estimate<T: MonotonicallyMappableToU64>(
|
||||
typed_column: impl Column<T>,
|
||||
codec_type: FastFieldCodecType,
|
||||
) -> Option<f32> {
|
||||
let column = monotonic_map_column(typed_column, T::to_u64);
|
||||
let min_value = column.min_value();
|
||||
let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value))
|
||||
.filter(|gcd| gcd.get() > 1u64);
|
||||
let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64));
|
||||
let normalized_column = monotonic_map_column(&column, |val| divider.divide(val - min_value));
|
||||
match codec_type {
|
||||
FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&normalized_column),
|
||||
FastFieldCodecType::Linear => LinearCodec::estimate(&normalized_column),
|
||||
FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&normalized_column),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn serialize<T: MonotonicallyMappableToU64>(
|
||||
typed_column: impl Column<T>,
|
||||
output: &mut impl io::Write,
|
||||
@@ -188,16 +199,13 @@ fn detect_codec(
|
||||
) -> Option<FastFieldCodecType> {
|
||||
let mut estimations = Vec::new();
|
||||
for &codec in codecs {
|
||||
match codec {
|
||||
FastFieldCodecType::Bitpacked => {
|
||||
codec_estimation::<BitpackedCodec, _>(&column, &mut estimations);
|
||||
}
|
||||
FastFieldCodecType::Linear => {
|
||||
codec_estimation::<LinearCodec, _>(&column, &mut estimations);
|
||||
}
|
||||
FastFieldCodecType::BlockwiseLinear => {
|
||||
codec_estimation::<BlockwiseLinearCodec, _>(&column, &mut estimations);
|
||||
}
|
||||
let estimation_opt = match codec {
|
||||
FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&column),
|
||||
FastFieldCodecType::Linear => LinearCodec::estimate(&column),
|
||||
FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&column),
|
||||
};
|
||||
if let Some(estimation) = estimation_opt {
|
||||
estimations.push((estimation, codec));
|
||||
}
|
||||
}
|
||||
if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan()) {
|
||||
|
||||
@@ -332,13 +332,11 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_multivalue_get_vals() {
|
||||
let doc_id_mapping = DocIdMapping::from_new_id_to_old_id(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
|
||||
let doc_id_mapping =
|
||||
DocIdMapping::from_new_id_to_old_id(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
|
||||
assert_eq!(doc_id_mapping.num_old_doc_ids(), 10);
|
||||
let col = VecColumn::from(&[0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55,][..]);
|
||||
let multivalue_start_index = MultivalueStartIndex::new(
|
||||
&col,
|
||||
&doc_id_mapping,
|
||||
);
|
||||
let col = VecColumn::from(&[0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55][..]);
|
||||
let multivalue_start_index = MultivalueStartIndex::new(&col, &doc_id_mapping);
|
||||
assert_eq!(
|
||||
multivalue_start_index.iter().collect::<Vec<u64>>(),
|
||||
vec![0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55]
|
||||
@@ -351,5 +349,4 @@ mod tests {
|
||||
assert_eq!(multivalue_start_index.get_val(0), 0);
|
||||
assert_eq!(multivalue_start_index.get_val(10), 55);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
use std::io::{self, Write};
|
||||
|
||||
use common::{BinarySerializable, CountingWriter};
|
||||
pub use fastfield_codecs::bitpacked::BitpackedCodec;
|
||||
pub use fastfield_codecs::{Column, FastFieldCodec, FastFieldStats};
|
||||
pub use fastfield_codecs::{Column, FastFieldStats};
|
||||
use fastfield_codecs::{FastFieldCodecType, MonotonicallyMappableToU64, ALL_CODEC_TYPES};
|
||||
|
||||
use crate::directory::{CompositeWrite, WritePtr};
|
||||
|
||||
Reference in New Issue
Block a user