This commit is contained in:
Paul Masurel
2022-09-05 23:21:12 +09:00
parent c632fc014e
commit c5d30a54bc
9 changed files with 94 additions and 111 deletions

View File

@@ -22,6 +22,7 @@ impl Column for BitpackedReader {
}
#[inline]
fn min_value(&self) -> u64 {
// The BitpackedReader assumes a normalized vector.
0
}
#[inline]
@@ -58,19 +59,24 @@ impl FastFieldCodec for BitpackedCodec {
/// Serializes data with the BitpackedFastFieldSerializer.
///
/// The bitpacker assumes that the column has been normalized.
/// i.e. It has already been shifted by its minimum value, so that its
/// current minimum value is 0.
///
/// The caller is expected to have applied this shift upstream, so that `column.min_value() == 0`.
fn serialize(col: &dyn Column, write: &mut impl Write) -> io::Result<()> {
let num_bits = compute_num_bits(col.max_value());
fn serialize(column: &dyn Column, write: &mut impl Write) -> io::Result<()> {
assert_eq!(column.min_value(), 0u64);
let num_bits = compute_num_bits(column.max_value());
let mut bit_packer = BitPacker::new();
for val in col.iter() {
for val in column.iter() {
bit_packer.write(val, num_bits, write)?;
}
bit_packer.close(write)?;
Ok(())
}
fn estimate(col: &impl Column) -> Option<f32> {
let num_bits = compute_num_bits(col.max_value());
fn estimate(column: &impl Column) -> Option<f32> {
let num_bits = compute_num_bits(column.max_value());
let num_bits_uncompressed = 64;
Some(num_bits as f32 / num_bits_uncompressed as f32)
}

View File

@@ -71,14 +71,11 @@ impl FastFieldCodec for BlockwiseLinearCodec {
}
// Estimate first_chunk and extrapolate
fn estimate(fastfield_accessor: &impl crate::Column) -> Option<f32> {
if fastfield_accessor.num_vals() < 10 * CHUNK_SIZE as u64 {
fn estimate(column: &impl crate::Column) -> Option<f32> {
if column.num_vals() < 10 * CHUNK_SIZE as u64 {
return None;
}
let mut first_chunk: Vec<u64> = fastfield_accessor
.iter()
.take(CHUNK_SIZE as usize)
.collect();
let mut first_chunk: Vec<u64> = column.iter().take(CHUNK_SIZE as usize).collect();
let line = Line::train(&VecColumn::from(&first_chunk));
for (i, buffer_val) in first_chunk.iter_mut().enumerate() {
let interpolated_val = line.eval(i as u64);
@@ -96,24 +93,23 @@ impl FastFieldCodec for BlockwiseLinearCodec {
Block::default().serialize(&mut out).unwrap();
out.len()
};
let num_bits = estimated_bit_width as u64 * fastfield_accessor.num_vals() as u64
let num_bits = estimated_bit_width as u64 * column.num_vals() as u64
// function metadata per block
+ metadata_per_block as u64 * (fastfield_accessor.num_vals() / CHUNK_SIZE as u64);
let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
+ metadata_per_block as u64 * (column.num_vals() / CHUNK_SIZE as u64);
let num_bits_uncompressed = 64 * column.num_vals();
Some(num_bits as f32 / num_bits_uncompressed as f32)
}
fn serialize(
fastfield_accessor: &dyn crate::Column,
wrt: &mut impl io::Write,
) -> io::Result<()> {
fn serialize(column: &dyn crate::Column, wrt: &mut impl io::Write) -> io::Result<()> {
// The BlockwiseLinearReader assumes a normalized vector.
assert_eq!(column.min_value(), 0);
let mut buffer = Vec::with_capacity(CHUNK_SIZE);
let num_vals = fastfield_accessor.num_vals();
let num_vals = column.num_vals();
let num_blocks = compute_num_blocks(num_vals);
let mut blocks = Vec::with_capacity(num_blocks);
let mut vals = fastfield_accessor.iter();
let mut vals = column.iter();
let mut bit_packer = BitPacker::new();
@@ -176,6 +172,7 @@ impl Column for BlockwiseLinearReader {
}
fn min_value(&self) -> u64 {
// The BlockwiseLinearReader assumes a normalized vector.
0u64
}

View File

@@ -34,16 +34,18 @@ pub trait Column<T = u64> {
/// Returns the minimum value for this fast field.
///
/// The min value does not take in account of possible
/// deleted document, and should be considered as a lower bound
/// of the actual minimum value.
/// This min_value may not be exact.
/// For instance, the min value does not take into account possible
/// deleted documents. All values are however guaranteed to be greater than
/// or equal to `.min_value()`.
fn min_value(&self) -> T;
/// Returns the maximum value for this fast field.
///
/// The max value does not take in account of possible
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value
/// This max_value may not be exact.
/// For instance, the max value does not take into account possible
/// deleted documents. All values are however guaranteed to be lower than
/// or equal to `.max_value()`.
fn max_value(&self) -> T;
fn num_vals(&self) -> u64;

View File

@@ -13,17 +13,17 @@ use std::io::Write;
use common::BinarySerializable;
use ownedbytes::OwnedBytes;
pub mod bitpacked;
pub mod blockwise_linear;
mod bitpacked;
mod blockwise_linear;
pub(crate) mod line;
pub mod linear;
mod linear;
mod column;
mod gcd;
mod serialize;
pub use self::column::{monotonic_map_column, Column, VecColumn};
pub use self::serialize::{open, serialize, serialize_and_load, NormalizedHeader};
pub use self::serialize::{estimate, open, serialize, serialize_and_load, NormalizedHeader};
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
#[repr(u8)]
@@ -124,7 +124,7 @@ impl MonotonicallyMappableToU64 for f64 {
/// The `FastFieldCodec` trait must be implemented by every fast field
/// compression variant; its estimates are used to decide which one to choose.
pub trait FastFieldCodec: 'static {
trait FastFieldCodec: 'static {
/// A codec needs to provide a unique name and id, which are
/// used for debugging and de/serialization.
const CODEC_TYPE: FastFieldCodecType;

View File

@@ -27,6 +27,7 @@ impl Column for LinearReader {
#[inline]
fn min_value(&self) -> u64 {
// The LinearReader assumes a normalized vector.
0u64
}
@@ -84,11 +85,11 @@ impl FastFieldCodec for LinearCodec {
}
/// Creates a new fast field serializer.
fn serialize(fastfield_accessor: &dyn Column, write: &mut impl Write) -> io::Result<()> {
assert!(fastfield_accessor.min_value() <= fastfield_accessor.max_value());
let line = Line::train(fastfield_accessor);
fn serialize(column: &dyn Column, write: &mut impl Write) -> io::Result<()> {
assert_eq!(column.min_value(), 0);
let line = Line::train(column);
let max_offset_from_line = fastfield_accessor
let max_offset_from_line = column
.iter()
.enumerate()
.map(|(pos, actual_value)| {
@@ -106,7 +107,7 @@ impl FastFieldCodec for LinearCodec {
linear_params.serialize(write)?;
let mut bit_packer = BitPacker::new();
for (pos, actual_value) in fastfield_accessor.iter().enumerate() {
for (pos, actual_value) in column.iter().enumerate() {
let calculated_value = line.eval(pos as u64);
let offset = actual_value.wrapping_sub(calculated_value);
bit_packer.write(offset, num_bits, write)?;
@@ -120,23 +121,23 @@ impl FastFieldCodec for LinearCodec {
/// where the local maxima for the deviation of the calculated value are and
/// the offset to shift all values to >=0 is also unknown.
#[allow(clippy::question_mark)]
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
if fastfield_accessor.num_vals() < 3 {
fn estimate(column: &impl Column) -> Option<f32> {
if column.num_vals() < 3 {
return None; // disable compressor for this case
}
// let's sample 20 positions, at 0%, 5%, 10% .. 95% of the column
let num_vals = fastfield_accessor.num_vals() as f32 / 100.0;
let num_vals = column.num_vals() as f32 / 100.0;
let sample_positions = (0..20)
.map(|pos| (num_vals * pos as f32 * 5.0) as u64)
.collect::<Vec<_>>();
let line = Line::estimate(fastfield_accessor, &sample_positions);
let line = Line::estimate(column, &sample_positions);
let estimated_bit_width = sample_positions
.into_iter()
.map(|pos| {
let actual_value = fastfield_accessor.get_val(pos);
let actual_value = column.get_val(pos);
let interpolated_val = line.eval(pos as u64);
actual_value.wrapping_sub(interpolated_val)
})
@@ -145,8 +146,8 @@ impl FastFieldCodec for LinearCodec {
.max()
.unwrap_or(0);
let num_bits = (estimated_bit_width as u64 * fastfield_accessor.num_vals() as u64) + 64;
let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
let num_bits = (estimated_bit_width as u64 * column.num_vals() as u64) + 64;
let num_bits_uncompressed = 64 * column.num_vals();
Some(num_bits as f32 / num_bits_uncompressed as f32)
}
}

View File

@@ -1,35 +1,8 @@
#[macro_use]
extern crate prettytable;
use fastfield_codecs::bitpacked::BitpackedCodec;
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType, FastFieldStats};
use fastfield_codecs::{Column, FastFieldCodecType, FastFieldStats, VecColumn};
use prettytable::{Cell, Row, Table};
struct Data<'a>(&'a [u64]);
impl<'a> Column for Data<'a> {
fn get_val(&self, position: u64) -> u64 {
self.0[position as usize]
}
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
Box::new(self.0.iter().cloned())
}
fn min_value(&self) -> u64 {
*self.0.iter().min().unwrap_or(&0)
}
fn max_value(&self) -> u64 {
*self.0.iter().max().unwrap_or(&0)
}
fn num_vals(&self) -> u64 {
self.0.len() as u64
}
}
fn main() {
let mut table = Table::new();
@@ -38,10 +11,9 @@ fn main() {
for (data, data_set_name) in get_codec_test_data_sets() {
let results: Vec<(f32, f32, FastFieldCodecType)> = [
serialize_with_codec::<LinearCodec>(&data),
serialize_with_codec::<BlockwiseLinearCodec>(&data),
serialize_with_codec::<BlockwiseLinearCodec>(&data),
serialize_with_codec::<BitpackedCodec>(&data),
serialize_with_codec(&data, FastFieldCodecType::Bitpacked),
serialize_with_codec(&data, FastFieldCodecType::Linear),
serialize_with_codec(&data, FastFieldCodecType::BlockwiseLinear),
]
.into_iter()
.flatten()
@@ -107,15 +79,16 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
data_and_names
}
pub fn serialize_with_codec<C: FastFieldCodec>(
pub fn serialize_with_codec(
data: &[u64],
codec_type: FastFieldCodecType,
) -> Option<(f32, f32, FastFieldCodecType)> {
let data = Data(data);
let estimation = C::estimate(&data)?;
let col = VecColumn::from(data);
let estimation = fastfield_codecs::estimate(&col, codec_type)?;
let mut out = Vec::new();
C::serialize(&data, &mut out).unwrap();
let actual_compression = out.len() as f32 / (data.num_vals() * 8) as f32;
Some((estimation, actual_compression, C::CODEC_TYPE))
fastfield_codecs::serialize(&col, &mut out, &[codec_type]).ok()?;
let actual_compression = out.len() as f32 / (col.num_vals() * 8) as f32;
Some((estimation, actual_compression, codec_type))
}
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {

View File

@@ -34,17 +34,11 @@ use crate::{
VecColumn, ALL_CODEC_TYPES,
};
// use this, when this is merged and stabilized explicit_generic_args_with_impl_trait
// https://github.com/rust-lang/rust/pull/86176
fn codec_estimation<C: FastFieldCodec, D: Column>(
fastfield_accessor: &D,
estimations: &mut Vec<(f32, FastFieldCodecType)>,
) {
if let Some(ratio) = C::estimate(fastfield_accessor) {
estimations.push((ratio, C::CODEC_TYPE));
}
}
/// The normalized header gives some parameters after applying the following
/// normalization of the vector:
/// val -> (val - min_value) / gcd
///
/// By design, after normalization, `min_value = 0` and `gcd = 1`.
#[derive(Debug, Copy, Clone)]
pub struct NormalizedHeader {
pub num_vals: u64,
@@ -160,6 +154,23 @@ fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64>(
}
}
/// Estimates the compression ratio the codec `codec_type` would reach on `typed_column`.
///
/// The column is first mapped to `u64` and normalized in the same way as described
/// on `NormalizedHeader` (`val -> (val - min_value) / gcd`), and the estimate is then
/// computed by the selected codec on that normalized column.
///
/// Returns the ratio `estimated number of bits / uncompressed number of bits` as
/// computed by the codec, or `None` if the codec declines to give an estimate
/// for this column (e.g. the linear codecs do so when there are too few values).
pub fn estimate<T: MonotonicallyMappableToU64>(
typed_column: impl Column<T>,
codec_type: FastFieldCodecType,
) -> Option<f32> {
// Work on the u64 representation of the values.
let column = monotonic_map_column(typed_column, T::to_u64);
let min_value = column.min_value();
// A gcd of 1 brings nothing: it is discarded here so that the divider below
// falls back to dividing by 1.
let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value))
.filter(|gcd| gcd.get() > 1u64);
let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64));
// Shift by the minimum value, then divide by the gcd.
let normalized_column = monotonic_map_column(&column, |val| divider.divide(val - min_value));
match codec_type {
FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&normalized_column),
FastFieldCodecType::Linear => LinearCodec::estimate(&normalized_column),
FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&normalized_column),
}
}
pub fn serialize<T: MonotonicallyMappableToU64>(
typed_column: impl Column<T>,
output: &mut impl io::Write,
@@ -188,16 +199,13 @@ fn detect_codec(
) -> Option<FastFieldCodecType> {
let mut estimations = Vec::new();
for &codec in codecs {
match codec {
FastFieldCodecType::Bitpacked => {
codec_estimation::<BitpackedCodec, _>(&column, &mut estimations);
}
FastFieldCodecType::Linear => {
codec_estimation::<LinearCodec, _>(&column, &mut estimations);
}
FastFieldCodecType::BlockwiseLinear => {
codec_estimation::<BlockwiseLinearCodec, _>(&column, &mut estimations);
}
let estimation_opt = match codec {
FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&column),
FastFieldCodecType::Linear => LinearCodec::estimate(&column),
FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&column),
};
if let Some(estimation) = estimation_opt {
estimations.push((estimation, codec));
}
}
if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan()) {

View File

@@ -332,13 +332,11 @@ mod tests {
#[test]
fn test_multivalue_get_vals() {
let doc_id_mapping = DocIdMapping::from_new_id_to_old_id(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
let doc_id_mapping =
DocIdMapping::from_new_id_to_old_id(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
assert_eq!(doc_id_mapping.num_old_doc_ids(), 10);
let col = VecColumn::from(&[0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55,][..]);
let multivalue_start_index = MultivalueStartIndex::new(
&col,
&doc_id_mapping,
);
let col = VecColumn::from(&[0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55][..]);
let multivalue_start_index = MultivalueStartIndex::new(&col, &doc_id_mapping);
assert_eq!(
multivalue_start_index.iter().collect::<Vec<u64>>(),
vec![0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55]
@@ -351,5 +349,4 @@ mod tests {
assert_eq!(multivalue_start_index.get_val(0), 0);
assert_eq!(multivalue_start_index.get_val(10), 55);
}
}

View File

@@ -1,8 +1,7 @@
use std::io::{self, Write};
use common::{BinarySerializable, CountingWriter};
pub use fastfield_codecs::bitpacked::BitpackedCodec;
pub use fastfield_codecs::{Column, FastFieldCodec, FastFieldStats};
pub use fastfield_codecs::{Column, FastFieldStats};
use fastfield_codecs::{FastFieldCodecType, MonotonicallyMappableToU64, ALL_CODEC_TYPES};
use crate::directory::{CompositeWrite, WritePtr};