Put deprecated attributes on deprecated codecs. Clean.

This commit is contained in:
François Massot
2021-12-08 12:02:06 +01:00
parent 977f01a8a3
commit 4d66a3f0a0
10 changed files with 68 additions and 159 deletions

View File

@@ -6,8 +6,6 @@ license = "MIT"
edition = "2018"
description = "Fast field codecs used by tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
common = { version = "0.2", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version="0.1.1", path = "../bitpacker/" }
@@ -19,6 +17,6 @@ more-asserts = "0.2.1"
rand = "0.8.3"
[features]
unstable = [] # useful for benches and experimental codecs.
bin = ["prettytable-rs", "rand"]
default = ["bin"]

View File

@@ -4,14 +4,10 @@ extern crate test;
#[cfg(test)]
mod tests {
use fastfield_codecs::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer};
use fastfield_codecs::linearinterpol::{
LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer,
use fastfield_codecs::{
bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer},
*, piecewise_linear::{PiecewiseLinearFastFieldSerializer, PiecewiseLinearFastFieldReader},
};
use fastfield_codecs::multilinearinterpol::{
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
};
use fastfield_codecs::*;
fn get_data() -> Vec<u64> {
let mut data: Vec<_> = (100..55000_u64)
@@ -70,14 +66,9 @@ mod tests {
bench_create::<BitpackedFastFieldSerializer>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
fn bench_fastfield_piecewise_linear_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<LinearInterpolFastFieldSerializer>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<MultiLinearInterpolFastFieldSerializer>(b, &data);
bench_create::<PiecewiseLinearFastFieldSerializer>(b, &data);
}
#[bench]
fn bench_fastfield_bitpack_get(b: &mut Bencher) {
@@ -85,14 +76,9 @@ mod tests {
bench_get::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
fn bench_fastfield_piecewise_linear_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>(
bench_get::<PiecewiseLinearFastFieldSerializer, PiecewiseLinearFastFieldReader>(
b, &data,
);
}

View File

@@ -6,6 +6,7 @@ use std::io;
use std::io::Write;
pub mod bitpacked;
#[cfg(feature = "unstable")]
pub mod frame_of_reference;
pub mod linearinterpol;
pub mod multilinearinterpol;
@@ -93,11 +94,6 @@ impl FastFieldDataAccess for Vec<u64> {
mod tests {
use crate::{
bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer},
frame_of_reference::{FORFastFieldReader, FORFastFieldSerializer},
linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer},
multilinearinterpol::{
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
},
piecewise_linear::{PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer},
};
@@ -168,25 +164,12 @@ mod tests {
fn test_codec_bitpacking() {
test_codec::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>();
}
#[test]
fn test_codec_interpolation() {
test_codec::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>();
}
#[test]
fn test_codec_multi_interpolation() {
test_codec::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>();
}
#[test]
fn test_codec_piecewise_linear() {
test_codec::<PiecewiseLinearFastFieldSerializer, PiecewiseLinearFastFieldReader>();
}
#[test]
fn test_codec_for() {
test_codec::<FORFastFieldSerializer, FORFastFieldReader>();
}
use super::*;
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
let min_value = data.iter().cloned().min().unwrap_or(0);
@@ -202,57 +185,50 @@ mod tests {
fn estimation_good_interpolation_case() {
let data = (10..=20000_u64).collect::<Vec<_>>();
let linear_interpol_estimation =
LinearInterpolFastFieldSerializer::estimate_compression_ratio(
let piecewise_interpol_estimation =
PiecewiseLinearFastFieldSerializer::estimate_compression_ratio(
&data,
stats_from_vec(&data),
);
assert_le!(linear_interpol_estimation, 0.01);
let multi_linear_interpol_estimation =
MultiLinearInterpolFastFieldSerializer::estimate_compression_ratio(
&data,
stats_from_vec(&data),
);
assert_le!(multi_linear_interpol_estimation, 0.2);
assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);
assert_le!(piecewise_interpol_estimation, 0.2);
let bitpacked_estimation =
BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data));
assert_le!(linear_interpol_estimation, bitpacked_estimation);
assert_le!(piecewise_interpol_estimation, bitpacked_estimation);
}
#[test]
fn estimation_test_bad_interpolation_case() {
let data = vec![200, 10, 10, 10, 10, 1000, 20];
let linear_interpol_estimation =
LinearInterpolFastFieldSerializer::estimate_compression_ratio(
let piecewise_interpol_estimation =
PiecewiseLinearFastFieldSerializer::estimate_compression_ratio(
&data,
stats_from_vec(&data),
);
assert_le!(linear_interpol_estimation, 0.32);
assert_le!(piecewise_interpol_estimation, 0.32);
let bitpacked_estimation =
BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data));
assert_le!(bitpacked_estimation, linear_interpol_estimation);
assert_le!(bitpacked_estimation, piecewise_interpol_estimation);
}
#[test]
fn estimation_test_bad_interpolation_case_monotonically_increasing() {
fn estimation_test_interpolation_case_monotonically_increasing() {
let mut data = (200..=20000_u64).collect::<Vec<_>>();
data.push(1_000_000);
// In this case linear interpolation cannot actually be worse than bitpacking,
// but the estimator adds some threshold, which makes its estimate look worse.
let linear_interpol_estimation =
LinearInterpolFastFieldSerializer::estimate_compression_ratio(
let piecewise_interpol_estimation =
PiecewiseLinearFastFieldSerializer::estimate_compression_ratio(
&data,
stats_from_vec(&data),
);
assert_le!(linear_interpol_estimation, 0.35);
assert_le!(piecewise_interpol_estimation, 0.2);
let bitpacked_estimation =
BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data));
println!("{}", bitpacked_estimation);
assert_le!(bitpacked_estimation, 0.32);
assert_le!(bitpacked_estimation, linear_interpol_estimation);
assert_le!(piecewise_interpol_estimation, bitpacked_estimation);
}
}

View File

@@ -88,6 +88,10 @@ impl FastFieldCodecReader for LinearInterpolFastFieldReader {
/// Fastfield serializer, which tries to guess values by linear interpolation
/// and stores the difference bitpacked.
///
#[deprecated(
note = "Linear interpolation works best only on very rare cases and piecewise linear codec already works great on them."
)]
pub struct LinearInterpolFastFieldSerializer {}
#[inline]
@@ -105,6 +109,7 @@ fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
first_val + (pos as f32 * slope) as u64
}
#[allow(deprecated)]
impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
const NAME: &'static str = "LinearInterpol";
const ID: u8 = 2;
@@ -235,6 +240,7 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
}
}
#[allow(deprecated)]
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -1,18 +1,14 @@
#[macro_use]
extern crate prettytable;
use common::f64_to_u64;
use fastfield_codecs::bitpacked::BitpackedFastFieldReader;
#[cfg(feature = "unstable")]
use fastfield_codecs::frame_of_reference::{FORFastFieldReader, FORFastFieldSerializer};
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldReader;
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldReader;
use fastfield_codecs::piecewise_linear::{
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
};
use fastfield_codecs::FastFieldCodecReader;
use fastfield_codecs::{
linearinterpol::LinearInterpolFastFieldSerializer,
multilinearinterpol::MultiLinearInterpolFastFieldSerializer, FastFieldCodecSerializer,
FastFieldStats,
};
use fastfield_codecs::{FastFieldCodecSerializer, FastFieldStats};
use prettytable::{Cell, Row, Table};
use rand::prelude::StdRng;
use rand::Rng;
@@ -35,23 +31,16 @@ fn main() {
for (data, data_set_name) in get_codec_test_data_sets() {
let mut results = vec![];
let res = serialize_with_codec::<
LinearInterpolFastFieldSerializer,
LinearInterpolFastFieldReader,
>(&data);
results.push(res);
let res = serialize_with_codec::<
MultiLinearInterpolFastFieldSerializer,
MultiLinearInterpolFastFieldReader,
>(&data);
results.push(res);
let res = serialize_with_codec::<
PiecewiseLinearFastFieldSerializer,
PiecewiseLinearFastFieldReader,
>(&data);
results.push(res);
let res = serialize_with_codec::<FORFastFieldSerializer, FORFastFieldReader>(&data);
results.push(res);
#[cfg(feature = "unstable")]
{
let res = serialize_with_codec::<FORFastFieldSerializer, FORFastFieldReader>(&data);
results.push(res);
}
let res = serialize_with_codec::<
fastfield_codecs::bitpacked::BitpackedFastFieldSerializer,
BitpackedFastFieldReader,
@@ -168,9 +157,7 @@ pub fn load_float_dataset(file_path: &str) -> Vec<u64> {
for line in lines {
let line_string = line.unwrap();
let value = line_string.parse::<f64>().unwrap();
let bytes = value.to_le_bytes();
let u64_value = u64::from_le_bytes(bytes);
data.push(u64_value);
data.push(f64_to_u64(value));
}
data
}
@@ -202,7 +189,6 @@ pub fn serialize_with_codec<S: FastFieldCodecSerializer, R: FastFieldCodecReader
.unwrap();
let elasped_time_compression = start_time_compression.elapsed();
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
let reader = R::open_from_bytes(&out).unwrap();
let start_time_read = Instant::now();
for doc in 0..data.len() {

View File

@@ -190,8 +190,12 @@ fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
}
/// Same as LinearInterpolFastFieldSerializer, but working on chunks of CHUNK_SIZE elements.
#[deprecated(
note = "MultiLinearInterpol is replaced by PiecewiseLinear codec which fixes the slope and is a little bit more optimized."
)]
pub struct MultiLinearInterpolFastFieldSerializer {}
#[allow(deprecated)]
impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
const NAME: &'static str = "MultiLinearInterpol";
const ID: u8 = 3;
@@ -372,6 +376,7 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
}
#[cfg(test)]
#[allow(deprecated)]
mod tests {
use super::*;
use crate::tests::get_codec_test_data_sets;

View File

@@ -1,7 +1,7 @@
/*!
PiecewiseLinear codec uses piecewise linear functions for every block of 512 values to guess values and stores the
difference between the actual value and the one given by the linear interpolation.
PiecewiseLinear codec uses piecewise linear functions for every block of 512 values to predict
fast field values. The difference between the real and the predicted fast field values is then stored.
For every block, the linear function can be expressed as
`computed_value = slope * block_position + first_value + positive_offset`
where:
@@ -36,13 +36,10 @@ pub struct PiecewiseLinearFastFieldReader {
block_readers: Vec<BlockReader>,
}
/// Block metadata needed to define the linear function `y = a.x + b`
/// and to bitpack the difference between the real value and
/// the linear function computed value where:
/// - `a` is the `slope`
/// - `b` is the sum of the `first_value` in the block + an offset
/// `positive_offset` which ensures that difference between the real
/// value and the linear function computed value is always positive.
/// Block that stores the metadata needed to predict values with a linear
/// function `predicted_value = slope * position + first_value + positive_offset`
/// where `positive_offset` is computed such that the bitpacked difference
/// with the real values is never negative.
#[derive(Clone, Debug, Default)]
struct BlockMetadata {
first_value: u64,
@@ -72,9 +69,9 @@ impl BlockReader {
let diff = self
.bit_unpacker
.get(block_pos, &data[self.start_offset as usize..]);
let computed_value =
get_computed_value(self.metadata.first_value, block_pos, self.metadata.slope);
(computed_value + diff) - self.metadata.positive_offset
let predicted_value =
predict_value(self.metadata.first_value, block_pos, self.metadata.slope);
(predicted_value + diff) - self.metadata.positive_offset
}
}
@@ -88,13 +85,13 @@ impl BinarySerializable for BlockMetadata {
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let constant = u64::deserialize(reader)?;
let constant_positive_offset = u64::deserialize(reader)?;
let first_value = u64::deserialize(reader)?;
let positive_offset = u64::deserialize(reader)?;
let slope = f32::deserialize(reader)?;
let num_bits = u8::deserialize(reader)?;
Ok(Self {
first_value: constant,
positive_offset: constant_positive_offset,
first_value,
positive_offset,
slope,
num_bits,
})
@@ -172,7 +169,7 @@ impl FastFieldCodecReader for PiecewiseLinearFastFieldReader {
}
#[inline]
fn get_computed_value(first_val: u64, pos: u64, slope: f32) -> u64 {
fn predict_value(first_val: u64, pos: u64, slope: f32) -> u64 {
(first_val as i64 + (pos as f32 * slope) as i64) as u64
}
@@ -205,7 +202,7 @@ impl FastFieldCodecSerializer for PiecewiseLinearFastFieldSerializer {
let mut positive_offset = 0;
let mut max_delta = 0;
for (pos, &current_value) in block_values[1..].iter().enumerate() {
let computed_value = get_computed_value(first_value, pos as u64 + 1, slope);
let computed_value = predict_value(first_value, pos as u64 + 1, slope);
if computed_value > current_value {
positive_offset = positive_offset.max(computed_value - current_value);
} else {
@@ -214,7 +211,7 @@ impl FastFieldCodecSerializer for PiecewiseLinearFastFieldSerializer {
}
let num_bits = compute_num_bits(max_delta + positive_offset);
for (pos, current_value) in block_values.iter().enumerate() {
let computed_value = get_computed_value(first_value, pos as u64, slope);
let computed_value = predict_value(first_value, pos as u64, slope);
let diff = (current_value + positive_offset) - computed_value;
bit_packer.write(diff, num_bits, write)?;
}
@@ -282,8 +279,7 @@ impl FastFieldCodecSerializer for PiecewiseLinearFastFieldSerializer {
let max_distance = sample_positions
.iter()
.map(|&pos| {
let calculated_value =
get_computed_value(first_val_in_first_block, pos as u64, slope);
let calculated_value = predict_value(first_val_in_first_block, pos as u64, slope);
let actual_value = fastfield_accessor.get_val(pos as u64);
distance(calculated_value, actual_value)
})

View File

@@ -392,7 +392,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 9597_usize); // FOR codec size
assert_eq!(file.len(), 12471_usize); // Piecewise linear codec size
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(i64_field).unwrap();

View File

@@ -1,5 +1,3 @@
use fastfield_codecs::frame_of_reference::FORFastFieldReader;
use fastfield_codecs::frame_of_reference::FORFastFieldSerializer;
use fastfield_codecs::piecewise_linear::PiecewiseLinearFastFieldReader;
use fastfield_codecs::piecewise_linear::PiecewiseLinearFastFieldSerializer;
use std::collections::HashMap;
@@ -10,11 +8,13 @@ use common::BinarySerializable;
use fastfield_codecs::bitpacked::{
BitpackedFastFieldReader as BitpackedReader, BitpackedFastFieldSerializer,
};
#[allow(deprecated)]
use fastfield_codecs::linearinterpol::{
LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer,
};
#[allow(deprecated)]
use fastfield_codecs::multilinearinterpol::{
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader,
};
use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecSerializer};
@@ -77,8 +77,6 @@ pub enum DynamicFastFieldReader<Item: FastValue> {
MultiLinearInterpol(FastFieldReaderCodecWrapper<Item, MultiLinearInterpolFastFieldReader>),
/// Piecewise linear interpolated values + bitpacked
PiecewiseLinear(FastFieldReaderCodecWrapper<Item, PiecewiseLinearFastFieldReader>),
/// Frame of reference values + bitpacked
FOR(FastFieldReaderCodecWrapper<Item, FORFastFieldReader>),
}
impl<Item: FastValue> DynamicFastFieldReader<Item> {
@@ -94,12 +92,14 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
BitpackedReader,
>::open_from_bytes(bytes)?)
}
#[allow(deprecated)]
LinearInterpolFastFieldSerializer::ID => {
DynamicFastFieldReader::LinearInterpol(FastFieldReaderCodecWrapper::<
Item,
LinearInterpolFastFieldReader,
>::open_from_bytes(bytes)?)
}
#[allow(deprecated)]
MultiLinearInterpolFastFieldSerializer::ID => {
DynamicFastFieldReader::MultiLinearInterpol(FastFieldReaderCodecWrapper::<
Item,
@@ -114,9 +114,6 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
PiecewiseLinearFastFieldReader,
>::open_from_bytes(bytes)?)
}
FORFastFieldSerializer::ID => DynamicFastFieldReader::FOR(
FastFieldReaderCodecWrapper::<Item, FORFastFieldReader>::open_from_bytes(bytes)?,
),
_ => {
panic!(
"unknown fastfield id {:?}. Data corrupted or using old tantivy version.",
@@ -136,7 +133,6 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
Self::LinearInterpol(reader) => reader.get(doc),
Self::MultiLinearInterpol(reader) => reader.get(doc),
Self::PiecewiseLinear(reader) => reader.get(doc),
Self::FOR(reader) => reader.get(doc),
}
}
#[inline]
@@ -146,7 +142,6 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
Self::LinearInterpol(reader) => reader.get_range(start, output),
Self::MultiLinearInterpol(reader) => reader.get_range(start, output),
Self::PiecewiseLinear(reader) => reader.get_range(start, output),
Self::FOR(reader) => reader.get_range(start, output),
}
}
fn min_value(&self) -> Item {
@@ -155,7 +150,6 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
Self::LinearInterpol(reader) => reader.min_value(),
Self::MultiLinearInterpol(reader) => reader.min_value(),
Self::PiecewiseLinear(reader) => reader.min_value(),
Self::FOR(reader) => reader.min_value(),
}
}
fn max_value(&self) -> Item {
@@ -164,7 +158,6 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
Self::LinearInterpol(reader) => reader.max_value(),
Self::MultiLinearInterpol(reader) => reader.max_value(),
Self::PiecewiseLinear(reader) => reader.max_value(),
Self::FOR(reader) => reader.max_value(),
}
}
}

View File

@@ -5,9 +5,6 @@ use common::BinarySerializable;
use common::CountingWriter;
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializerLegacy;
use fastfield_codecs::frame_of_reference::FORFastFieldSerializer;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
use fastfield_codecs::piecewise_linear::PiecewiseLinearFastFieldSerializer;
pub use fastfield_codecs::FastFieldCodecSerializer;
pub use fastfield_codecs::FastFieldDataAccess;
@@ -114,11 +111,6 @@ impl CompositeFastFieldSerializer {
stats.clone(),
&fastfield_accessor,
));
estimations.push(codec_estimation::<FORFastFieldSerializer, _>(
stats.clone(),
&fastfield_accessor,
));
println!("{:?}", estimations);
let best_codec_result = estimations
.iter()
.sorted_by(|result_a, result_b| {
@@ -144,24 +136,6 @@ impl CompositeFastFieldSerializer {
data_iter_2,
)?;
}
LinearInterpolFastFieldSerializer::NAME => {
LinearInterpolFastFieldSerializer::serialize(
field_write,
&fastfield_accessor,
stats,
data_iter_1,
data_iter_2,
)?;
}
MultiLinearInterpolFastFieldSerializer::NAME => {
MultiLinearInterpolFastFieldSerializer::serialize(
field_write,
&fastfield_accessor,
stats,
data_iter_1,
data_iter_2,
)?;
}
PiecewiseLinearFastFieldSerializer::NAME => {
PiecewiseLinearFastFieldSerializer::serialize(
field_write,
@@ -171,15 +145,6 @@ impl CompositeFastFieldSerializer {
data_iter_2,
)?;
}
FORFastFieldSerializer::NAME => {
FORFastFieldSerializer::serialize(
field_write,
&fastfield_accessor,
stats,
data_iter_1,
data_iter_2,
)?;
}
_ => {
panic!("unknown fastfield serializer {}", best_codec_result.name)
}
@@ -285,10 +250,8 @@ mod tests {
// get the codecs id
let mut bytes = directory.open_read(path)?.read_bytes()?;
let codec_id = u8::deserialize(&mut bytes)?;
// Codec id = 1 is bitpacking
assert_eq!(codec_id, 5);
//let reader = FastFieldReaderCodecWrapper::<u64, BitpackedFastFieldReader>::open(file_slice)?;
//assert_eq!(reader.get_u64(0), 0);
// Codec id = 4 is piecewise linear.
assert_eq!(codec_id, 4);
Ok(())
}
}