Compare commits

...

5 Commits

Author SHA1 Message Date
Paul Masurel
e765706487 Removing Deserializer trait
And renaming the `Serializer` trait `FastFieldCodec`.
2022-08-27 21:02:02 +02:00
Pascal Seitz
fdd0f63787 merge traits 2022-08-27 17:01:41 +02:00
Pascal Seitz
fd60e6fe08 rename get_u64 to ge_val 2022-08-27 17:01:41 +02:00
Pascal Seitz
02c3252d1e split open_from_bytes to own trait 2022-08-27 17:01:39 +02:00
Pascal Seitz
4a6f36937c num_vals to FastFieldCodecReader 2022-08-27 17:00:55 +02:00
11 changed files with 296 additions and 292 deletions

View File

@@ -4,9 +4,9 @@ extern crate test;
#[cfg(test)]
mod tests {
use fastfield_codecs::bitpacked::{BitpackedReader, BitpackedSerializer};
use fastfield_codecs::blockwise_linear::{BlockwiseLinearReader, BlockwiseLinearSerializer};
use fastfield_codecs::linear::{LinearReader, LinearSerializer};
use fastfield_codecs::bitpacked::BitpackedCodec;
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::*;
fn get_data() -> Vec<u64> {
@@ -25,27 +25,25 @@ mod tests {
fn value_iter() -> impl Iterator<Item = u64> {
0..20_000
}
fn bench_get<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
b: &mut Bencher,
data: &[u64],
) {
fn bench_get<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
let mut bytes = vec![];
S::serialize(&mut bytes, &data).unwrap();
let reader = R::open_from_bytes(OwnedBytes::new(bytes)).unwrap();
Codec::serialize(&mut bytes, &data).unwrap();
let reader = Codec::open_from_bytes(OwnedBytes::new(bytes)).unwrap();
b.iter(|| {
let mut sum = 0u64;
for pos in value_iter() {
let val = reader.get_u64(pos as u64);
let val = reader.get_val(pos as u64);
debug_assert_eq!(data[pos as usize], val);
sum = sum.wrapping_add(val);
}
sum
});
}
fn bench_create<S: FastFieldCodecSerializer>(b: &mut Bencher, data: &[u64]) {
let mut bytes = vec![];
fn bench_create<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
let mut bytes = Vec::new();
b.iter(|| {
S::serialize(&mut bytes, &data).unwrap();
bytes.clear();
Codec::serialize(&mut bytes, &data).unwrap();
});
}
@@ -54,32 +52,32 @@ mod tests {
#[bench]
fn bench_fastfield_bitpack_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<BitpackedSerializer>(b, &data);
bench_create::<BitpackedCodec>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<LinearSerializer>(b, &data);
bench_create::<LinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<BlockwiseLinearSerializer>(b, &data);
bench_create::<BlockwiseLinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_bitpack_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<BitpackedSerializer, BitpackedReader>(b, &data);
bench_get::<BitpackedCodec>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<LinearSerializer, LinearReader>(b, &data);
bench_get::<LinearCodec>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<BlockwiseLinearSerializer, BlockwiseLinearReader>(b, &data);
bench_get::<BlockwiseLinearCodec>(b, &data);
}
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
let min_value = data.iter().cloned().min().unwrap_or(0);

View File

@@ -4,9 +4,7 @@ use common::BinarySerializable;
use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::{
FastFieldCodecReader, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
};
use crate::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess};
/// Depending on the field type, a different
/// fast field is required.
@@ -14,29 +12,14 @@ use crate::{
pub struct BitpackedReader {
data: OwnedBytes,
bit_unpacker: BitUnpacker,
pub min_value_u64: u64,
pub max_value_u64: u64,
min_value_u64: u64,
max_value_u64: u64,
num_vals: u64,
}
impl FastFieldCodecReader for BitpackedReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_offset = bytes.len() - 16;
let (data, mut footer) = bytes.split(footer_offset);
let min_value = u64::deserialize(&mut footer)?;
let amplitude = u64::deserialize(&mut footer)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(BitpackedReader {
data,
min_value_u64: min_value,
max_value_u64: max_value,
bit_unpacker,
})
}
impl FastFieldDataAccess for BitpackedReader {
#[inline]
fn get_u64(&self, doc: u64) -> u64 {
fn get_val(&self, doc: u64) -> u64 {
self.min_value_u64 + self.bit_unpacker.get(doc, &self.data)
}
#[inline]
@@ -47,11 +30,16 @@ impl FastFieldCodecReader for BitpackedReader {
fn max_value(&self) -> u64 {
self.max_value_u64
}
#[inline]
fn num_vals(&self) -> u64 {
self.num_vals
}
}
pub struct BitpackedSerializerLegacy<'a, W: 'a + Write> {
bit_packer: BitPacker,
write: &'a mut W,
min_value: u64,
num_vals: u64,
amplitude: u64,
num_bits: u8,
}
@@ -78,6 +66,7 @@ impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> {
bit_packer,
write,
min_value,
num_vals: 0,
amplitude,
num_bits,
})
@@ -88,22 +77,45 @@ impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> {
let val_to_write: u64 = val - self.min_value;
self.bit_packer
.write(val_to_write, self.num_bits, &mut self.write)?;
self.num_vals += 1;
Ok(())
}
pub fn close_field(mut self) -> io::Result<()> {
self.bit_packer.close(&mut self.write)?;
self.min_value.serialize(&mut self.write)?;
self.amplitude.serialize(&mut self.write)?;
self.num_vals.serialize(&mut self.write)?;
Ok(())
}
}
pub struct BitpackedSerializer {}
pub struct BitpackedCodec;
impl FastFieldCodecSerializer for BitpackedSerializer {
impl FastFieldCodec for BitpackedCodec {
/// The CODEC_TYPE is an enum value used for serialization.
const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Bitpacked;
type Reader = BitpackedReader;
/// Opens a bitpacked fast field from its serialized bytes.
///
/// The last 24 bytes are read as a footer of three `u64` values, in this order:
/// the minimum value, the amplitude (`max_value - min_value`) and the number of
/// values. Everything before the footer is kept as the bitpacked payload.
///
/// # Errors
///
/// Returns an `io::ErrorKind::InvalidData` error if `bytes` is shorter than the
/// footer or if `min_value + amplitude` does not fit in a `u64`. Errors raised
/// while deserializing the footer are propagated.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
    // min_value, amplitude and num_vals, each stored as a u64.
    const FOOTER_NUM_BYTES: usize = 3 * std::mem::size_of::<u64>();
    // A truncated buffer must surface as an error, not as an integer underflow.
    let footer_offset = bytes.len().checked_sub(FOOTER_NUM_BYTES).ok_or_else(|| {
        io::Error::new(
            io::ErrorKind::InvalidData,
            "bitpacked fast field is shorter than its footer",
        )
    })?;
    let (data, mut footer) = bytes.split(footer_offset);
    let min_value = u64::deserialize(&mut footer)?;
    let amplitude = u64::deserialize(&mut footer)?;
    let num_vals = u64::deserialize(&mut footer)?;
    // A corrupted footer could make this sum overflow.
    let max_value = min_value.checked_add(amplitude).ok_or_else(|| {
        io::Error::new(
            io::ErrorKind::InvalidData,
            "bitpacked fast field footer overflows u64 (min_value + amplitude)",
        )
    })?;
    let num_bits = compute_num_bits(amplitude);
    let bit_unpacker = BitUnpacker::new(num_bits);
    Ok(BitpackedReader {
        data,
        bit_unpacker,
        min_value_u64: min_value,
        max_value_u64: max_value,
        num_vals,
    })
}
/// Serializes data with the BitpackedFastFieldSerializer.
///
/// The serializer in fact encodes the values by bitpacking
@@ -146,7 +158,7 @@ mod tests {
use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) {
crate::tests::create_and_validate::<BitpackedSerializer, BitpackedReader>(data, name);
crate::tests::create_and_validate::<BitpackedCodec>(data, name);
}
#[test]

View File

@@ -18,9 +18,7 @@ use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::linear::{get_calculated_value, get_slope};
use crate::{
FastFieldCodecReader, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
};
use crate::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess};
const CHUNK_SIZE: u64 = 512;
@@ -148,18 +146,9 @@ fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Functio
&interpolations[get_interpolation_position(doc)]
}
impl FastFieldCodecReader for BlockwiseLinearReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
let footer_offset = bytes.len() - 4 - footer_len as usize;
let (data, mut footer) = bytes.split(footer_offset);
let footer = BlockwiseLinearFooter::deserialize(&mut footer)?;
Ok(BlockwiseLinearReader { data, footer })
}
impl FastFieldDataAccess for BlockwiseLinearReader {
#[inline]
fn get_u64(&self, idx: u64) -> u64 {
fn get_val(&self, idx: u64) -> u64 {
let interpolation = get_interpolation_function(idx, &self.footer.interpolations);
let in_block_idx = idx - interpolation.start_pos;
let calculated_value = get_calculated_value(
@@ -182,13 +171,29 @@ impl FastFieldCodecReader for BlockwiseLinearReader {
fn max_value(&self) -> u64 {
self.footer.max_value
}
#[inline]
fn num_vals(&self) -> u64 {
self.footer.num_vals
}
}
/// Same as LinearSerializer, but working on chunks of CHUNK_SIZE elements.
pub struct BlockwiseLinearSerializer {}
pub struct BlockwiseLinearCodec;
impl FastFieldCodecSerializer for BlockwiseLinearSerializer {
impl FastFieldCodec for BlockwiseLinearCodec {
const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::BlockwiseLinear;
type Reader = BlockwiseLinearReader;
/// Opens a blockwise-linear fast field from its serialized bytes.
///
/// The last 4 bytes hold the footer length as a `u32`. The footer sits
/// immediately before those 4 bytes, and everything before the footer is the
/// payload.
///
/// # Errors
///
/// Returns an `io::ErrorKind::InvalidData` error if `bytes` is too short to
/// contain the footer length, or if the declared footer length exceeds the
/// available bytes. Errors raised while deserializing are propagated.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
    const FOOTER_LEN_NUM_BYTES: usize = std::mem::size_of::<u32>();
    // Guard both subtractions: a truncated or corrupted buffer must not underflow.
    let footer_len_offset = bytes
        .len()
        .checked_sub(FOOTER_LEN_NUM_BYTES)
        .ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidData,
                "blockwise linear fast field is too short to contain its footer length",
            )
        })?;
    let footer_len: u32 = (&bytes[footer_len_offset..]).deserialize()?;
    let footer_offset = footer_len_offset
        .checked_sub(footer_len as usize)
        .ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidData,
                "blockwise linear fast field footer length exceeds the available data",
            )
        })?;
    let (data, mut footer) = bytes.split(footer_offset);
    let footer = BlockwiseLinearFooter::deserialize(&mut footer)?;
    Ok(BlockwiseLinearReader { data, footer })
}
/// Creates a new fast field serializer.
fn serialize(
write: &mut impl Write,
@@ -363,9 +368,7 @@ mod tests {
use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate::<BlockwiseLinearSerializer, BlockwiseLinearReader>(
data, name,
)
// The shared test helper takes a single codec type parameter; the reader type is `Codec::Reader`.
crate::tests::create_and_validate::<BlockwiseLinearCodec>(data, name)
}
const HIGHEST_BIT: u64 = 1 << 63;

View File

@@ -12,12 +12,15 @@ pub mod bitpacked;
pub mod blockwise_linear;
pub mod linear;
pub trait FastFieldCodecReader: Sized {
/// reads the metadata and returns the CodecReader
fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result<Self>;
fn get_u64(&self, doc: u64) -> u64;
/// Read access to a sequence of `u64` fast field values, addressed by position.
pub trait FastFieldDataAccess {
/// Returns the value stored at position `doc`.
///
/// NOTE(review): implementations presumably panic when `doc >= num_vals()` — confirm
/// per implementation.
fn get_val(&self, doc: u64) -> u64;
/// Minimum value of the data.
fn min_value(&self) -> u64;
/// Maximum value of the data.
fn max_value(&self) -> u64;
/// Number of values.
fn num_vals(&self) -> u64;
/// Returns an iterator over the data.
///
/// The default implementation calls `get_val` for every position in `0..num_vals()`.
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = u64> + 'a> {
Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
}
}
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
@@ -60,11 +63,25 @@ impl FastFieldCodecType {
/// The FastFieldSerializerEstimate trait is required on all variants
/// of fast field compressions, to decide which one to choose.
pub trait FastFieldCodecSerializer {
pub trait FastFieldCodec {
/// A codec needs to provide a unique name and id, which is
/// used for debugging and de/serialization.
const CODEC_TYPE: FastFieldCodecType;
type Reader: FastFieldDataAccess;
/// Reads the metadata and returns the CodecReader
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader>;
/// Serializes the data using the serializer into write.
///
/// The fastfield_accessor iterator should be preferred over using fastfield_accessor for
/// performance reasons.
fn serialize(
write: &mut impl Write,
fastfield_accessor: &dyn FastFieldDataAccess,
) -> io::Result<()>;
/// Check if the Codec is able to compress the data
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool;
@@ -74,40 +91,6 @@ pub trait FastFieldCodecSerializer {
/// It could make sense to also return a value representing
/// computational complexity.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32;
/// Serializes the data using the serializer into write.
///
/// The fastfield_accessor iterator should be preferred over using fastfield_accessor for
/// performance reasons.
fn serialize(
write: &mut impl Write,
fastfield_accessor: &dyn FastFieldDataAccess,
) -> io::Result<()>;
}
/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
pub trait FastFieldDataAccess {
/// Return the value associated to the given position.
///
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance
/// reasons.
///
/// # Panics
///
/// May panic if `position` is greater than the index.
fn get_val(&self, position: u64) -> u64;
/// Returns a iterator over the data
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_>;
/// min value of the data
fn min_value(&self) -> u64;
/// max value of the data
fn max_value(&self) -> u64;
/// num vals
fn num_vals(&self) -> u64;
}
#[derive(Debug, Clone)]
@@ -165,26 +148,24 @@ mod tests {
use proptest::arbitrary::any;
use proptest::proptest;
use crate::bitpacked::{BitpackedReader, BitpackedSerializer};
use crate::blockwise_linear::{BlockwiseLinearReader, BlockwiseLinearSerializer};
use crate::linear::{LinearReader, LinearSerializer};
use crate::bitpacked::BitpackedCodec;
use crate::blockwise_linear::BlockwiseLinearCodec;
use crate::linear::LinearCodec;
pub fn create_and_validate<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
data: &[u64],
name: &str,
) -> (f32, f32) {
if !S::is_applicable(&data) {
pub fn create_and_validate<Codec: FastFieldCodec>(data: &[u64], name: &str) -> (f32, f32) {
if !Codec::is_applicable(&data) {
return (f32::MAX, 0.0);
}
let estimation = S::estimate(&data);
let estimation = Codec::estimate(&data);
let mut out: Vec<u8> = Vec::new();
S::serialize(&mut out, &data).unwrap();
Codec::serialize(&mut out, &data).unwrap();
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
let reader = R::open_from_bytes(OwnedBytes::new(out)).unwrap();
let reader = Codec::open_from_bytes(OwnedBytes::new(out)).unwrap();
assert_eq!(reader.num_vals(), data.len() as u64);
for (doc, orig_val) in data.iter().enumerate() {
let val = reader.get_u64(doc as u64);
let val = reader.get_val(doc as u64);
if val != *orig_val {
panic!(
"val {val:?} does not match orig_val {orig_val:?}, in data set {name}, data \
@@ -198,16 +179,16 @@ mod tests {
proptest! {
#[test]
fn test_proptest_small(data in proptest::collection::vec(any::<u64>(), 1..10)) {
create_and_validate::<LinearSerializer, LinearReader>(&data, "proptest linearinterpol");
create_and_validate::<BlockwiseLinearSerializer, BlockwiseLinearReader>(&data, "proptest multilinearinterpol");
create_and_validate::<BitpackedSerializer, BitpackedReader>(&data, "proptest bitpacked");
create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
}
#[test]
fn test_proptest_large(data in proptest::collection::vec(any::<u64>(), 1..6000)) {
create_and_validate::<LinearSerializer, LinearReader>(&data, "proptest linearinterpol");
create_and_validate::<BlockwiseLinearSerializer, BlockwiseLinearReader>(&data, "proptest multilinearinterpol");
create_and_validate::<BitpackedSerializer, BitpackedReader>(&data, "proptest bitpacked");
create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
}
}
@@ -228,10 +209,10 @@ mod tests {
data_and_names
}
fn test_codec<S: FastFieldCodecSerializer, R: FastFieldCodecReader>() {
let codec_name = format!("{:?}", S::CODEC_TYPE);
fn test_codec<C: FastFieldCodec>() {
let codec_name = format!("{:?}", C::CODEC_TYPE);
for (data, dataset_name) in get_codec_test_data_sets() {
let (estimate, actual) = crate::tests::create_and_validate::<S, R>(&data, dataset_name);
let (estimate, actual) = crate::tests::create_and_validate::<C>(&data, dataset_name);
let result = if estimate == f32::MAX {
"Disabled".to_string()
} else {
@@ -242,15 +223,15 @@ mod tests {
}
#[test]
fn test_codec_bitpacking() {
test_codec::<BitpackedSerializer, BitpackedReader>();
test_codec::<BitpackedCodec>();
}
#[test]
fn test_codec_interpolation() {
test_codec::<LinearSerializer, LinearReader>();
test_codec::<LinearCodec>();
}
#[test]
fn test_codec_multi_interpolation() {
test_codec::<BlockwiseLinearSerializer, BlockwiseLinearReader>();
test_codec::<BlockwiseLinearCodec>();
}
use super::*;
@@ -259,24 +240,24 @@ mod tests {
fn estimation_good_interpolation_case() {
let data = (10..=20000_u64).collect::<Vec<_>>();
let linear_interpol_estimation = LinearSerializer::estimate(&data);
let linear_interpol_estimation = LinearCodec::estimate(&data);
assert_le!(linear_interpol_estimation, 0.01);
let multi_linear_interpol_estimation = BlockwiseLinearSerializer::estimate(&data);
let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data);
assert_le!(multi_linear_interpol_estimation, 0.2);
assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);
let bitpacked_estimation = BitpackedSerializer::estimate(&data);
let bitpacked_estimation = BitpackedCodec::estimate(&data);
assert_le!(linear_interpol_estimation, bitpacked_estimation);
}
#[test]
fn estimation_test_bad_interpolation_case() {
let data = vec![200, 10, 10, 10, 10, 1000, 20];
let linear_interpol_estimation = LinearSerializer::estimate(&data);
let linear_interpol_estimation = LinearCodec::estimate(&data);
assert_le!(linear_interpol_estimation, 0.32);
let bitpacked_estimation = BitpackedSerializer::estimate(&data);
let bitpacked_estimation = BitpackedCodec::estimate(&data);
assert_le!(bitpacked_estimation, linear_interpol_estimation);
}
#[test]
@@ -286,10 +267,10 @@ mod tests {
// in this case the linear interpolation can in fact not be worse than bitpacking,
// but the estimator adds some threshold, which leads to estimated worse behavior
let linear_interpol_estimation = LinearSerializer::estimate(&data);
let linear_interpol_estimation = LinearCodec::estimate(&data);
assert_le!(linear_interpol_estimation, 0.35);
let bitpacked_estimation = BitpackedSerializer::estimate(&data);
let bitpacked_estimation = BitpackedCodec::estimate(&data);
assert_le!(bitpacked_estimation, 0.32);
assert_le!(bitpacked_estimation, linear_interpol_estimation);
}

View File

@@ -5,9 +5,7 @@ use common::{BinarySerializable, FixedSize};
use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::{
FastFieldCodecReader, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
};
use crate::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess};
/// Depending on the field type, a different
/// fast field is required.
@@ -59,24 +57,9 @@ impl FixedSize for LinearFooter {
const SIZE_IN_BYTES: usize = 56;
}
impl FastFieldCodecReader for LinearReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_offset = bytes.len() - LinearFooter::SIZE_IN_BYTES;
let (data, mut footer) = bytes.split(footer_offset);
let footer = LinearFooter::deserialize(&mut footer)?;
let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals);
let num_bits = compute_num_bits(footer.relative_max_value);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(LinearReader {
data,
bit_unpacker,
footer,
slope,
})
}
impl FastFieldDataAccess for LinearReader {
#[inline]
fn get_u64(&self, doc: u64) -> u64 {
fn get_val(&self, doc: u64) -> u64 {
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
(calculated_value + self.bit_unpacker.get(doc, &self.data)) - self.footer.offset
}
@@ -89,11 +72,15 @@ impl FastFieldCodecReader for LinearReader {
fn max_value(&self) -> u64 {
self.footer.max_value
}
#[inline]
fn num_vals(&self) -> u64 {
self.footer.num_vals
}
}
/// Fastfield serializer, which tries to guess values by linear interpolation
/// and stores the difference bitpacked.
pub struct LinearSerializer {}
pub struct LinearCodec;
#[inline]
pub(crate) fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
@@ -134,9 +121,27 @@ pub fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
}
}
impl FastFieldCodecSerializer for LinearSerializer {
impl FastFieldCodec for LinearCodec {
const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Linear;
type Reader = LinearReader;
/// Opens a linear-interpolated fast field from its serialized bytes.
///
/// The last `LinearFooter::SIZE_IN_BYTES` bytes are deserialized as the footer.
/// Everything before it is kept as the bitpacked payload. The slope and the bit
/// width are derived from the footer.
///
/// # Errors
///
/// Returns an `io::ErrorKind::InvalidData` error if `bytes` is shorter than the
/// footer. Errors raised while deserializing the footer are propagated.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
    // A truncated buffer must surface as an error, not as an integer underflow.
    let footer_offset = bytes
        .len()
        .checked_sub(LinearFooter::SIZE_IN_BYTES)
        .ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidData,
                "linear fast field is shorter than its footer",
            )
        })?;
    let (data, mut footer) = bytes.split(footer_offset);
    let footer = LinearFooter::deserialize(&mut footer)?;
    let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals);
    let num_bits = compute_num_bits(footer.relative_max_value);
    let bit_unpacker = BitUnpacker::new(num_bits);
    Ok(LinearReader {
        data,
        bit_unpacker,
        footer,
        slope,
    })
}
/// Creates a new fast field serializer.
fn serialize(
write: &mut impl Write,
@@ -260,7 +265,7 @@ mod tests {
use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate::<LinearSerializer, LinearReader>(data, name)
// The shared test helper takes a single codec type parameter; the reader type is `Codec::Reader`.
crate::tests::create_and_validate::<LinearCodec>(data, name)
}
#[test]

View File

@@ -1,8 +1,8 @@
#[macro_use]
extern crate prettytable;
use fastfield_codecs::blockwise_linear::BlockwiseLinearSerializer;
use fastfield_codecs::linear::LinearSerializer;
use fastfield_codecs::{FastFieldCodecSerializer, FastFieldCodecType, FastFieldStats};
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldStats};
use prettytable::{Cell, Row, Table};
fn main() {
@@ -13,11 +13,11 @@ fn main() {
for (data, data_set_name) in get_codec_test_data_sets() {
let mut results = vec![];
let res = serialize_with_codec::<LinearSerializer>(&data);
let res = serialize_with_codec::<LinearCodec>(&data);
results.push(res);
let res = serialize_with_codec::<BlockwiseLinearSerializer>(&data);
let res = serialize_with_codec::<BlockwiseLinearCodec>(&data);
results.push(res);
let res = serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedSerializer>(&data);
let res = serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedCodec>(&data);
results.push(res);
// let best_estimation_codec = results
@@ -89,19 +89,19 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
data_and_names
}
pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
pub fn serialize_with_codec<C: FastFieldCodec>(
data: &[u64],
) -> (bool, f32, f32, FastFieldCodecType) {
let is_applicable = S::is_applicable(&data);
let is_applicable = C::is_applicable(&data);
if !is_applicable {
return (false, 0.0, 0.0, S::CODEC_TYPE);
return (false, 0.0, 0.0, C::CODEC_TYPE);
}
let estimation = S::estimate(&data);
let estimation = C::estimate(&data);
let mut out = vec![];
S::serialize(&mut out, &data).unwrap();
C::serialize(&mut out, &data).unwrap();
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
(true, estimation, actual_compression, S::CODEC_TYPE)
(true, estimation, actual_compression, C::CODEC_TYPE)
}
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {

View File

@@ -3,7 +3,7 @@ use std::num::NonZeroU64;
use common::BinarySerializable;
use fastdivide::DividerU64;
use fastfield_codecs::FastFieldCodecReader;
use fastfield_codecs::{FastFieldCodec, FastFieldDataAccess};
use ownedbytes::OwnedBytes;
pub const GCD_DEFAULT: u64 = 1;
@@ -12,47 +12,82 @@ pub const GCD_DEFAULT: u64 = 1;
///
/// Holds the data and the codec to the read the data.
#[derive(Clone)]
pub struct GCDFastFieldCodec<CodecReader> {
gcd: u64,
min_value: u64,
pub struct GCDReader<CodecReader: FastFieldDataAccess> {
/// Parameters of the mapping applied to every value read from `reader`.
gcd_params: GCDParams,
/// Wrapped reader; its values are passed through `gcd_params.eval` before being returned.
reader: CodecReader,
}
impl<C: FastFieldCodecReader + Clone> FastFieldCodecReader for GCDFastFieldCodec<C> {
/// Opens a fast field given the bytes.
fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result<Self> {
let footer_offset = bytes.len() - 16;
let (body, mut footer) = bytes.split(footer_offset);
let gcd = u64::deserialize(&mut footer)?;
let min_value = u64::deserialize(&mut footer)?;
let reader = C::open_from_bytes(body)?;
Ok(GCDFastFieldCodec {
gcd,
min_value,
reader,
})
}
/// Parameters stored in the GCD footer, serialized as three `u64` values in
/// field order (`gcd`, `min_value`, `num_vals`).
#[derive(Debug, Clone, Copy)]
struct GCDParams {
/// Factor each stored value is multiplied by when it is read back.
gcd: u64,
/// Offset added after the multiplication by `gcd`.
min_value: u64,
/// Number of values, as reported by `GCDReader::num_vals`.
num_vals: u64,
}
#[inline]
fn get_u64(&self, doc: u64) -> u64 {
let mut data = self.reader.get_u64(doc);
data *= self.gcd;
data += self.min_value;
data
}
fn min_value(&self) -> u64 {
self.min_value + self.reader.min_value() * self.gcd
}
fn max_value(&self) -> u64 {
self.min_value + self.reader.max_value() * self.gcd
impl GCDParams {
    /// Maps a stored value back to the original one: `min_value + gcd * val`.
    pub fn eval(&self, val: u64) -> u64 {
        let scaled = self.gcd * val;
        self.min_value + scaled
    }
}
pub fn write_gcd_header<W: Write>(field_write: &mut W, min_value: u64, gcd: u64) -> io::Result<()> {
impl BinarySerializable for GCDParams {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
self.gcd.serialize(writer)?;
self.min_value.serialize(writer)?;
self.num_vals.serialize(writer)?;
Ok(())
}
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let gcd: u64 = u64::deserialize(reader)?;
let min_value: u64 = u64::deserialize(reader)?;
let num_vals: u64 = u64::deserialize(reader)?;
Ok(Self {
gcd,
min_value,
num_vals,
})
}
}
/// Opens a GCD-encoded fast field wrapping the codec `WrappedCodec`.
///
/// The last 24 bytes are deserialized as the `GCDParams` footer. The bytes before
/// it are handed to `WrappedCodec::open_from_bytes` to build the inner reader.
///
/// # Errors
///
/// Returns an `io::ErrorKind::InvalidData` error if `bytes` is shorter than the
/// GCD footer. Errors from deserializing the footer or from opening the wrapped
/// codec are propagated.
pub fn open_gcd_from_bytes<WrappedCodec: FastFieldCodec>(
    bytes: OwnedBytes,
) -> io::Result<GCDReader<WrappedCodec::Reader>> {
    // gcd, min_value and num_vals, each stored as a u64.
    const GCD_FOOTER_NUM_BYTES: usize = 3 * std::mem::size_of::<u64>();
    // A truncated buffer must surface as an error, not as an integer underflow.
    let footer_offset = bytes
        .len()
        .checked_sub(GCD_FOOTER_NUM_BYTES)
        .ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidData,
                "GCD fast field is shorter than its footer",
            )
        })?;
    let (body, mut footer) = bytes.split(footer_offset);
    let gcd_params = GCDParams::deserialize(&mut footer)?;
    let reader: WrappedCodec::Reader = WrappedCodec::open_from_bytes(body)?;
    Ok(GCDReader { gcd_params, reader })
}
impl<C: FastFieldDataAccess + Clone> FastFieldDataAccess for GCDReader<C> {
#[inline]
fn get_val(&self, doc: u64) -> u64 {
let val = self.reader.get_val(doc);
self.gcd_params.eval(val)
}
fn min_value(&self) -> u64 {
self.gcd_params.eval(self.reader.min_value())
}
fn max_value(&self) -> u64 {
self.gcd_params.eval(self.reader.max_value())
}
fn num_vals(&self) -> u64 {
self.gcd_params.num_vals
}
}
pub fn write_gcd_header<W: Write>(
field_write: &mut W,
min_value: u64,
gcd: u64,
num_vals: u64,
) -> io::Result<()> {
gcd.serialize(field_write)?;
min_value.serialize(field_write)?;
num_vals.serialize(field_write)?;
Ok(())
}

View File

@@ -26,7 +26,7 @@ pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveB
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader;
pub(crate) use self::gcd::{find_gcd, GCDFastFieldCodec, GCD_DEFAULT};
pub(crate) use self::gcd::{find_gcd, GCDReader, GCD_DEFAULT};
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
pub use self::reader::{DynamicFastFieldReader, FastFieldReader};
pub use self::readers::FastFieldReaders;
@@ -326,7 +326,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 37);
assert_eq!(file.len(), 45);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
@@ -357,7 +357,7 @@ mod tests {
serializer.close()?;
}
let file = directory.open_read(path)?;
assert_eq!(file.len(), 62);
assert_eq!(file.len(), 70);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
@@ -393,7 +393,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 35);
assert_eq!(file.len(), 43);
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
@@ -425,7 +425,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 80043);
assert_eq!(file.len(), 80051);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
@@ -896,7 +896,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 36);
assert_eq!(file.len(), 44);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
@@ -932,7 +932,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 48);
assert_eq!(file.len(), 56);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
@@ -966,7 +966,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 35);
assert_eq!(file.len(), 43);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;

View File

@@ -3,15 +3,16 @@ use std::marker::PhantomData;
use std::path::Path;
use common::BinarySerializable;
use fastfield_codecs::bitpacked::BitpackedReader;
use fastfield_codecs::blockwise_linear::BlockwiseLinearReader;
use fastfield_codecs::linear::LinearReader;
use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecType};
use fastfield_codecs::bitpacked::{BitpackedCodec, BitpackedReader};
use fastfield_codecs::blockwise_linear::{BlockwiseLinearCodec, BlockwiseLinearReader};
use fastfield_codecs::linear::{LinearCodec, LinearReader};
use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess};
use super::{FastValue, GCDFastFieldCodec};
use super::gcd::open_gcd_from_bytes;
use super::FastValue;
use crate::directory::{CompositeFile, Directory, FileSlice, OwnedBytes, RamDirectory, WritePtr};
use crate::error::DataCorruption;
use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter};
use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter, GCDReader};
use crate::schema::{Schema, FAST};
use crate::DocId;
@@ -68,11 +69,11 @@ pub enum DynamicFastFieldReader<Item: FastValue> {
BlockwiseLinear(FastFieldReaderCodecWrapper<Item, BlockwiseLinearReader>),
/// GCD and Bitpacked compressed fastfield data.
BitpackedGCD(FastFieldReaderCodecWrapper<Item, GCDFastFieldCodec<BitpackedReader>>),
BitpackedGCD(FastFieldReaderCodecWrapper<Item, GCDReader<BitpackedReader>>),
/// GCD and Linear interpolated values + bitpacked
LinearGCD(FastFieldReaderCodecWrapper<Item, GCDFastFieldCodec<LinearReader>>),
LinearGCD(FastFieldReaderCodecWrapper<Item, GCDReader<LinearReader>>),
/// GCD and Blockwise linear interpolated values + bitpacked
BlockwiseLinearGCD(FastFieldReaderCodecWrapper<Item, GCDFastFieldCodec<BlockwiseLinearReader>>),
BlockwiseLinearGCD(FastFieldReaderCodecWrapper<Item, GCDReader<BlockwiseLinearReader>>),
}
impl<Item: FastValue> DynamicFastFieldReader<Item> {
@@ -83,46 +84,27 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
) -> crate::Result<DynamicFastFieldReader<Item>> {
let reader = match codec_type {
FastFieldCodecType::Bitpacked => {
DynamicFastFieldReader::Bitpacked(FastFieldReaderCodecWrapper::<
Item,
BitpackedReader,
>::open_from_bytes(bytes)?)
DynamicFastFieldReader::Bitpacked(BitpackedCodec::open_from_bytes(bytes)?.into())
}
FastFieldCodecType::Linear => DynamicFastFieldReader::Linear(
FastFieldReaderCodecWrapper::<Item, LinearReader>::open_from_bytes(bytes)?,
FastFieldCodecType::Linear => {
DynamicFastFieldReader::Linear(LinearCodec::open_from_bytes(bytes)?.into())
}
FastFieldCodecType::BlockwiseLinear => DynamicFastFieldReader::BlockwiseLinear(
BlockwiseLinearCodec::open_from_bytes(bytes)?.into(),
),
FastFieldCodecType::BlockwiseLinear => {
DynamicFastFieldReader::BlockwiseLinear(FastFieldReaderCodecWrapper::<
Item,
BlockwiseLinearReader,
>::open_from_bytes(bytes)?)
}
FastFieldCodecType::Gcd => {
let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
match codec_type {
FastFieldCodecType::Bitpacked => {
DynamicFastFieldReader::BitpackedGCD(FastFieldReaderCodecWrapper::<
Item,
GCDFastFieldCodec<BitpackedReader>,
>::open_from_bytes(
bytes
)?)
}
FastFieldCodecType::Linear => {
DynamicFastFieldReader::LinearGCD(FastFieldReaderCodecWrapper::<
Item,
GCDFastFieldCodec<LinearReader>,
>::open_from_bytes(
bytes
)?)
}
FastFieldCodecType::Bitpacked => DynamicFastFieldReader::BitpackedGCD(
open_gcd_from_bytes::<BitpackedCodec>(bytes)?.into(),
),
FastFieldCodecType::Linear => DynamicFastFieldReader::LinearGCD(
open_gcd_from_bytes::<LinearCodec>(bytes)?.into(),
),
FastFieldCodecType::BlockwiseLinear => {
DynamicFastFieldReader::BlockwiseLinearGCD(FastFieldReaderCodecWrapper::<
Item,
GCDFastFieldCodec<BlockwiseLinearReader>,
>::open_from_bytes(
bytes
)?)
DynamicFastFieldReader::BlockwiseLinearGCD(
open_gcd_from_bytes::<BlockwiseLinearCodec>(bytes)?.into(),
)
}
FastFieldCodecType::Gcd => {
return Err(DataCorruption::comment_only(
@@ -199,34 +181,21 @@ pub struct FastFieldReaderCodecWrapper<Item: FastValue, CodecReader> {
_phantom: PhantomData<Item>,
}
impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item, C> {
/// Opens a fast field given a file.
///
/// Reads the file's bytes, reads a `u8` codec code from them, resolves it to a
/// `FastFieldCodecType`, and hands the remaining bytes to `open_from_bytes`.
///
/// # Errors
///
/// Propagates the error from `read_bytes`, and returns a `DataCorruption`
/// error when `FastFieldCodecType::from_code` yields `None`.
///
/// # Panics
///
/// Panics (via `assert_eq!`) when the resolved codec type is not
/// `FastFieldCodecType::Bitpacked`.
pub fn open(file: FileSlice) -> crate::Result<Self> {
let mut bytes = file.read_bytes()?;
let codec_code = bytes.read_u8();
// NOTE(review): `{codec_code}` sits inside a plain string literal, not a
// `format!` call, so it presumably reaches the error text verbatim — confirm.
let codec_type = FastFieldCodecType::from_code(codec_code).ok_or_else(|| {
DataCorruption::comment_only("Unknown codec code does not exist `{codec_code}`")
})?;
// A codec mismatch is treated as a hard failure (panic) rather than a returned error.
assert_eq!(
FastFieldCodecType::Bitpacked,
codec_type,
"Tried to open fast field as bitpacked encoded (id=1), but got serializer with \
different id"
);
Self::open_from_bytes(bytes)
}
/// Opens a fast field given the bytes.
///
/// Delegates to `C::open_from_bytes` and wraps the resulting reader.
pub fn open_from_bytes(bytes: OwnedBytes) -> crate::Result<Self> {
let reader = C::open_from_bytes(bytes)?;
Ok(FastFieldReaderCodecWrapper {
/// Wraps an already constructed codec reader. The conversion stores `reader`
/// as-is; `Item` is carried only through the `PhantomData` marker field.
impl<Item: FastValue, CodecReader> From<CodecReader>
for FastFieldReaderCodecWrapper<Item, CodecReader>
{
fn from(reader: CodecReader) -> Self {
FastFieldReaderCodecWrapper {
reader,
_phantom: PhantomData,
})
}
}
}
impl<Item: FastValue, D: FastFieldDataAccess> FastFieldReaderCodecWrapper<Item, D> {
#[inline]
pub(crate) fn get_u64(&self, doc: u64) -> Item {
let data = self.reader.get_u64(doc);
let data = self.reader.get_val(doc);
Item::from_u64(data)
}
@@ -249,7 +218,7 @@ impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item,
}
}
impl<Item: FastValue, C: FastFieldCodecReader + Clone> FastFieldReader<Item>
impl<Item: FastValue, C: FastFieldDataAccess + Clone> FastFieldReader<Item>
for FastFieldReaderCodecWrapper<Item, C>
{
/// Return the value associated to the given document.

View File

@@ -3,11 +3,11 @@ use std::num::NonZeroU64;
use common::{BinarySerializable, CountingWriter};
use fastdivide::DividerU64;
pub use fastfield_codecs::bitpacked::{BitpackedSerializer, BitpackedSerializerLegacy};
use fastfield_codecs::blockwise_linear::BlockwiseLinearSerializer;
use fastfield_codecs::linear::LinearSerializer;
pub use fastfield_codecs::bitpacked::{BitpackedCodec, BitpackedSerializerLegacy};
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::FastFieldCodecType;
pub use fastfield_codecs::{FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
pub use fastfield_codecs::{FastFieldCodec, FastFieldDataAccess, FastFieldStats};
use super::{find_gcd, ALL_CODECS, GCD_DEFAULT};
use crate::directory::{CompositeWrite, WritePtr};
@@ -64,15 +64,15 @@ impl From<FastFieldCodecType> for FastFieldCodecEnableCheck {
// use this, when this is merged and stabilized explicit_generic_args_with_impl_trait
// https://github.com/rust-lang/rust/pull/86176
/// Records the estimate of a single codec for the given data.
///
/// The codec is selected through the first type parameter. When the codec
/// reports that it is not applicable to `fastfield_accessor`, nothing is
/// recorded. Otherwise its estimate is appended to `estimations` together with
/// the codec's `CODEC_TYPE`, as a `(ratio, codec type)` pair.
fn codec_estimation<T: FastFieldCodecSerializer, A: FastFieldDataAccess>(
fn codec_estimation<C: FastFieldCodec, A: FastFieldDataAccess>(
fastfield_accessor: &A,
estimations: &mut Vec<(f32, FastFieldCodecType)>,
) {
if !T::is_applicable(fastfield_accessor) {
if !C::is_applicable(fastfield_accessor) {
// Not applicable: leave `estimations` untouched.
return;
}
let ratio = T::estimate(fastfield_accessor);
estimations.push((ratio, T::CODEC_TYPE));
let ratio = C::estimate(fastfield_accessor);
estimations.push((ratio, C::CODEC_TYPE));
}
impl CompositeFastFieldSerializer {
@@ -189,7 +189,7 @@ impl CompositeFastFieldSerializer {
field_write,
fastfield_accessor,
)?;
write_gcd_header(field_write, base_value, gcd)?;
write_gcd_header(field_write, base_value, gcd, num_vals)?;
Ok(())
}
@@ -204,13 +204,13 @@ impl CompositeFastFieldSerializer {
let mut estimations = vec![];
if codec_enable_checker.is_enabled(FastFieldCodecType::Bitpacked) {
codec_estimation::<BitpackedSerializer, _>(&fastfield_accessor, &mut estimations);
codec_estimation::<BitpackedCodec, _>(&fastfield_accessor, &mut estimations);
}
if codec_enable_checker.is_enabled(FastFieldCodecType::Linear) {
codec_estimation::<LinearSerializer, _>(&fastfield_accessor, &mut estimations);
codec_estimation::<LinearCodec, _>(&fastfield_accessor, &mut estimations);
}
if codec_enable_checker.is_enabled(FastFieldCodecType::BlockwiseLinear) {
codec_estimation::<BlockwiseLinearSerializer, _>(&fastfield_accessor, &mut estimations);
codec_estimation::<BlockwiseLinearCodec, _>(&fastfield_accessor, &mut estimations);
}
if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan())
{
@@ -229,13 +229,13 @@ impl CompositeFastFieldSerializer {
Self::write_header(field_write, codec_type)?;
match codec_type {
FastFieldCodecType::Bitpacked => {
BitpackedSerializer::serialize(field_write, &fastfield_accessor)?;
BitpackedCodec::serialize(field_write, &fastfield_accessor)?;
}
FastFieldCodecType::Linear => {
LinearSerializer::serialize(field_write, &fastfield_accessor)?;
LinearCodec::serialize(field_write, &fastfield_accessor)?;
}
FastFieldCodecType::BlockwiseLinear => {
BlockwiseLinearSerializer::serialize(field_write, &fastfield_accessor)?;
BlockwiseLinearCodec::serialize(field_write, &fastfield_accessor)?;
}
FastFieldCodecType::Gcd => {
return Err(io::Error::new(

View File

@@ -2,12 +2,13 @@ use std::collections::HashMap;
use std::io;
use common;
use fastfield_codecs::FastFieldDataAccess;
use fnv::FnvHashMap;
use tantivy_bitpacker::BlockedBitpacker;
use super::multivalued::MultiValuedFastFieldWriter;
use super::serializer::FastFieldStats;
use super::{FastFieldDataAccess, FastFieldType, FastValue};
use super::{FastFieldType, FastValue};
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;