diff --git a/fastfield_codecs/benches/bench.rs b/fastfield_codecs/benches/bench.rs index 3a5ae5876..87e1fd713 100644 --- a/fastfield_codecs/benches/bench.rs +++ b/fastfield_codecs/benches/bench.rs @@ -4,9 +4,9 @@ extern crate test; #[cfg(test)] mod tests { - use fastfield_codecs::bitpacked::{BitpackedReader, BitpackedSerializer}; - use fastfield_codecs::blockwise_linear::{BlockwiseLinearReader, BlockwiseLinearSerializer}; - use fastfield_codecs::linear::{LinearReader, LinearSerializer}; + use fastfield_codecs::bitpacked::BitpackedCodec; + use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec; + use fastfield_codecs::linear::LinearCodec; use fastfield_codecs::*; fn get_data() -> Vec { @@ -25,16 +25,10 @@ mod tests { fn value_iter() -> impl Iterator { 0..20_000 } - fn bench_get< - S: FastFieldCodecSerializer, - R: FastFieldCodecDeserializer + FastFieldDataAccess, - >( - b: &mut Bencher, - data: &[u64], - ) { + fn bench_get(b: &mut Bencher, data: &[u64]) { let mut bytes = vec![]; - S::serialize(&mut bytes, &data).unwrap(); - let reader = R::open_from_bytes(OwnedBytes::new(bytes)).unwrap(); + Codec::serialize(&mut bytes, &data).unwrap(); + let reader = Codec::open_from_bytes(OwnedBytes::new(bytes)).unwrap(); b.iter(|| { let mut sum = 0u64; for pos in value_iter() { @@ -45,7 +39,7 @@ mod tests { sum }); } - fn bench_create(b: &mut Bencher, data: &[u64]) { + fn bench_create(b: &mut Bencher, data: &[u64]) { let mut bytes = vec![]; b.iter(|| { S::serialize(&mut bytes, &data).unwrap(); @@ -57,32 +51,32 @@ mod tests { #[bench] fn bench_fastfield_bitpack_create(b: &mut Bencher) { let data: Vec<_> = get_data(); - bench_create::(b, &data); + bench_create::(b, &data); } #[bench] fn bench_fastfield_linearinterpol_create(b: &mut Bencher) { let data: Vec<_> = get_data(); - bench_create::(b, &data); + bench_create::(b, &data); } #[bench] fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) { let data: Vec<_> = get_data(); - bench_create::(b, &data); + bench_create::(b, &data); } #[bench] fn bench_fastfield_bitpack_get(b: &mut Bencher) { let data: Vec<_> = get_data(); - bench_get::(b, &data); + bench_get::(b, &data); } #[bench] fn bench_fastfield_linearinterpol_get(b: &mut Bencher) { let data: Vec<_> = get_data(); - bench_get::(b, &data); + bench_get::(b, &data); } #[bench] fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) { let data: Vec<_> = get_data(); - bench_get::(b, &data); + bench_get::(b, &data); } pub fn stats_from_vec(data: &[u64]) -> FastFieldStats { let min_value = data.iter().cloned().min().unwrap_or(0); diff --git a/fastfield_codecs/src/bitpacked.rs b/fastfield_codecs/src/bitpacked.rs index 43e0ea838..76f9785ec 100644 --- a/fastfield_codecs/src/bitpacked.rs +++ b/fastfield_codecs/src/bitpacked.rs @@ -4,9 +4,7 @@ use common::BinarySerializable; use ownedbytes::OwnedBytes; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; -use crate::{ - FastFieldCodecDeserializer, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess, -}; +use crate::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess}; /// Depending on the field type, a different /// fast field is required. @@ -14,31 +12,11 @@ use crate::{ pub struct BitpackedReader { data: OwnedBytes, bit_unpacker: BitUnpacker, - pub min_value_u64: u64, - pub max_value_u64: u64, - pub num_vals: u64, + min_value_u64: u64, + max_value_u64: u64, + num_vals: u64, } -impl FastFieldCodecDeserializer for BitpackedReader { - /// Opens a fast field given a file. - fn open_from_bytes(bytes: OwnedBytes) -> io::Result { - let footer_offset = bytes.len() - 24; - let (data, mut footer) = bytes.split(footer_offset); - let min_value = u64::deserialize(&mut footer)?; - let amplitude = u64::deserialize(&mut footer)?; - let num_vals = u64::deserialize(&mut footer)?; - let max_value = min_value + amplitude; - let num_bits = compute_num_bits(amplitude); - let bit_unpacker = BitUnpacker::new(num_bits); - Ok(BitpackedReader { - data, - bit_unpacker, - min_value_u64: min_value, - max_value_u64: max_value, - num_vals, - }) - } -} impl FastFieldDataAccess for BitpackedReader { #[inline] fn get_val(&self, doc: u64) -> u64 { @@ -111,12 +89,33 @@ impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> { } } -pub struct BitpackedSerializer {} +pub struct BitpackedCodec; -impl FastFieldCodecSerializer for BitpackedSerializer { +impl FastFieldCodec for BitpackedCodec { /// The CODEC_TYPE is an enum value used for serialization. const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Bitpacked; + type Reader = BitpackedReader; + + /// Opens a fast field given a file. + fn open_from_bytes(bytes: OwnedBytes) -> io::Result { + let footer_offset = bytes.len() - 24; + let (data, mut footer) = bytes.split(footer_offset); + let min_value = u64::deserialize(&mut footer)?; + let amplitude = u64::deserialize(&mut footer)?; + let num_vals = u64::deserialize(&mut footer)?; + let max_value = min_value + amplitude; + let num_bits = compute_num_bits(amplitude); + let bit_unpacker = BitUnpacker::new(num_bits); + Ok(BitpackedReader { + data, + bit_unpacker, + min_value_u64: min_value, + max_value_u64: max_value, + num_vals, + }) + } + /// Serializes data with the BitpackedFastFieldSerializer. /// /// The serializer in fact encode the values by bitpacking @@ -159,7 +158,7 @@ mod tests { use crate::tests::get_codec_test_data_sets; fn create_and_validate(data: &[u64], name: &str) { - crate::tests::create_and_validate::(data, name); + crate::tests::create_and_validate::(data, name); } #[test] diff --git a/fastfield_codecs/src/blockwise_linear.rs b/fastfield_codecs/src/blockwise_linear.rs index 7db3abc29..6b5e380f7 100644 --- a/fastfield_codecs/src/blockwise_linear.rs +++ b/fastfield_codecs/src/blockwise_linear.rs @@ -18,9 +18,7 @@ use ownedbytes::OwnedBytes; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; use crate::linear::{get_calculated_value, get_slope}; -use crate::{ - FastFieldCodecDeserializer, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess, -}; +use crate::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess}; const CHUNK_SIZE: u64 = 512; @@ -148,17 +146,6 @@ fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Functio &interpolations[get_interpolation_position(doc)] } -impl FastFieldCodecDeserializer for BlockwiseLinearReader { - /// Opens a fast field given a file. - fn open_from_bytes(bytes: OwnedBytes) -> io::Result { - let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?; - let footer_offset = bytes.len() - 4 - footer_len as usize; - let (data, mut footer) = bytes.split(footer_offset); - let footer = BlockwiseLinearFooter::deserialize(&mut footer)?; - Ok(BlockwiseLinearReader { data, footer }) - } -} - impl FastFieldDataAccess for BlockwiseLinearReader { #[inline] fn get_val(&self, idx: u64) -> u64 { @@ -191,10 +178,22 @@ impl FastFieldDataAccess for BlockwiseLinearReader { } /// Same as LinearSerializer, but working on chunks of CHUNK_SIZE elements. -pub struct BlockwiseLinearSerializer {} +pub struct BlockwiseLinearCodec; -impl FastFieldCodecSerializer for BlockwiseLinearSerializer { +impl FastFieldCodec for BlockwiseLinearCodec { const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::BlockwiseLinear; + + type Reader = BlockwiseLinearReader; + + /// Opens a fast field given a file. + fn open_from_bytes(bytes: OwnedBytes) -> io::Result { + let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?; + let footer_offset = bytes.len() - 4 - footer_len as usize; + let (data, mut footer) = bytes.split(footer_offset); + let footer = BlockwiseLinearFooter::deserialize(&mut footer)?; + Ok(BlockwiseLinearReader { data, footer }) + } + /// Creates a new fast field serializer. fn serialize( write: &mut impl Write, @@ -369,9 +368,7 @@ mod tests { use crate::tests::get_codec_test_data_sets; fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { - crate::tests::create_and_validate::( - data, name, - ) + crate::tests::create_and_validate::(data, name) } const HIGHEST_BIT: u64 = 1 << 63; diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index 626a0686c..c12449526 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -12,12 +12,6 @@ pub mod bitpacked; pub mod blockwise_linear; pub mod linear; -pub trait FastFieldCodecDeserializer: Sized { - /// Reads the metadata and returns the CodecReader - fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result - where Self: FastFieldDataAccess; -} - pub trait FastFieldDataAccess { fn get_val(&self, doc: u64) -> u64; fn min_value(&self) -> u64; @@ -69,11 +63,25 @@ impl FastFieldCodecType { /// The FastFieldSerializerEstimate trait is required on all variants /// of fast field compressions, to decide which one to choose. -pub trait FastFieldCodecSerializer { +pub trait FastFieldCodec { /// A codex needs to provide a unique name and id, which is /// used for debugging and de/serialization. const CODEC_TYPE: FastFieldCodecType; + type Reader: FastFieldDataAccess; + + /// Reads the metadata and returns the CodecReader + fn open_from_bytes(bytes: OwnedBytes) -> io::Result; + + /// Serializes the data using the serializer into write. + /// + /// The fastfield_accessor iterator should be preferred over using fastfield_accessor for + /// performance reasons. + fn serialize( + write: &mut impl Write, + fastfield_accessor: &dyn FastFieldDataAccess, + ) -> io::Result<()>; + /// Check if the Codec is able to compress the data fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool; @@ -83,15 +91,6 @@ pub trait FastFieldCodecSerializer { /// It could make sense to also return a value representing /// computational complexity. fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32; - - /// Serializes the data using the serializer into write. - /// - /// The fastfield_accessor iterator should be preferred over using fastfield_accessor for - /// performance reasons. - fn serialize( - write: &mut impl Write, - fastfield_accessor: &dyn FastFieldDataAccess, - ) -> io::Result<()>; } #[derive(Debug, Clone)] @@ -149,27 +148,21 @@ mod tests { use proptest::arbitrary::any; use proptest::proptest; - use crate::bitpacked::{BitpackedReader, BitpackedSerializer}; - use crate::blockwise_linear::{BlockwiseLinearReader, BlockwiseLinearSerializer}; - use crate::linear::{LinearReader, LinearSerializer}; + use crate::bitpacked::BitpackedCodec; + use crate::blockwise_linear::BlockwiseLinearCodec; + use crate::linear::LinearCodec; - pub fn create_and_validate< - S: FastFieldCodecSerializer, - R: FastFieldCodecDeserializer + FastFieldDataAccess, - >( - data: &[u64], - name: &str, - ) -> (f32, f32) { - if !S::is_applicable(&data) { + pub fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { + if !Codec::is_applicable(&data) { return (f32::MAX, 0.0); } - let estimation = S::estimate(&data); + let estimation = Codec::estimate(&data); let mut out: Vec = Vec::new(); - S::serialize(&mut out, &data).unwrap(); + Codec::serialize(&mut out, &data).unwrap(); let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0); - let reader = R::open_from_bytes(OwnedBytes::new(out)).unwrap(); + let reader = Codec::open_from_bytes(OwnedBytes::new(out)).unwrap(); assert_eq!(reader.num_vals(), data.len() as u64); for (doc, orig_val) in data.iter().enumerate() { let val = reader.get_val(doc as u64); @@ -186,16 +179,16 @@ mod tests { proptest! { #[test] fn test_proptest_small(data in proptest::collection::vec(any::(), 1..10)) { - create_and_validate::(&data, "proptest linearinterpol"); - create_and_validate::(&data, "proptest multilinearinterpol"); - create_and_validate::(&data, "proptest bitpacked"); + create_and_validate::(&data, "proptest linearinterpol"); + create_and_validate::(&data, "proptest multilinearinterpol"); + create_and_validate::(&data, "proptest bitpacked"); } #[test] fn test_proptest_large(data in proptest::collection::vec(any::(), 1..6000)) { - create_and_validate::(&data, "proptest linearinterpol"); - create_and_validate::(&data, "proptest multilinearinterpol"); - create_and_validate::(&data, "proptest bitpacked"); + create_and_validate::(&data, "proptest linearinterpol"); + create_and_validate::(&data, "proptest multilinearinterpol"); + create_and_validate::(&data, "proptest bitpacked"); } } @@ -216,13 +209,10 @@ mod tests { data_and_names } - fn test_codec< - S: FastFieldCodecSerializer, - R: FastFieldDataAccess + FastFieldCodecDeserializer, - >() { - let codec_name = format!("{:?}", S::CODEC_TYPE); + fn test_codec() { + let codec_name = format!("{:?}", C::CODEC_TYPE); for (data, dataset_name) in get_codec_test_data_sets() { - let (estimate, actual) = crate::tests::create_and_validate::(&data, dataset_name); + let (estimate, actual) = crate::tests::create_and_validate::(&data, dataset_name); let result = if estimate == f32::MAX { "Disabled".to_string() } else { @@ -233,15 +223,15 @@ mod tests { } #[test] fn test_codec_bitpacking() { - test_codec::(); + test_codec::(); } #[test] fn test_codec_interpolation() { - test_codec::(); + test_codec::(); } #[test] fn test_codec_multi_interpolation() { - test_codec::(); + test_codec::(); } use super::*; @@ -250,24 +240,24 @@ mod tests { fn estimation_good_interpolation_case() { let data = (10..=20000_u64).collect::>(); - let linear_interpol_estimation = LinearSerializer::estimate(&data); + let linear_interpol_estimation = LinearCodec::estimate(&data); assert_le!(linear_interpol_estimation, 0.01); - let multi_linear_interpol_estimation = BlockwiseLinearSerializer::estimate(&data); + let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data); assert_le!(multi_linear_interpol_estimation, 0.2); assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation); - let bitpacked_estimation = BitpackedSerializer::estimate(&data); + let bitpacked_estimation = BitpackedCodec::estimate(&data); assert_le!(linear_interpol_estimation, bitpacked_estimation); } #[test] fn estimation_test_bad_interpolation_case() { let data = vec![200, 10, 10, 10, 10, 1000, 20]; - let linear_interpol_estimation = LinearSerializer::estimate(&data); + let linear_interpol_estimation = LinearCodec::estimate(&data); assert_le!(linear_interpol_estimation, 0.32); - let bitpacked_estimation = BitpackedSerializer::estimate(&data); + let bitpacked_estimation = BitpackedCodec::estimate(&data); assert_le!(bitpacked_estimation, linear_interpol_estimation); } #[test] @@ -277,10 +267,10 @@ mod tests { // in this case the linear interpolation can't in fact not be worse than bitpacking, // but the estimator adds some threshold, which leads to estimated worse behavior - let linear_interpol_estimation = LinearSerializer::estimate(&data); + let linear_interpol_estimation = LinearCodec::estimate(&data); assert_le!(linear_interpol_estimation, 0.35); - let bitpacked_estimation = BitpackedSerializer::estimate(&data); + let bitpacked_estimation = BitpackedCodec::estimate(&data); assert_le!(bitpacked_estimation, 0.32); assert_le!(bitpacked_estimation, linear_interpol_estimation); } diff --git a/fastfield_codecs/src/linear.rs b/fastfield_codecs/src/linear.rs index bf50f7f1b..d2d53143d 100644 --- a/fastfield_codecs/src/linear.rs +++ b/fastfield_codecs/src/linear.rs @@ -5,9 +5,7 @@ use common::{BinarySerializable, FixedSize}; use ownedbytes::OwnedBytes; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; -use crate::{ - FastFieldCodecDeserializer, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess, -}; +use crate::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess}; /// Depending on the field type, a different /// fast field is required. @@ -59,24 +57,6 @@ impl FixedSize for LinearFooter { const SIZE_IN_BYTES: usize = 56; } -impl FastFieldCodecDeserializer for LinearReader { - /// Opens a fast field given a file. - fn open_from_bytes(bytes: OwnedBytes) -> io::Result { - let footer_offset = bytes.len() - LinearFooter::SIZE_IN_BYTES; - let (data, mut footer) = bytes.split(footer_offset); - let footer = LinearFooter::deserialize(&mut footer)?; - let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals); - let num_bits = compute_num_bits(footer.relative_max_value); - let bit_unpacker = BitUnpacker::new(num_bits); - Ok(LinearReader { - data, - bit_unpacker, - footer, - slope, - }) - } -} - impl FastFieldDataAccess for LinearReader { #[inline] fn get_val(&self, doc: u64) -> u64 { @@ -100,7 +80,7 @@ impl FastFieldDataAccess for LinearReader { /// Fastfield serializer, which tries to guess values by linear interpolation /// and stores the difference bitpacked. -pub struct LinearSerializer {} +pub struct LinearCodec; #[inline] pub(crate) fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 { @@ -141,9 +121,27 @@ pub fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 { } } -impl FastFieldCodecSerializer for LinearSerializer { +impl FastFieldCodec for LinearCodec { const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Linear; + type Reader = LinearReader; + + /// Opens a fast field given a file. + fn open_from_bytes(bytes: OwnedBytes) -> io::Result { + let footer_offset = bytes.len() - LinearFooter::SIZE_IN_BYTES; + let (data, mut footer) = bytes.split(footer_offset); + let footer = LinearFooter::deserialize(&mut footer)?; + let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals); + let num_bits = compute_num_bits(footer.relative_max_value); + let bit_unpacker = BitUnpacker::new(num_bits); + Ok(LinearReader { + data, + bit_unpacker, + footer, + slope, + }) + } + /// Creates a new fast field serializer. fn serialize( write: &mut impl Write, @@ -267,7 +265,7 @@ mod tests { use crate::tests::get_codec_test_data_sets; fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { - crate::tests::create_and_validate::(data, name) + crate::tests::create_and_validate::(data, name) } #[test] diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs index 4f5a1d239..93204cb25 100644 --- a/fastfield_codecs/src/main.rs +++ b/fastfield_codecs/src/main.rs @@ -1,8 +1,8 @@ #[macro_use] extern crate prettytable; -use fastfield_codecs::blockwise_linear::BlockwiseLinearSerializer; -use fastfield_codecs::linear::LinearSerializer; -use fastfield_codecs::{FastFieldCodecSerializer, FastFieldCodecType, FastFieldStats}; +use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec; +use fastfield_codecs::linear::LinearCodec; +use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldStats}; use prettytable::{Cell, Row, Table}; fn main() { @@ -13,11 +13,11 @@ fn main() { for (data, data_set_name) in get_codec_test_data_sets() { let mut results = vec![]; - let res = serialize_with_codec::(&data); + let res = serialize_with_codec::(&data); results.push(res); - let res = serialize_with_codec::(&data); + let res = serialize_with_codec::(&data); results.push(res); - let res = serialize_with_codec::(&data); + let res = serialize_with_codec::(&data); results.push(res); // let best_estimation_codec = results @@ -89,19 +89,19 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec, &'static str)> { data_and_names } -pub fn serialize_with_codec( +pub fn serialize_with_codec( data: &[u64], ) -> (bool, f32, f32, FastFieldCodecType) { - let is_applicable = S::is_applicable(&data); + let is_applicable = C::is_applicable(&data); if !is_applicable { - return (false, 0.0, 0.0, S::CODEC_TYPE); + return (false, 0.0, 0.0, C::CODEC_TYPE); } - let estimation = S::estimate(&data); + let estimation = C::estimate(&data); let mut out = vec![]; - S::serialize(&mut out, &data).unwrap(); + C::serialize(&mut out, &data).unwrap(); let actual_compression = out.len() as f32 / (data.len() * 8) as f32; - (true, estimation, actual_compression, S::CODEC_TYPE) + (true, estimation, actual_compression, C::CODEC_TYPE) } pub fn stats_from_vec(data: &[u64]) -> FastFieldStats { diff --git a/src/fastfield/gcd.rs b/src/fastfield/gcd.rs index 8e706d12f..37fe38e41 100644 --- a/src/fastfield/gcd.rs +++ b/src/fastfield/gcd.rs @@ -3,7 +3,7 @@ use std::num::NonZeroU64; use common::BinarySerializable; use fastdivide::DividerU64; -use fastfield_codecs::{FastFieldCodecDeserializer, FastFieldDataAccess}; +use fastfield_codecs::{FastFieldCodec, FastFieldDataAccess}; use ownedbytes::OwnedBytes; pub const GCD_DEFAULT: u64 = 1; @@ -12,50 +12,70 @@ pub const GCD_DEFAULT: u64 = 1; /// /// Holds the data and the codec to the read the data. #[derive(Clone)] -pub struct GCDFastFieldCodec { - gcd: u64, - min_value: u64, - num_vals: u64, +pub struct GCDReader { + gcd_params: GCDParams, reader: CodecReader, } -impl FastFieldCodecDeserializer - for GCDFastFieldCodec -{ - fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result { - let footer_offset = bytes.len() - 24; - let (body, mut footer) = bytes.split(footer_offset); - let gcd = u64::deserialize(&mut footer)?; - let min_value = u64::deserialize(&mut footer)?; - let num_vals = u64::deserialize(&mut footer)?; - let reader = C::open_from_bytes(body)?; - Ok(GCDFastFieldCodec { +#[derive(Debug, Clone, Copy)] +struct GCDParams { + gcd: u64, + min_value: u64, + num_vals: u64, +} + +impl GCDParams { + pub fn eval(&self, val: u64) -> u64 { + self.min_value + self.gcd * val + } +} + +impl BinarySerializable for GCDParams { + fn serialize(&self, writer: &mut W) -> io::Result<()> { + self.gcd.serialize(writer)?; + self.min_value.serialize(writer)?; + self.num_vals.serialize(writer)?; + Ok(()) + } + + fn deserialize(reader: &mut R) -> io::Result { + let gcd: u64 = u64::deserialize(reader)?; + let min_value: u64 = u64::deserialize(reader)?; + let num_vals: u64 = u64::deserialize(reader)?; + Ok(Self { gcd, min_value, num_vals, - reader, }) } } -impl FastFieldDataAccess for GCDFastFieldCodec { +pub fn open_gcd_from_bytes( + bytes: OwnedBytes, +) -> io::Result> { + let footer_offset = bytes.len() - 24; + let (body, mut footer) = bytes.split(footer_offset); + let gcd_params = GCDParams::deserialize(&mut footer)?; + let reader: WrappedCodec::Reader = WrappedCodec::open_from_bytes(body)?; + Ok(GCDReader { gcd_params, reader }) +} + +impl FastFieldDataAccess for GCDReader { #[inline] fn get_val(&self, doc: u64) -> u64 { - let mut data = self.reader.get_val(doc); - data *= self.gcd; - data += self.min_value; - data + let val = self.reader.get_val(doc); + self.gcd_params.eval(val) } fn min_value(&self) -> u64 { - self.min_value + self.reader.min_value() * self.gcd + self.gcd_params.eval(self.reader.min_value()) } fn max_value(&self) -> u64 { - self.min_value + self.reader.max_value() * self.gcd + self.gcd_params.eval(self.reader.max_value()) } fn num_vals(&self) -> u64 { - self.num_vals + self.gcd_params.num_vals } } diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 851d5df6a..c76cd7e4e 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -26,7 +26,7 @@ pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveB pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter}; pub use self::error::{FastFieldNotAvailableError, Result}; pub use self::facet_reader::FacetReader; -pub(crate) use self::gcd::{find_gcd, GCDFastFieldCodec, GCD_DEFAULT}; +pub(crate) use self::gcd::{find_gcd, GCDReader, GCD_DEFAULT}; pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter}; pub use self::reader::{DynamicFastFieldReader, FastFieldReader}; pub use self::readers::FastFieldReaders; diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 7afedf6f5..70da86e64 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -3,15 +3,16 @@ use std::marker::PhantomData; use std::path::Path; use common::BinarySerializable; -use fastfield_codecs::bitpacked::BitpackedReader; -use fastfield_codecs::blockwise_linear::BlockwiseLinearReader; -use fastfield_codecs::linear::LinearReader; -use fastfield_codecs::{FastFieldCodecDeserializer, FastFieldCodecType, FastFieldDataAccess}; +use fastfield_codecs::bitpacked::{BitpackedCodec, BitpackedReader}; +use fastfield_codecs::blockwise_linear::{BlockwiseLinearCodec, BlockwiseLinearReader}; +use fastfield_codecs::linear::{LinearCodec, LinearReader}; +use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess}; -use super::{FastValue, GCDFastFieldCodec}; +use super::gcd::open_gcd_from_bytes; +use super::FastValue; use crate::directory::{CompositeFile, Directory, FileSlice, OwnedBytes, RamDirectory, WritePtr}; use crate::error::DataCorruption; -use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter}; +use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter, GCDReader}; use crate::schema::{Schema, FAST}; use crate::DocId; @@ -68,11 +69,11 @@ pub enum DynamicFastFieldReader { BlockwiseLinear(FastFieldReaderCodecWrapper), /// GCD and Bitpacked compressed fastfield data. - BitpackedGCD(FastFieldReaderCodecWrapper>), + BitpackedGCD(FastFieldReaderCodecWrapper>), /// GCD and Linear interpolated values + bitpacked - LinearGCD(FastFieldReaderCodecWrapper>), + LinearGCD(FastFieldReaderCodecWrapper>), /// GCD and Blockwise linear interpolated values + bitpacked - BlockwiseLinearGCD(FastFieldReaderCodecWrapper>), + BlockwiseLinearGCD(FastFieldReaderCodecWrapper>), } impl DynamicFastFieldReader { @@ -83,46 +84,27 @@ impl DynamicFastFieldReader { ) -> crate::Result> { let reader = match codec_type { FastFieldCodecType::Bitpacked => { - DynamicFastFieldReader::Bitpacked(FastFieldReaderCodecWrapper::< - Item, - BitpackedReader, - >::open_from_bytes(bytes)?) + DynamicFastFieldReader::Bitpacked(BitpackedCodec::open_from_bytes(bytes)?.into()) } - FastFieldCodecType::Linear => DynamicFastFieldReader::Linear( - FastFieldReaderCodecWrapper::::open_from_bytes(bytes)?, + FastFieldCodecType::Linear => { + DynamicFastFieldReader::Linear(LinearCodec::open_from_bytes(bytes)?.into()) + } + FastFieldCodecType::BlockwiseLinear => DynamicFastFieldReader::BlockwiseLinear( + BlockwiseLinearCodec::open_from_bytes(bytes)?.into(), ), - FastFieldCodecType::BlockwiseLinear => { - DynamicFastFieldReader::BlockwiseLinear(FastFieldReaderCodecWrapper::< - Item, - BlockwiseLinearReader, - >::open_from_bytes(bytes)?) - } FastFieldCodecType::Gcd => { let codec_type = FastFieldCodecType::deserialize(&mut bytes)?; match codec_type { - FastFieldCodecType::Bitpacked => { - DynamicFastFieldReader::BitpackedGCD(FastFieldReaderCodecWrapper::< - Item, - GCDFastFieldCodec, - >::open_from_bytes( - bytes - )?) - } - FastFieldCodecType::Linear => { - DynamicFastFieldReader::LinearGCD(FastFieldReaderCodecWrapper::< - Item, - GCDFastFieldCodec, - >::open_from_bytes( - bytes - )?) - } + FastFieldCodecType::Bitpacked => DynamicFastFieldReader::BitpackedGCD( + open_gcd_from_bytes::(bytes)?.into(), + ), + FastFieldCodecType::Linear => DynamicFastFieldReader::LinearGCD( + open_gcd_from_bytes::(bytes)?.into(), + ), FastFieldCodecType::BlockwiseLinear => { - DynamicFastFieldReader::BlockwiseLinearGCD(FastFieldReaderCodecWrapper::< - Item, - GCDFastFieldCodec, - >::open_from_bytes( - bytes - )?) + DynamicFastFieldReader::BlockwiseLinearGCD( + open_gcd_from_bytes::(bytes)?.into(), + ) } FastFieldCodecType::Gcd => { return Err(DataCorruption::comment_only( @@ -199,33 +181,18 @@ pub struct FastFieldReaderCodecWrapper { _phantom: PhantomData, } -impl - FastFieldReaderCodecWrapper +impl From + for FastFieldReaderCodecWrapper { - /// Opens a fast field given a file. - pub fn open(file: FileSlice) -> crate::Result { - let mut bytes = file.read_bytes()?; - let codec_code = bytes.read_u8(); - let codec_type = FastFieldCodecType::from_code(codec_code).ok_or_else(|| { - DataCorruption::comment_only("Unknown codec code does not exist `{codec_code}`") - })?; - assert_eq!( - FastFieldCodecType::Bitpacked, - codec_type, - "Tried to open fast field as bitpacked encoded (id=1), but got serializer with \ - different id" - ); - Self::open_from_bytes(bytes) - } - /// Opens a fast field given the bytes. - pub fn open_from_bytes(bytes: OwnedBytes) -> crate::Result { - let reader = C::open_from_bytes(bytes)?; - Ok(FastFieldReaderCodecWrapper { + fn from(reader: CodecReader) -> Self { + FastFieldReaderCodecWrapper { reader, _phantom: PhantomData, - }) + } } +} +impl FastFieldReaderCodecWrapper { #[inline] pub(crate) fn get_u64(&self, doc: u64) -> Item { let data = self.reader.get_val(doc); @@ -251,8 +218,8 @@ impl } } -impl - FastFieldReader for FastFieldReaderCodecWrapper +impl FastFieldReader + for FastFieldReaderCodecWrapper { /// Return the value associated to the given document. /// diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs index 871a04978..ec5dee4ed 100644 --- a/src/fastfield/serializer/mod.rs +++ b/src/fastfield/serializer/mod.rs @@ -3,11 +3,11 @@ use std::num::NonZeroU64; use common::{BinarySerializable, CountingWriter}; use fastdivide::DividerU64; -pub use fastfield_codecs::bitpacked::{BitpackedSerializer, BitpackedSerializerLegacy}; -use fastfield_codecs::blockwise_linear::BlockwiseLinearSerializer; -use fastfield_codecs::linear::LinearSerializer; +pub use fastfield_codecs::bitpacked::{BitpackedCodec, BitpackedSerializerLegacy}; +use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec; +use fastfield_codecs::linear::LinearCodec; use fastfield_codecs::FastFieldCodecType; -pub use fastfield_codecs::{FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats}; +pub use fastfield_codecs::{FastFieldCodec, FastFieldDataAccess, FastFieldStats}; use super::{find_gcd, ALL_CODECS, GCD_DEFAULT}; use crate::directory::{CompositeWrite, WritePtr}; @@ -64,15 +64,15 @@ impl From for FastFieldCodecEnableCheck { // use this, when this is merged and stabilized explicit_generic_args_with_impl_trait // https://github.com/rust-lang/rust/pull/86176 -fn codec_estimation( +fn codec_estimation( fastfield_accessor: &A, estimations: &mut Vec<(f32, FastFieldCodecType)>, ) { - if !T::is_applicable(fastfield_accessor) { + if !C::is_applicable(fastfield_accessor) { return; } - let ratio = T::estimate(fastfield_accessor); - estimations.push((ratio, T::CODEC_TYPE)); + let ratio = C::estimate(fastfield_accessor); + estimations.push((ratio, C::CODEC_TYPE)); } impl CompositeFastFieldSerializer { @@ -204,13 +204,13 @@ impl CompositeFastFieldSerializer { let mut estimations = vec![]; if codec_enable_checker.is_enabled(FastFieldCodecType::Bitpacked) { - codec_estimation::(&fastfield_accessor, &mut estimations); + codec_estimation::(&fastfield_accessor, &mut estimations); } if codec_enable_checker.is_enabled(FastFieldCodecType::Linear) { - codec_estimation::(&fastfield_accessor, &mut estimations); + codec_estimation::(&fastfield_accessor, &mut estimations); } if codec_enable_checker.is_enabled(FastFieldCodecType::BlockwiseLinear) { - codec_estimation::(&fastfield_accessor, &mut estimations); + codec_estimation::(&fastfield_accessor, &mut estimations); } if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan()) { @@ -229,13 +229,13 @@ impl CompositeFastFieldSerializer { Self::write_header(field_write, codec_type)?; match codec_type { FastFieldCodecType::Bitpacked => { - BitpackedSerializer::serialize(field_write, &fastfield_accessor)?; + BitpackedCodec::serialize(field_write, &fastfield_accessor)?; } FastFieldCodecType::Linear => { - LinearSerializer::serialize(field_write, &fastfield_accessor)?; + LinearCodec::serialize(field_write, &fastfield_accessor)?; } FastFieldCodecType::BlockwiseLinear => { - BlockwiseLinearSerializer::serialize(field_write, &fastfield_accessor)?; + BlockwiseLinearCodec::serialize(field_write, &fastfield_accessor)?; } FastFieldCodecType::Gcd => { return Err(io::Error::new(