diff --git a/fastfield_codecs/src/bitpacked.rs b/fastfield_codecs/src/bitpacked.rs new file mode 100644 index 000000000..ecdd60a73 --- /dev/null +++ b/fastfield_codecs/src/bitpacked.rs @@ -0,0 +1,187 @@ +use crate::CodecId; +use crate::FastFieldDataAccess; +use crate::FastFieldSerializerEstimate; +use crate::FastFieldStats; +use common::BinarySerializable; +use std::io::{self, Write}; +use tantivy_bitpacker::compute_num_bits; +use tantivy_bitpacker::BitPacker; + +use tantivy_bitpacker::BitUnpacker; + +/// Depending on the field type, a different +/// fast field is required. +#[derive(Clone)] +pub struct BitpackedFastFieldReader { + bit_unpacker: BitUnpacker, + pub min_value_u64: u64, + pub max_value_u64: u64, +} + +impl<'data> BitpackedFastFieldReader { + /// Opens a fast field given a file. + pub fn open_from_bytes(bytes: &[u8]) -> io::Result { + let (_data, mut footer) = bytes.split_at(bytes.len() - 16); + let min_value = u64::deserialize(&mut footer)?; + let amplitude = u64::deserialize(&mut footer)?; + let max_value = min_value + amplitude; + let num_bits = compute_num_bits(amplitude); + let bit_unpacker = BitUnpacker::new(num_bits); + Ok(BitpackedFastFieldReader { + min_value_u64: min_value, + max_value_u64: max_value, + bit_unpacker, + }) + } + pub fn get_u64(&self, doc: u64, data: &[u8]) -> u64 { + self.min_value_u64 + self.bit_unpacker.get(doc, &data) + } +} +pub struct BitpackedFastFieldSerializer<'a, W: 'a + Write> { + bit_packer: BitPacker, + write: &'a mut W, + min_value: u64, + amplitude: u64, + num_bits: u8, +} + +impl<'a, W: Write> BitpackedFastFieldSerializer<'a, W> { + /// Creates a new fast field serializer. + /// + /// The serializer in fact encode the values by bitpacking + /// `(val - min_value)`. + /// + /// It requires a `min_value` and a `max_value` to compute + /// compute the minimum number of bits required to encode + /// values. + pub fn open( + write: &'a mut W, + min_value: u64, + max_value: u64, + ) -> io::Result> { + assert!(min_value <= max_value); + let amplitude = max_value - min_value; + let num_bits = compute_num_bits(amplitude); + let bit_packer = BitPacker::new(); + Ok(BitpackedFastFieldSerializer { + bit_packer, + write, + min_value, + amplitude, + num_bits, + }) + } + /// Creates a new fast field serializer. + /// + /// The serializer in fact encode the values by bitpacking + /// `(val - min_value)`. + /// + /// It requires a `min_value` and a `max_value` to compute + /// compute the minimum number of bits required to encode + /// values. + pub fn create( + write: &'a mut W, + _fastfield_accessor: &impl FastFieldDataAccess, + stats: FastFieldStats, + data_iter: impl Iterator, + ) -> io::Result<()> { + let mut serializer = Self::open(write, stats.min_value, stats.max_value)?; + + for val in data_iter { + serializer.add_val(val)?; + } + serializer.close_field()?; + + Ok(()) + } + /// Pushes a new value to the currently open u64 fast field. + #[inline] + pub fn add_val(&mut self, val: u64) -> io::Result<()> { + let val_to_write: u64 = val - self.min_value; + self.bit_packer + .write(val_to_write, self.num_bits, &mut self.write)?; + Ok(()) + } + pub fn close_field(mut self) -> io::Result<()> { + self.bit_packer.close(&mut self.write)?; + self.min_value.serialize(&mut self.write)?; + self.amplitude.serialize(&mut self.write)?; + Ok(()) + } +} + +impl<'a, W: 'a + Write> FastFieldSerializerEstimate for BitpackedFastFieldSerializer<'a, W> { + fn estimate( + _fastfield_accessor: &impl FastFieldDataAccess, + stats: FastFieldStats, + ) -> (f32, &'static str) { + let amplitude = stats.max_value - stats.min_value; + let num_bits = compute_num_bits(amplitude); + let num_bits_uncompressed = 64; + let ratio = num_bits as f32 / num_bits_uncompressed as f32; + let name = Self::NAME; + (ratio, name) + } +} +impl<'a, W: 'a + Write> CodecId for BitpackedFastFieldSerializer<'_, W> { + const NAME: &'static str = "Bitpacked"; + const ID: u8 = 1; +} + +#[cfg(test)] +mod tests { + use super::*; + + fn create_and_validate(data: &[u64]) { + let mut out = vec![]; + BitpackedFastFieldSerializer::create( + &mut out, + &data, + crate::tests::stats_from_vec(&data), + data.iter().cloned(), + ) + .unwrap(); + + let reader = BitpackedFastFieldReader::open_from_bytes(&out).unwrap(); + for (doc, val) in data.iter().enumerate() { + assert_eq!(reader.get_u64(doc as u64, &out), *val); + } + } + + #[test] + fn bitpacked_fast_field_test_simple() { + let data = (10..=20_u64).collect::>(); + + create_and_validate(&data); + } + + #[test] + fn bitpacked_fast_field_test_with_offset() { + //let data = vec![5, 50, 95, 96, 97, 98, 99, 100]; + let mut data = vec![5, 6, 7, 8, 9, 10, 99, 100]; + create_and_validate(&data); + + data.reverse(); + create_and_validate(&data); + } + #[test] + fn bitpacked_fast_field_test_no_structure() { + let mut data = vec![5, 50, 3, 13, 1, 1000, 35]; + create_and_validate(&data); + + data.reverse(); + create_and_validate(&data); + } + #[test] + fn bitpacked_fast_field_rand() { + for _ in 0..500 { + let mut data = (0..1 + rand::random::() as usize) + .map(|_| rand::random::() as u64 / 2 as u64) + .collect::>(); + create_and_validate(&data); + + data.reverse(); + create_and_validate(&data); + } + } +} diff --git a/fastfield_codecs/src/bitpacked/mod.rs b/fastfield_codecs/src/bitpacked/mod.rs deleted file mode 100644 index 9dba9f32e..000000000 --- a/fastfield_codecs/src/bitpacked/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -mod reader; -mod serialize; - -pub use reader::BitpackedFastFieldReader; -pub use serialize::BitpackedFastFieldSerializer; diff --git a/fastfield_codecs/src/bitpacked/reader.rs b/fastfield_codecs/src/bitpacked/reader.rs deleted file mode 100644 index 966c9dc44..000000000 --- a/fastfield_codecs/src/bitpacked/reader.rs +++ /dev/null @@ -1,33 +0,0 @@ -use common::BinarySerializable; -use std::io; -use tantivy_bitpacker::compute_num_bits; -use tantivy_bitpacker::BitUnpacker; -/// Depending on the field type, a different -/// fast field is required. -#[derive(Clone)] -pub struct BitpackedFastFieldReader<'data> { - bytes: &'data [u8], - bit_unpacker: BitUnpacker, - pub min_value_u64: u64, - pub max_value_u64: u64, -} - -impl<'data> BitpackedFastFieldReader<'data> { - /// Opens a fast field given a file. - pub fn open_from_bytes(mut bytes: &'data [u8]) -> io::Result { - let min_value = u64::deserialize(&mut bytes)?; - let amplitude = u64::deserialize(&mut bytes)?; - let max_value = min_value + amplitude; - let num_bits = compute_num_bits(amplitude); - let bit_unpacker = BitUnpacker::new(num_bits); - Ok(BitpackedFastFieldReader { - bytes, - min_value_u64: min_value, - max_value_u64: max_value, - bit_unpacker, - }) - } - pub fn get_u64(&self, doc: u64) -> u64 { - self.min_value_u64 + self.bit_unpacker.get(doc, &self.bytes) - } -} diff --git a/fastfield_codecs/src/bitpacked/serialize.rs b/fastfield_codecs/src/bitpacked/serialize.rs deleted file mode 100644 index 40b74ea1d..000000000 --- a/fastfield_codecs/src/bitpacked/serialize.rs +++ /dev/null @@ -1,93 +0,0 @@ -use crate::FastFieldDataAccess; -use crate::FastFieldSerializerEstimate; -use crate::FastFieldStats; -use common::BinarySerializable; -use std::io::{self, Write}; -use tantivy_bitpacker::compute_num_bits; -use tantivy_bitpacker::BitPacker; - -pub struct BitpackedFastFieldSerializer<'a, W: 'a + Write> { - bit_packer: BitPacker, - write: &'a mut W, - min_value: u64, - num_bits: u8, -} - -impl<'a, W: Write> BitpackedFastFieldSerializer<'a, W> { - /// Creates a new fast field serializer. - /// - /// The serializer in fact encode the values by bitpacking - /// `(val - min_value)`. - /// - /// It requires a `min_value` and a `max_value` to compute - /// compute the minimum number of bits required to encode - /// values. - pub fn open( - write: &'a mut W, - min_value: u64, - max_value: u64, - ) -> io::Result> { - assert!(min_value <= max_value); - min_value.serialize(write)?; - let amplitude = max_value - min_value; - amplitude.serialize(write)?; - let num_bits = compute_num_bits(amplitude); - let bit_packer = BitPacker::new(); - Ok(BitpackedFastFieldSerializer { - bit_packer, - write, - min_value, - num_bits, - }) - } - /// Creates a new fast field serializer. - /// - /// The serializer in fact encode the values by bitpacking - /// `(val - min_value)`. - /// - /// It requires a `min_value` and a `max_value` to compute - /// compute the minimum number of bits required to encode - /// values. - pub fn create( - write: &'a mut W, - _fastfield_accessor: &impl FastFieldDataAccess, - stats: FastFieldStats, - data_iter: impl Iterator, - ) -> io::Result<()> { - let mut serializer = Self::open(write, stats.min_value, stats.max_value)?; - - for val in data_iter { - serializer.add_val(val)?; - } - serializer.close_field()?; - - Ok(()) - } - /// Pushes a new value to the currently open u64 fast field. - pub fn add_val(&mut self, val: u64) -> io::Result<()> { - let val_to_write: u64 = val - self.min_value; - self.bit_packer - .write(val_to_write, self.num_bits, &mut self.write)?; - Ok(()) - } - pub fn close_field(mut self) -> io::Result<()> { - self.bit_packer.close(&mut self.write) - } -} - -impl<'a, W: 'a + Write> FastFieldSerializerEstimate for BitpackedFastFieldSerializer<'a, W> { - fn estimate( - _fastfield_accessor: &impl FastFieldDataAccess, - stats: FastFieldStats, - ) -> (f32, &'static str) { - let amplitude = stats.max_value - stats.min_value; - let num_bits = compute_num_bits(amplitude); - let num_bits_uncompressed = 64; - let ratio = num_bits as f32 / num_bits_uncompressed as f32; - let name = Self::codec_id().0; - (ratio, name) - } - fn codec_id() -> (&'static str, u8) { - ("Bitpacked", 1) - } -} diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index 83a310488..659bb02f6 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -35,8 +35,15 @@ pub trait FastFieldSerializerEstimate { fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats, ) -> (f32, &'static str); - /// the unique (name, id) of the compressor. Used to distinguish when de/serializing. - fn codec_id() -> (&'static str, u8); +} + +/// `CodecId` is required by each Codec. +/// +/// It needs to provide a unique name and id, which is +/// used for debugging and de/serialization. +pub trait CodecId { + const NAME: &'static str; + const ID: u8; } #[derive(Debug, Clone)] @@ -45,3 +52,15 @@ pub struct FastFieldStats { pub max_value: u64, pub num_vals: u64, } + +impl<'a> FastFieldDataAccess for &'a [u64] { + fn get(&self, doc: u32) -> u64 { + self[doc as usize] + } +} + +impl FastFieldDataAccess for Vec { + fn get(&self, doc: u32) -> u64 { + self[doc as usize] + } +} diff --git a/fastfield_codecs/src/linearinterpol.rs b/fastfield_codecs/src/linearinterpol.rs new file mode 100644 index 000000000..759a7d2f3 --- /dev/null +++ b/fastfield_codecs/src/linearinterpol.rs @@ -0,0 +1,232 @@ +use crate::CodecId; +use crate::FastFieldDataAccess; +use crate::FastFieldSerializerEstimate; +use crate::FastFieldStats; +use std::io::{self, Read, Write}; +use tantivy_bitpacker::compute_num_bits; +use tantivy_bitpacker::BitPacker; + +use common::BinarySerializable; +use common::FixedSize; +use tantivy_bitpacker::BitUnpacker; + +/// Depending on the field type, a different +/// fast field is required. +#[derive(Clone)] +pub struct LinearinterpolFastFieldReader { + bit_unpacker: BitUnpacker, + pub footer: LinearInterpolFooter, + pub slope: f64, +} + +#[derive(Clone, Debug)] +pub struct LinearInterpolFooter { + pub relative_max_value: u64, + pub offset: u64, + pub first_val: u64, + pub last_val: u64, + pub num_vals: u64, +} + +impl BinarySerializable for LinearInterpolFooter { + fn serialize(&self, write: &mut W) -> io::Result<()> { + self.relative_max_value.serialize(write)?; + self.offset.serialize(write)?; + self.first_val.serialize(write)?; + self.last_val.serialize(write)?; + self.num_vals.serialize(write)?; + Ok(()) + } + + fn deserialize(reader: &mut R) -> io::Result { + Ok(LinearInterpolFooter { + relative_max_value: u64::deserialize(reader)?, + offset: u64::deserialize(reader)?, + first_val: u64::deserialize(reader)?, + last_val: u64::deserialize(reader)?, + num_vals: u64::deserialize(reader)?, + }) + } +} + +impl FixedSize for LinearInterpolFooter { + const SIZE_IN_BYTES: usize = 40; +} + +impl LinearinterpolFastFieldReader { + /// Opens a fast field given a file. + pub fn open_from_bytes(bytes: &[u8]) -> io::Result { + let (_data, mut footer) = bytes.split_at(bytes.len() - LinearInterpolFooter::SIZE_IN_BYTES); + let footer = LinearInterpolFooter::deserialize(&mut footer)?; + //let rel_max_value = u64::deserialize(&mut footer)?; + //let offset = u64::deserialize(&mut footer)?; + //let first_value = u64::deserialize(&mut footer)?; + //let last_value = u64::deserialize(&mut footer)?; + //let num_vals = u64::deserialize(&mut footer)?; + let slope = (footer.last_val as f64 - footer.first_val as f64) + / (footer.num_vals as u64 - 1) as f64; + + let num_bits = compute_num_bits(footer.relative_max_value); + let bit_unpacker = BitUnpacker::new(num_bits); + Ok(LinearinterpolFastFieldReader { + footer, + bit_unpacker, + slope, + }) + } + pub fn get_u64(&self, doc: u64, data: &[u8]) -> u64 { + let calculated_value = self.footer.first_val + (doc as f64 * self.slope) as u64; + (calculated_value + self.bit_unpacker.get(doc, &data)) - self.footer.offset + } +} + +/// Fastfield serializer, which tries to guess values by linear interpolation +/// and stores the difference bitpacked. +pub struct LinearInterpolFastFieldSerializer {} + +// TODO not suitable if max is larger than i64::MAX / 2 + +impl LinearInterpolFastFieldSerializer { + /// Creates a new fast field serializer. + pub fn create( + write: &mut impl Write, + fastfield_accessor: &impl FastFieldDataAccess, + stats: FastFieldStats, + data_iter: impl Iterator, + data_iter1: impl Iterator, + data_iter2: impl Iterator, + ) -> io::Result<()> { + assert!(stats.min_value <= stats.max_value); + + let first_val = fastfield_accessor.get(0); + let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1); + let slope = get_slope(first_val, last_val, stats.num_vals); + // todo walk over data just once and calulate offset on the fly + // offset to ensure all values are positive + let offset = data_iter1 + .enumerate() + .map(|(pos, val)| { + let calculated_value = first_val + (pos as f64 * slope) as u64; + val as i64 - calculated_value as i64 + }) + .min() + .unwrap() + .abs() as u64; + + //calc new max + let rel_max = data_iter2 + .enumerate() + .map(|(pos, val)| { + let calculated_value = first_val + (pos as f64 * slope) as u64; + (val + offset) - calculated_value + }) + .max() + .unwrap(); + + let amplitude = rel_max; + let num_bits = compute_num_bits(amplitude); + let mut bit_packer = BitPacker::new(); + for (pos, val) in data_iter.enumerate() { + let calculated_value = first_val + (pos as f64 * slope) as u64; + let diff = (val + offset) - calculated_value; + bit_packer.write(diff, num_bits, write)?; + } + bit_packer.close(write)?; + + let footer = LinearInterpolFooter { + relative_max_value: amplitude, + offset, + first_val, + last_val, + num_vals: stats.num_vals, + }; + footer.serialize(write)?; + Ok(()) + } +} +fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f64 { + (last_val as f64 - first_val as f64) / (num_vals as u64 - 1) as f64 +} + +impl FastFieldSerializerEstimate for LinearInterpolFastFieldSerializer { + fn estimate( + _fastfield_accessor: &impl FastFieldDataAccess, + stats: FastFieldStats, + ) -> (f32, &'static str) { + let amplitude = stats.max_value - stats.min_value; + let num_bits = compute_num_bits(amplitude); + let num_bits_uncompressed = 64; + let ratio = num_bits as f32 / num_bits_uncompressed as f32; + let name = Self::NAME; + (ratio, name) + } +} + +impl CodecId for LinearInterpolFastFieldSerializer { + const NAME: &'static str = "LinearInterpol"; + const ID: u8 = 2; +} + +#[cfg(test)] +mod tests { + use super::*; + + fn create_and_validate(data: &[u64]) -> (u64, u64) { + let mut out = vec![]; + LinearInterpolFastFieldSerializer::create( + &mut out, + &data, + crate::tests::stats_from_vec(&data), + data.iter().cloned(), + data.iter().cloned(), + data.iter().cloned(), + ) + .unwrap(); + + let reader = LinearinterpolFastFieldReader::open_from_bytes(&out).unwrap(); + for (doc, val) in data.iter().enumerate() { + assert_eq!(reader.get_u64(doc as u64, &out), *val); + } + (reader.footer.relative_max_value, reader.footer.offset) + } + + #[test] + fn linear_interpol_fast_field_test_simple() { + let data = (10..=20_u64).collect::>(); + + let (rel_max_value, offset) = create_and_validate(&data); + + assert_eq!(offset, 0); + assert_eq!(rel_max_value, 0); + } + + #[test] + fn linear_interpol_fast_field_test_with_offset() { + //let data = vec![5, 50, 95, 96, 97, 98, 99, 100]; + let mut data = vec![5, 6, 7, 8, 9, 10, 99, 100]; + create_and_validate(&data); + + data.reverse(); + create_and_validate(&data); + } + #[test] + fn linear_interpol_fast_field_test_no_structure() { + let mut data = vec![5, 50, 3, 13, 1, 1000, 35]; + create_and_validate(&data); + + data.reverse(); + create_and_validate(&data); + } + #[test] + fn linear_interpol_fast_field_rand() { + for _ in 0..500 { + let mut data = (0..1 + rand::random::() as usize) + .map(|_| rand::random::() as u64 / 2 as u64) + .collect::>(); + create_and_validate(&data); + + data.reverse(); + create_and_validate(&data); + } + } +} diff --git a/fastfield_codecs/src/linearinterpol/mod.rs b/fastfield_codecs/src/linearinterpol/mod.rs deleted file mode 100644 index 60d9f1e6e..000000000 --- a/fastfield_codecs/src/linearinterpol/mod.rs +++ /dev/null @@ -1,123 +0,0 @@ -mod serialize; - -use common::BinarySerializable; -use std::io; -use tantivy_bitpacker::compute_num_bits; -use tantivy_bitpacker::BitUnpacker; - -use crate::FastFieldDataAccess; -/// Depending on the field type, a different -/// fast field is required. -#[derive(Clone)] -pub struct LinearinterpolFastFieldReader<'data> { - bytes: &'data [u8], - bit_unpacker: BitUnpacker, - pub first_value: u64, - pub rel_max_value: u64, - pub offset: u64, - pub slope: f64, -} - -impl<'data> LinearinterpolFastFieldReader<'data> { - /// Opens a fast field given a file. - pub fn open_from_bytes(mut bytes: &'data [u8]) -> io::Result { - let rel_max_value = u64::deserialize(&mut bytes)?; - let offset = u64::deserialize(&mut bytes)?; - let first_value = u64::deserialize(&mut bytes)?; - let last_value = u64::deserialize(&mut bytes)?; - let num_vals = u64::deserialize(&mut bytes)?; - let slope = (last_value as f64 - first_value as f64) / (num_vals as u64 - 1) as f64; - - let num_bits = compute_num_bits(rel_max_value); - let bit_unpacker = BitUnpacker::new(num_bits); - Ok(LinearinterpolFastFieldReader { - bytes, - first_value, - rel_max_value, - offset, - bit_unpacker, - slope, - }) - } - pub fn get_u64(&self, doc: u64) -> u64 { - let calculated_value = self.first_value + (doc as f64 * self.slope) as u64; - calculated_value + self.bit_unpacker.get(doc, &self.bytes) - self.offset - - //self.offset + self.min_value + self.bit_unpacker.get(doc, &self.bytes) - } -} - -impl<'a> FastFieldDataAccess for &'a [u64] { - fn get(&self, doc: u32) -> u64 { - self[doc as usize] - } -} - -impl FastFieldDataAccess for Vec { - fn get(&self, doc: u32) -> u64 { - self[doc as usize] - } -} -#[cfg(test)] -mod tests { - use super::*; - - fn create_and_validate(data: &[u64]) -> (u64, u64) { - let mut out = vec![]; - serialize::LinearInterpolFastFieldSerializer::create( - &mut out, - &data, - crate::tests::stats_from_vec(&data), - data.iter().cloned(), - data.iter().cloned(), - data.iter().cloned(), - ) - .unwrap(); - - let reader = LinearinterpolFastFieldReader::open_from_bytes(&out).unwrap(); - for (doc, val) in data.iter().enumerate() { - assert_eq!(reader.get_u64(doc as u64), *val); - } - (reader.rel_max_value, reader.offset) - } - - #[test] - fn linear_interpol_fast_field_test_simple() { - let data = (10..=20_u64).collect::>(); - - let (rel_max_value, offset) = create_and_validate(&data); - - assert_eq!(offset, 0); - assert_eq!(rel_max_value, 0); - } - - #[test] - fn linear_interpol_fast_field_test_with_offset() { - //let data = vec![5, 50, 95, 96, 97, 98, 99, 100]; - let mut data = vec![5, 6, 7, 8, 9, 10, 99, 100]; - create_and_validate(&data); - - data.reverse(); - create_and_validate(&data); - } - #[test] - fn linear_interpol_fast_field_test_no_structure() { - let mut data = vec![5, 50, 3, 13, 1, 1000, 35]; - create_and_validate(&data); - - data.reverse(); - create_and_validate(&data); - } - #[test] - fn linear_interpol_fast_field_rand() { - for i in 0..50000 { - let mut data = (0..1 + rand::random::() as usize) - .map(|num| rand::random::() as u64 / 2 as u64) - .collect::>(); - create_and_validate(&data); - - data.reverse(); - create_and_validate(&data); - } - } -} diff --git a/fastfield_codecs/src/linearinterpol/serialize.rs b/fastfield_codecs/src/linearinterpol/serialize.rs deleted file mode 100644 index 0c99bfde0..000000000 --- a/fastfield_codecs/src/linearinterpol/serialize.rs +++ /dev/null @@ -1,87 +0,0 @@ -use crate::FastFieldDataAccess; -use crate::FastFieldSerializerEstimate; -use crate::FastFieldStats; -use common::BinarySerializable; -use std::io::{self, Write}; -use tantivy_bitpacker::compute_num_bits; -use tantivy_bitpacker::BitPacker; - -/// Fastfield serializer, which tries to guess values by linear interpolation -/// and stores the difference. -pub struct LinearInterpolFastFieldSerializer {} - -// TODO not suitable if max is larger than i64::MAX / 2 - -impl LinearInterpolFastFieldSerializer { - /// Creates a new fast field serializer. - pub fn create( - write: &mut impl Write, - fastfield_accessor: &impl FastFieldDataAccess, - stats: FastFieldStats, - data_iter: impl Iterator, - data_iter1: impl Iterator, - data_iter2: impl Iterator, - ) -> io::Result<()> { - assert!(stats.min_value <= stats.max_value); - - //let first_val = stats.min_value; - //let last_val = stats.max_value; - let first_val = fastfield_accessor.get(0); - let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1); - let slope = (last_val as f64 - first_val as f64) / (stats.num_vals as u64 - 1) as f64; - // offset to ensure all values are positive - let offset = data_iter1 - .enumerate() - .map(|(pos, val)| { - let calculated_value = first_val + (pos as f64 * slope) as u64; - val as i64 - calculated_value as i64 - }) - .min() - .unwrap() - .abs() as u64; - - //calc new max - let rel_max = data_iter2 - .enumerate() - .map(|(pos, val)| { - let calculated_value = first_val + (pos as f64 * slope) as u64; - (val + offset) - calculated_value - }) - .max() - .unwrap(); - - let amplitude = rel_max; - amplitude.serialize(write)?; - offset.serialize(write)?; - first_val.serialize(write)?; - last_val.serialize(write)?; - stats.num_vals.serialize(write)?; - let num_bits = compute_num_bits(amplitude); - let mut bit_packer = BitPacker::new(); - for (pos, val) in data_iter.enumerate() { - let calculated_value = first_val + (pos as f64 * slope) as u64; - let diff = (val + offset) - calculated_value; - bit_packer.write(diff, num_bits, write)?; - } - bit_packer.close(write)?; - - Ok(()) - } -} - -impl FastFieldSerializerEstimate for LinearInterpolFastFieldSerializer { - fn estimate( - _fastfield_accessor: &impl FastFieldDataAccess, - stats: FastFieldStats, - ) -> (f32, &'static str) { - let amplitude = stats.max_value - stats.min_value; - let num_bits = compute_num_bits(amplitude); - let num_bits_uncompressed = 64; - let ratio = num_bits as f32 / num_bits_uncompressed as f32; - let name = Self::codec_id().0; - (ratio, name) - } - fn codec_id() -> (&'static str, u8) { - ("LinearInterpol", 2) - } -} diff --git a/src/directory/owned_bytes.rs b/src/directory/owned_bytes.rs index 1983d41c8..d8f493416 100644 --- a/src/directory/owned_bytes.rs +++ b/src/directory/owned_bytes.rs @@ -56,12 +56,6 @@ impl OwnedBytes { self.data } - /// Returns the underlying slice of data. - /// `Deref` and `AsRef` are also available. - #[inline] - pub fn into_slice(self) -> &'static [u8] { - self.data - } /// Returns the len of the slice. #[inline] pub fn len(&self) -> usize { diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 5f36a1d4d..3e86198ac 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -107,7 +107,8 @@ impl FastFieldReader for DynamicFastFieldReader { /// fast field is required. #[derive(Clone)] pub struct BitpackedFastFieldReader { - reader: BitpackedReader<'static>, + reader: BitpackedReader, + bytes: OwnedBytes, _phantom: PhantomData, } @@ -118,16 +119,17 @@ impl BitpackedFastFieldReader { let _id = u8::deserialize(&mut bytes)?; Self::open_from_bytes(bytes) } - /// Opens a fast field given a file. + /// Opens a fast field given the bytes. pub fn open_from_bytes(bytes: OwnedBytes) -> crate::Result { - let reader = BitpackedReader::open_from_bytes(bytes.into_slice())?; + let reader = BitpackedReader::open_from_bytes(bytes.as_slice())?; Ok(BitpackedFastFieldReader { reader, + bytes, _phantom: PhantomData, }) } pub(crate) fn get_u64(&self, doc: u64) -> Item { - Item::from_u64(self.reader.get_u64(doc)) + Item::from_u64(self.reader.get_u64(doc, self.bytes.as_slice())) } /// Internally `multivalued` also use SingleValue Fast fields. diff --git a/src/fastfield/serializer/bitpacked.rs b/src/fastfield/serializer/bitpacked.rs deleted file mode 100644 index 8dc6e9c4f..000000000 --- a/src/fastfield/serializer/bitpacked.rs +++ /dev/null @@ -1,97 +0,0 @@ -use super::FastFieldDataAccess; -use super::FastFieldSerializer; -use super::FastFieldSerializerEstimate; -use super::FastFieldStats; -use crate::common::BinarySerializable; -use std::io::{self, Write}; -use tantivy_bitpacker::compute_num_bits; -use tantivy_bitpacker::BitPacker; - -pub struct BitpackedFastFieldSerializer<'a, W: 'a + Write> { - bit_packer: BitPacker, - write: &'a mut W, - min_value: u64, - num_bits: u8, -} - -impl<'a, W: Write> BitpackedFastFieldSerializer<'a, W> { - /// Creates a new fast field serializer. - /// - /// The serializer in fact encode the values by bitpacking - /// `(val - min_value)`. - /// - /// It requires a `min_value` and a `max_value` to compute - /// compute the minimum number of bits required to encode - /// values. - pub(crate) fn open( - write: &'a mut W, - min_value: u64, - max_value: u64, - ) -> io::Result> { - assert!(min_value <= max_value); - min_value.serialize(write)?; - let amplitude = max_value - min_value; - amplitude.serialize(write)?; - let num_bits = compute_num_bits(amplitude); - let bit_packer = BitPacker::new(); - Ok(BitpackedFastFieldSerializer { - bit_packer, - write, - min_value, - num_bits, - }) - } - /// Creates a new fast field serializer. - /// - /// The serializer in fact encode the values by bitpacking - /// `(val - min_value)`. - /// - /// It requires a `min_value` and a `max_value` to compute - /// compute the minimum number of bits required to encode - /// values. - pub(crate) fn create( - write: &'a mut W, - fastfield_accessor: &impl FastFieldDataAccess, - stats: FastFieldStats, - data_iter: impl Iterator, - ) -> io::Result<()> { - let mut serializer = Self::open(write, stats.min_value, stats.max_value)?; - - for val in data_iter { - serializer.add_val(val)?; - } - serializer.close_field()?; - - Ok(()) - } -} - -impl<'a, W: 'a + Write> FastFieldSerializer for BitpackedFastFieldSerializer<'a, W> { - /// Pushes a new value to the currently open u64 fast field. - fn add_val(&mut self, val: u64) -> io::Result<()> { - let val_to_write: u64 = val - self.min_value; - self.bit_packer - .write(val_to_write, self.num_bits, &mut self.write)?; - Ok(()) - } - fn close_field(mut self) -> io::Result<()> { - self.bit_packer.close(&mut self.write) - } -} - -impl<'a, W: 'a + Write> FastFieldSerializerEstimate for BitpackedFastFieldSerializer<'a, W> { - fn estimate( - _fastfield_accessor: &impl FastFieldDataAccess, - stats: FastFieldStats, - ) -> (f32, &'static str) { - let amplitude = stats.max_value - stats.min_value; - let num_bits = compute_num_bits(amplitude); - let num_bits_uncompressed = 64; - let ratio = num_bits as f32 / num_bits_uncompressed as f32; - let name = Self::codec_id().0; - (ratio, name) - } - fn codec_id() -> (&'static str, u8) { - ("Bitpacked", 1) - } -} diff --git a/src/fastfield/serializer/linearinterpol.rs b/src/fastfield/serializer/linearinterpol.rs deleted file mode 100644 index 888a33146..000000000 --- a/src/fastfield/serializer/linearinterpol.rs +++ /dev/null @@ -1,78 +0,0 @@ -use super::FastFieldDataAccess; -use super::FastFieldSerializerEstimate; -use super::FastFieldStats; -use crate::common::BinarySerializable; -use std::io::{self, Write}; -use tantivy_bitpacker::compute_num_bits; -use tantivy_bitpacker::BitPacker; - -/// Fastfield serializer, which tries to guess values by linear interpolation -/// and stores the difference. -pub struct LinearInterpolFastFieldSerializer {} - -impl LinearInterpolFastFieldSerializer { - /// Creates a new fast field serializer. - pub(crate) fn create( - write: &mut impl Write, - _fastfield_accessor: &impl FastFieldDataAccess, - stats: FastFieldStats, - data_iter: impl Iterator, - data_iter1: impl Iterator, - data_iter2: impl Iterator, - ) -> io::Result<()> { - assert!(stats.min_value <= stats.max_value); - - let step = (stats.max_value - stats.min_value) as f64 / (stats.num_vals as u64 - 1) as f64; - // offset to ensure all values are positive - let offset = data_iter1 - .enumerate() - .map(|(pos, val)| { - let calculated_value = stats.min_value + (pos as f64 * step) as u64; - val as i64 - calculated_value as i64 - }) - .min() - .unwrap() - .abs() as u64; - - //calc new max - let rel_max = data_iter2 - .enumerate() - .map(|(pos, val)| { - let calculated_value = stats.min_value + (pos as f64 * step) as u64; - (val + offset) - calculated_value - }) - .max() - .unwrap(); - - stats.min_value.serialize(write)?; - let amplitude = rel_max; - amplitude.serialize(write)?; - offset.serialize(write)?; - stats.min_value.serialize(write)?; - let num_bits = compute_num_bits(amplitude); - let mut bit_packer = BitPacker::new(); - for val in data_iter { - bit_packer.write(val, num_bits, write)?; - } - bit_packer.close(write)?; - - Ok(()) - } -} - -impl FastFieldSerializerEstimate for LinearInterpolFastFieldSerializer { - fn estimate( - _fastfield_accessor: &impl FastFieldDataAccess, - stats: FastFieldStats, - ) -> (f32, &'static str) { - let amplitude = stats.max_value - stats.min_value; - let num_bits = compute_num_bits(amplitude); - let num_bits_uncompressed = 64; - let ratio = num_bits as f32 / num_bits_uncompressed as f32; - let name = Self::codec_id().0; - (ratio, name) - } - fn codec_id() -> (&'static str, u8) { - ("LinearInterpol", 2) - } -} diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs index 14050a217..26669d6e3 100644 --- a/src/fastfield/serializer/mod.rs +++ b/src/fastfield/serializer/mod.rs @@ -1,12 +1,9 @@ -mod bitpacked; -mod linearinterpol; - use crate::common::BinarySerializable; use crate::common::CompositeWrite; use crate::common::CountingWriter; use crate::directory::WritePtr; use crate::schema::Field; -use crate::DocId; +use fastfield_codecs::CodecId; //pub use bitpacked::BitpackedFastFieldSerializer; pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer; pub use fastfield_codecs::FastFieldDataAccess; @@ -57,13 +54,14 @@ impl CompositeFastFieldSerializer { ) -> io::Result<()> { let field_write = self.composite_write.for_field_with_idx(field, 0); - let (_ratio, (name, id)) = ( + let (_ratio, name, id) = ( BitpackedFastFieldSerializer::>::estimate(&fastfield_accessor, stats.clone()), - BitpackedFastFieldSerializer::>::codec_id(), + BitpackedFastFieldSerializer::>::NAME, + BitpackedFastFieldSerializer::>::ID, ); id.serialize(field_write)?; - if name == BitpackedFastFieldSerializer::>::codec_id().0 { + if name == BitpackedFastFieldSerializer::>::NAME { BitpackedFastFieldSerializer::create( field_write, &fastfield_accessor, @@ -97,7 +95,7 @@ impl CompositeFastFieldSerializer { ) -> io::Result>> { let field_write = self.composite_write.for_field_with_idx(field, idx); // Prepend codec id to field data for compatibility with DynamicFastFieldReader. - let (_name, id) = BitpackedFastFieldSerializer::>::codec_id(); + let id = BitpackedFastFieldSerializer::>::ID; id.serialize(field_write)?; BitpackedFastFieldSerializer::open(field_write, min_value, max_value) }