diff --git a/fastfield_codecs/src/bitpacked.rs b/fastfield_codecs/src/bitpacked.rs index a07416998..02f813d62 100644 --- a/fastfield_codecs/src/bitpacked.rs +++ b/fastfield_codecs/src/bitpacked.rs @@ -17,22 +17,7 @@ pub struct BitpackedFastFieldReader { } impl FastFieldCodecReader for BitpackedFastFieldReader { - /// Opens a fast field given a file. - fn open_from_bytes(bytes: OwnedBytes) -> io::Result { - let footer_offset = bytes.len() - 16; - let (data, mut footer) = bytes.split(footer_offset); - let min_value = u64::deserialize(&mut footer)?; - let amplitude = u64::deserialize(&mut footer)?; - let max_value = min_value + amplitude; - let num_bits = compute_num_bits(amplitude); - let bit_unpacker = BitUnpacker::new(num_bits); - Ok(BitpackedFastFieldReader { - data, - min_value_u64: min_value, - max_value_u64: max_value, - bit_unpacker, - }) - } + #[inline] fn get_u64(&self, doc: u64) -> u64 { self.min_value_u64 + self.bit_unpacker.get(doc, &self.data) @@ -96,11 +81,30 @@ impl<'a, W: Write> BitpackedFastFieldSerializerLegacy<'a, W> { } } -pub struct BitpackedFastFieldSerializer {} +pub struct BitpackedFastFieldSerializer; impl FastFieldCodecSerializer for BitpackedFastFieldSerializer { const NAME: &'static str = "Bitpacked"; - const ID: u8 = 1; + + type Reader = BitpackedFastFieldReader; + + /// Opens a fast field given a file. + fn open_from_bytes(bytes: OwnedBytes) -> io::Result { + let footer_offset = bytes.len() - 16; + let (data, mut footer) = bytes.split(footer_offset); + let min_value = u64::deserialize(&mut footer)?; + let amplitude = u64::deserialize(&mut footer)?; + let max_value = min_value + amplitude; + let num_bits = compute_num_bits(amplitude); + let bit_unpacker = BitUnpacker::new(num_bits); + Ok(BitpackedFastFieldReader { + data, + min_value_u64: min_value, + max_value_u64: max_value, + bit_unpacker, + }) + } + /// Serializes data with the BitpackedFastFieldSerializer. 
/// /// The serializer in fact encode the values by bitpacking @@ -146,7 +150,7 @@ mod tests { use crate::tests::get_codec_test_data_sets; fn create_and_validate(data: &[u64], name: &str) { - crate::tests::create_and_validate::( + crate::tests::create_and_validate::( data, name, ); } diff --git a/fastfield_codecs/src/dynamic.rs b/fastfield_codecs/src/dynamic.rs new file mode 100644 index 000000000..fcdbc3642 --- /dev/null +++ b/fastfield_codecs/src/dynamic.rs @@ -0,0 +1,148 @@ +// Copyright (C) 2022 Quickwit, Inc. +// +// Quickwit is offered under the AGPL v3.0 and as commercial software. +// For commercial licensing, contact us at hello@quickwit.io. +// +// AGPL: +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. 
+// + +use std::io; +use std::sync::Arc; + +use ownedbytes::OwnedBytes; + +use crate::FastFieldCodecSerializer; +use crate::bitpacked::BitpackedFastFieldSerializer; +use crate::linearinterpol::LinearInterpolFastFieldSerializer; +use crate::FastFieldCodecReader; +use crate::gcd::GCDFastFieldCodecSerializer; +use crate::multilinearinterpol::MultiLinearInterpolFastFieldSerializer; + +struct DynamicFastFieldSerializer; + +impl FastFieldCodecSerializer for DynamicFastFieldSerializer { + const NAME: &'static str = "dynamic"; + + type Reader = DynamicFastFieldReader; + + fn is_applicable(fastfield_accessor: &impl crate::FastFieldDataAccess, stats: crate::FastFieldStats) -> bool { + todo!() + } + + fn estimate(fastfield_accessor: &impl crate::FastFieldDataAccess, stats: crate::FastFieldStats) -> f32 { + todo!() + } + + fn serialize( + write: &mut impl io::Write, + fastfield_accessor: &dyn crate::FastFieldDataAccess, + stats: crate::FastFieldStats, + data_iter: impl Iterator, + data_iter1: impl Iterator, + ) -> io::Result<()> { + todo!() + } + + fn open_from_bytes(mut bytes: OwnedBytes) -> io::Result { + let codec_code = bytes.read_u8(); + let codec_type = CodecType::from_code(codec_code).ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("Unknown codec code `{codec_code}`"), + ) + })?; + let fast_field_reader: Arc = match codec_type { + CodecType::Bitpacked => Arc::new(BitpackedFastFieldSerializer::open_from_bytes(bytes)?), + CodecType::LinearInterpol => { + Arc::new(LinearInterpolFastFieldSerializer::open_from_bytes(bytes)?) + } + CodecType::MultiLinearInterpol => { + Arc::new(MultiLinearInterpolFastFieldSerializer::open_from_bytes(bytes)?) 
+ } + CodecType::Gcd => { + let inner_codec_id = bytes.read_u8(); + let inner_codec_type = CodecType::from_code(inner_codec_id).ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("Unknown codec code `{codec_code}`"), + ) + })?; + match inner_codec_type { + CodecType::Bitpacked => { + Arc::new(GCDFastFieldCodecSerializer::::open_from_bytes(bytes)?) + } + CodecType::LinearInterpol => { + Arc::new(GCDFastFieldCodecSerializer::::open_from_bytes(bytes)?) + } + CodecType::MultiLinearInterpol => { + Arc::new(GCDFastFieldCodecSerializer::::open_from_bytes(bytes)?) + } + CodecType::Gcd => { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "A GCD codec may not wrap another GCD codec.", + )); + } + } + } + }; + Ok(DynamicFastFieldReader(fast_field_reader)) + } +} + + +#[derive(Clone)] +/// DynamicFastFieldReader wraps different readers to access +/// the various encoded fastfield data +pub struct DynamicFastFieldReader(Arc); + +#[repr(u8)] +#[derive(Debug, Clone, Copy)] +enum CodecType { + Bitpacked = 0, + LinearInterpol = 1, + MultiLinearInterpol = 2, + Gcd = 3, +} + +impl CodecType { + pub fn from_code(code: u8) -> Option { + match code { + 0 => Some(CodecType::Bitpacked), + 1 => Some(CodecType::LinearInterpol), + 2 => Some(CodecType::MultiLinearInterpol), + 3 => Some(CodecType::Gcd), + _ => None, + } + } + + pub fn to_code(self) -> u8 { + self as u8 + } +} + +impl FastFieldCodecReader for DynamicFastFieldReader { + fn get_u64(&self, doc: u64) -> u64 { + self.0.get_u64(doc) + } + + fn min_value(&self) -> u64 { + self.0.min_value() + } + + fn max_value(&self) -> u64 { + self.0.max_value() + } +} diff --git a/fastfield_codecs/src/gcd.rs b/fastfield_codecs/src/gcd.rs index 1aed3cee9..d09b49dbe 100644 --- a/fastfield_codecs/src/gcd.rs +++ b/fastfield_codecs/src/gcd.rs @@ -1,44 +1,71 @@ -use std::io::{self, Write}; +use std::{io::{self, Write}, marker::PhantomData, num::NonZeroU64}; use common::BinarySerializable; use fastdivide::DividerU64; 
use ownedbytes::OwnedBytes; -use crate::FastFieldCodecReader; - -pub const GCD_DEFAULT: u64 = 1; -pub const GCD_CODEC_ID: u8 = 4; +use crate::{FastFieldCodecReader, FastFieldCodecSerializer}; /// Wrapper for accessing a fastfield. /// /// Holds the data and the codec to the read the data. #[derive(Clone)] -pub struct GCDFastFieldCodec { +pub struct GCDFastFieldCodecReader { gcd: u64, min_value: u64, reader: CodecReader, } -impl FastFieldCodecReader for GCDFastFieldCodec { - /// Opens a fast field given the bytes. - fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result { + +pub struct GCDFastFieldCodecSerializer { + _wrapped_type: PhantomData, +} + +impl GCDFastFieldCodecSerializer {} + +impl FastFieldCodecSerializer for GCDFastFieldCodecSerializer { + // TODO Fixme. We would like the underlying codec name as well. + const NAME: &'static str = "GCD"; + + type Reader = GCDFastFieldCodecReader; + + fn is_applicable(fastfield_accessor: &impl crate::FastFieldDataAccess, stats: crate::FastFieldStats) -> bool { + todo!() + } + + fn estimate(fastfield_accessor: &impl crate::FastFieldDataAccess, stats: crate::FastFieldStats) -> f32 { + todo!() + } + + fn serialize( + write: &mut impl Write, + fastfield_accessor: &dyn crate::FastFieldDataAccess, + stats: crate::FastFieldStats, + data_iter: impl Iterator, + data_iter1: impl Iterator, + ) -> io::Result<()> { + todo!() + } + + fn open_from_bytes(bytes: OwnedBytes) -> io::Result { let footer_offset = bytes.len() - 16; let (body, mut footer) = bytes.split(footer_offset); let gcd = u64::deserialize(&mut footer)?; let min_value = u64::deserialize(&mut footer)?; - let reader = C::open_from_bytes(body)?; - Ok(GCDFastFieldCodec { + let reader = WrappedCodecSerializer::open_from_bytes(body)?; + Ok(GCDFastFieldCodecReader { gcd, min_value, reader, }) } +} + + +impl FastFieldCodecReader for GCDFastFieldCodecReader { #[inline] fn get_u64(&self, doc: u64) -> u64 { - let mut data = self.reader.get_u64(doc); - data *= self.gcd; - data += 
self.min_value; - data + self.min_value + self.gcd * self.reader.get_u64(doc) } fn min_value(&self) -> u64 { @@ -64,11 +91,13 @@ fn compute_gcd(mut left: u64, mut right: u64) -> u64 { } // Find GCD for iterator of numbers -pub fn find_gcd(numbers: impl Iterator) -> Option { +// +// If all numbers are '0' (or if there are no numbers), return None. +pub fn find_gcd(numbers: impl Iterator) -> Option { let mut numbers = numbers.filter(|n| *n != 0); let mut gcd = numbers.next()?; if gcd == 1 { - return Some(1); + return NonZeroU64::new(gcd); } let mut gcd_divider = DividerU64::divide_by(gcd); @@ -79,151 +108,150 @@ pub fn find_gcd(numbers: impl Iterator) -> Option { } gcd = compute_gcd(gcd, val); if gcd == 1 { - return Some(1); + return NonZeroU64::new(1); } gcd_divider = DividerU64::divide_by(gcd); } - Some(gcd) + NonZeroU64::new(gcd) } #[cfg(test)] mod tests { - /* - TODO Move test + // TODO Move test + // + // use std::collections::HashMap; + // use std::path::Path; + // + // use crate::directory::{CompositeFile, RamDirectory, WritePtr}; + // use crate::fastfield::serializer::FastFieldCodecEnableCheck; + // use crate::fastfield::tests::{FIELD, FIELDI64, SCHEMA, SCHEMAI64}; + // use super::{ + // find_gcd, CompositeFastFieldSerializer, DynamicFastFieldReader, FastFieldCodecName, + // FastFieldReader, FastFieldsWriter, ALL_CODECS, + // }; + // use crate::schema::Schema; + // use crate::Directory; + // + // fn get_index( + // docs: &[crate::Document], + // schema: &Schema, + // codec_enable_checker: FastFieldCodecEnableCheck, + // ) -> crate::Result { + // let directory: RamDirectory = RamDirectory::create(); + // { + // let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); + // let mut serializer = + // CompositeFastFieldSerializer::from_write_with_codec(write, codec_enable_checker) + // .unwrap(); + // let mut fast_field_writers = FastFieldsWriter::from_schema(schema); + // for doc in docs { + // fast_field_writers.add_document(doc); + // } + // 
fast_field_writers + // .serialize(&mut serializer, &HashMap::new(), None) + // .unwrap(); + // serializer.close().unwrap(); + // } + // Ok(directory) + // } + // + // fn test_fastfield_gcd_i64_with_codec( + // codec_name: FastFieldCodecName, + // num_vals: usize, + // ) -> crate::Result<()> { + // let path = Path::new("test"); + // let mut docs = vec![]; + // for i in 1..=num_vals { + // let val = i as i64 * 1000i64; + // docs.push(doc!(*FIELDI64=>val)); + // } + // let directory = get_index(&docs, &SCHEMAI64, codec_name.clone().into())?; + // let file = directory.open_read(path).unwrap(); + // assert_eq!(file.len(), 118); + // let composite_file = CompositeFile::open(&file)?; + // let file = composite_file.open_read(*FIELD).unwrap(); + // let fast_field_reader = DynamicFastFieldReader::::open(file)?; + // assert_eq!(fast_field_reader.get(0), 1000i64); + // assert_eq!(fast_field_reader.get(1), 2000i64); + // assert_eq!(fast_field_reader.get(2), 3000i64); + // assert_eq!(fast_field_reader.max_value(), num_vals as i64 * 1000); + // assert_eq!(fast_field_reader.min_value(), 1000i64); + // let file = directory.open_read(path).unwrap(); + // + // Can't apply gcd + // let path = Path::new("test"); + // docs.pop(); + // docs.push(doc!(*FIELDI64=>2001i64)); + // let directory = get_index(&docs, &SCHEMAI64, codec_name.into())?; + // let file2 = directory.open_read(path).unwrap(); + // assert!(file2.len() > file.len()); + // + // Ok(()) + // } + // + // #[test] + // fn test_fastfield_gcd_i64() -> crate::Result<()> { + // for codec_name in ALL_CODECS { + // test_fastfield_gcd_i64_with_codec(codec_name.clone(), 5005)?; + // } + // Ok(()) + // } + // + // fn test_fastfield_gcd_u64_with_codec( + // codec_name: FastFieldCodecName, + // num_vals: usize, + // ) -> crate::Result<()> { + // let path = Path::new("test"); + // let mut docs = vec![]; + // for i in 1..=num_vals { + // let val = i as u64 * 1000u64; + // docs.push(doc!(*FIELD=>val)); + // } + // let directory = 
get_index(&docs, &SCHEMA, codec_name.clone().into())?; + // let file = directory.open_read(path).unwrap(); + // assert_eq!(file.len(), 118); + // let composite_file = CompositeFile::open(&file)?; + // let file = composite_file.open_read(*FIELD).unwrap(); + // let fast_field_reader = DynamicFastFieldReader::::open(file)?; + // assert_eq!(fast_field_reader.get(0), 1000u64); + // assert_eq!(fast_field_reader.get(1), 2000u64); + // assert_eq!(fast_field_reader.get(2), 3000u64); + // assert_eq!(fast_field_reader.max_value(), num_vals as u64 * 1000); + // assert_eq!(fast_field_reader.min_value(), 1000u64); + // let file = directory.open_read(path).unwrap(); + // + // Can't apply gcd + // let path = Path::new("test"); + // docs.pop(); + // docs.push(doc!(*FIELDI64=>2001u64)); + // let directory = get_index(&docs, &SCHEMA, codec_name.into())?; + // let file2 = directory.open_read(path).unwrap(); + // assert!(file2.len() > file.len()); + // + // Ok(()) + // } + // + // #[test] + // fn test_fastfield_gcd_u64() -> crate::Result<()> { + // for codec_name in ALL_CODECS { + // test_fastfield_gcd_u64_with_codec(codec_name.clone(), 5005)?; + // } + // Ok(()) + // } + // + // #[test] + // pub fn test_fastfield2() { + // let test_fastfield = DynamicFastFieldReader::::from(vec![100, 200, 300]); + // assert_eq!(test_fastfield.get(0), 100); + // assert_eq!(test_fastfield.get(1), 200); + // assert_eq!(test_fastfield.get(2), 300); + // } - use std::collections::HashMap; - use std::path::Path; + use std::num::NonZeroU64; - use crate::directory::{CompositeFile, RamDirectory, WritePtr}; - use crate::fastfield::serializer::FastFieldCodecEnableCheck; - use crate::fastfield::tests::{FIELD, FIELDI64, SCHEMA, SCHEMAI64}; - use super::{ - find_gcd, CompositeFastFieldSerializer, DynamicFastFieldReader, FastFieldCodecName, - FastFieldReader, FastFieldsWriter, ALL_CODECS, - }; - use crate::schema::Schema; - use crate::Directory; - - fn get_index( - docs: &[crate::Document], - schema: &Schema, - 
codec_enable_checker: FastFieldCodecEnableCheck, - ) -> crate::Result { - let directory: RamDirectory = RamDirectory::create(); - { - let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); - let mut serializer = - CompositeFastFieldSerializer::from_write_with_codec(write, codec_enable_checker) - .unwrap(); - let mut fast_field_writers = FastFieldsWriter::from_schema(schema); - for doc in docs { - fast_field_writers.add_document(doc); - } - fast_field_writers - .serialize(&mut serializer, &HashMap::new(), None) - .unwrap(); - serializer.close().unwrap(); - } - Ok(directory) - } - - fn test_fastfield_gcd_i64_with_codec( - codec_name: FastFieldCodecName, - num_vals: usize, - ) -> crate::Result<()> { - let path = Path::new("test"); - let mut docs = vec![]; - for i in 1..=num_vals { - let val = i as i64 * 1000i64; - docs.push(doc!(*FIELDI64=>val)); - } - let directory = get_index(&docs, &SCHEMAI64, codec_name.clone().into())?; - let file = directory.open_read(path).unwrap(); - // assert_eq!(file.len(), 118); - let composite_file = CompositeFile::open(&file)?; - let file = composite_file.open_read(*FIELD).unwrap(); - let fast_field_reader = DynamicFastFieldReader::::open(file)?; - assert_eq!(fast_field_reader.get(0), 1000i64); - assert_eq!(fast_field_reader.get(1), 2000i64); - assert_eq!(fast_field_reader.get(2), 3000i64); - assert_eq!(fast_field_reader.max_value(), num_vals as i64 * 1000); - assert_eq!(fast_field_reader.min_value(), 1000i64); - let file = directory.open_read(path).unwrap(); - - // Can't apply gcd - let path = Path::new("test"); - docs.pop(); - docs.push(doc!(*FIELDI64=>2001i64)); - let directory = get_index(&docs, &SCHEMAI64, codec_name.into())?; - let file2 = directory.open_read(path).unwrap(); - assert!(file2.len() > file.len()); - - Ok(()) - } - - #[test] - fn test_fastfield_gcd_i64() -> crate::Result<()> { - for codec_name in ALL_CODECS { - test_fastfield_gcd_i64_with_codec(codec_name.clone(), 5005)?; - } - Ok(()) - } - - fn 
test_fastfield_gcd_u64_with_codec( - codec_name: FastFieldCodecName, - num_vals: usize, - ) -> crate::Result<()> { - let path = Path::new("test"); - let mut docs = vec![]; - for i in 1..=num_vals { - let val = i as u64 * 1000u64; - docs.push(doc!(*FIELD=>val)); - } - let directory = get_index(&docs, &SCHEMA, codec_name.clone().into())?; - let file = directory.open_read(path).unwrap(); - // assert_eq!(file.len(), 118); - let composite_file = CompositeFile::open(&file)?; - let file = composite_file.open_read(*FIELD).unwrap(); - let fast_field_reader = DynamicFastFieldReader::::open(file)?; - assert_eq!(fast_field_reader.get(0), 1000u64); - assert_eq!(fast_field_reader.get(1), 2000u64); - assert_eq!(fast_field_reader.get(2), 3000u64); - assert_eq!(fast_field_reader.max_value(), num_vals as u64 * 1000); - assert_eq!(fast_field_reader.min_value(), 1000u64); - let file = directory.open_read(path).unwrap(); - - // Can't apply gcd - let path = Path::new("test"); - docs.pop(); - docs.push(doc!(*FIELDI64=>2001u64)); - let directory = get_index(&docs, &SCHEMA, codec_name.into())?; - let file2 = directory.open_read(path).unwrap(); - assert!(file2.len() > file.len()); - - Ok(()) - } - - #[test] - fn test_fastfield_gcd_u64() -> crate::Result<()> { - for codec_name in ALL_CODECS { - test_fastfield_gcd_u64_with_codec(codec_name.clone(), 5005)?; - } - Ok(()) - } - - #[test] - pub fn test_fastfield2() { - let test_fastfield = DynamicFastFieldReader::::from(vec![100, 200, 300]); - assert_eq!(test_fastfield.get(0), 100); - assert_eq!(test_fastfield.get(1), 200); - assert_eq!(test_fastfield.get(2), 300); - } - */ - - use crate::gcd::compute_gcd; - use crate::gcd::find_gcd; + use crate::gcd::{compute_gcd, find_gcd}; #[test] fn test_compute_gcd() { @@ -238,16 +266,15 @@ mod tests { assert_eq!(compute_gcd(25, 25), 25); } - #[test] fn find_gcd_test() { assert_eq!(find_gcd([0].into_iter()), None); - assert_eq!(find_gcd([0, 10].into_iter()), Some(10)); - assert_eq!(find_gcd([10, 
0].into_iter()), Some(10)); + assert_eq!(find_gcd([0, 10].into_iter()), NonZeroU64::new(10)); + assert_eq!(find_gcd([10, 0].into_iter()), NonZeroU64::new(10)); assert_eq!(find_gcd([].into_iter()), None); - assert_eq!(find_gcd([15, 30, 5, 10].into_iter()), Some(5)); - assert_eq!(find_gcd([15, 16, 10].into_iter()), Some(1)); - assert_eq!(find_gcd([0, 5, 5, 5].into_iter()), Some(5)); - assert_eq!(find_gcd([0, 0].into_iter()), Some(0)); + assert_eq!(find_gcd([15, 30, 5, 10].into_iter()), NonZeroU64::new(5)); + assert_eq!(find_gcd([15, 16, 10].into_iter()), NonZeroU64::new(1)); + assert_eq!(find_gcd([0, 5, 5, 5].into_iter()), NonZeroU64::new(5)); + assert_eq!(find_gcd([0, 0].into_iter()), None); } } diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index 5281e14f8..40c58d7fa 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -8,13 +8,13 @@ use std::io::Write; use ownedbytes::OwnedBytes; pub mod bitpacked; +pub mod dynamic; pub mod gcd; pub mod linearinterpol; pub mod multilinearinterpol; -pub trait FastFieldCodecReader: Sized { +pub trait FastFieldCodecReader{ /// reads the metadata and returns the CodecReader - fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result; fn get_u64(&self, doc: u64) -> u64; fn min_value(&self) -> u64; fn max_value(&self) -> u64; @@ -23,10 +23,10 @@ pub trait FastFieldCodecReader: Sized { /// The FastFieldSerializerEstimate trait is required on all variants /// of fast field compressions, to decide which one to choose. pub trait FastFieldCodecSerializer { - /// A codex needs to provide a unique name and id, which is - /// used for debugging and de/serialization. + /// A codec needs to provide a unique name used for debugging and de/serialization. 
const NAME: &'static str; - const ID: u8; + + type Reader: FastFieldCodecReader; /// Check if the Codec is able to compress the data fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> bool; @@ -48,6 +48,8 @@ pub trait FastFieldCodecSerializer { data_iter: impl Iterator, data_iter1: impl Iterator, ) -> io::Result<()>; + + fn open_from_bytes(bytes: OwnedBytes) -> io::Result; } /// FastFieldDataAccess is the trait to access fast field data during serialization and estimation. @@ -91,7 +93,7 @@ mod tests { MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer, }; - pub fn create_and_validate( + pub fn create_and_validate( data: &[u64], name: &str, ) -> (f32, f32) { @@ -111,7 +113,7 @@ mod tests { let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0); - let reader = R::open_from_bytes(OwnedBytes::new(out)).unwrap(); + let reader = S::open_from_bytes(OwnedBytes::new(out)).unwrap(); for (doc, orig_val) in data.iter().enumerate() { let val = reader.get_u64(doc as u64); if val != *orig_val { @@ -143,7 +145,7 @@ mod tests { let codec_name = S::NAME; for (data, data_set_name) in get_codec_test_data_sets() { let (estimate, actual) = - crate::tests::create_and_validate::(&data, data_set_name); + crate::tests::create_and_validate::(&data, data_set_name); let result = if estimate == f32::MAX { "Disabled".to_string() } else { diff --git a/fastfield_codecs/src/linearinterpol.rs b/fastfield_codecs/src/linearinterpol.rs index a8ea95672..0911ec4df 100644 --- a/fastfield_codecs/src/linearinterpol.rs +++ b/fastfield_codecs/src/linearinterpol.rs @@ -58,21 +58,7 @@ impl FixedSize for LinearInterpolFooter { } impl FastFieldCodecReader for LinearInterpolFastFieldReader { - /// Opens a fast field given a file. 
- fn open_from_bytes(bytes: OwnedBytes) -> io::Result { - let footer_offset = bytes.len() - LinearInterpolFooter::SIZE_IN_BYTES; - let (data, mut footer) = bytes.split(footer_offset); - let footer = LinearInterpolFooter::deserialize(&mut footer)?; - let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals); - let num_bits = compute_num_bits(footer.relative_max_value); - let bit_unpacker = BitUnpacker::new(num_bits); - Ok(LinearInterpolFastFieldReader { - data, - bit_unpacker, - footer, - slope, - }) - } + #[inline] fn get_u64(&self, doc: u64) -> u64 { let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope); @@ -110,7 +96,25 @@ fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 { impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer { const NAME: &'static str = "LinearInterpol"; - const ID: u8 = 2; + + type Reader = LinearInterpolFastFieldReader; + + /// Opens a fast field given a file. + fn open_from_bytes(bytes: OwnedBytes) -> io::Result { + let footer_offset = bytes.len() - LinearInterpolFooter::SIZE_IN_BYTES; + let (data, mut footer) = bytes.split(footer_offset); + let footer = LinearInterpolFooter::deserialize(&mut footer)?; + let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals); + let num_bits = compute_num_bits(footer.relative_max_value); + let bit_unpacker = BitUnpacker::new(num_bits); + Ok(LinearInterpolFastFieldReader { + data, + bit_unpacker, + footer, + slope, + }) + } + /// Creates a new fast field serializer. 
fn serialize( write: &mut impl Write, @@ -240,7 +244,6 @@ mod tests { fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { crate::tests::create_and_validate::< LinearInterpolFastFieldSerializer, - LinearInterpolFastFieldReader, >(data, name) } diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs index 18fef5c60..361a81379 100644 --- a/fastfield_codecs/src/main.rs +++ b/fastfield_codecs/src/main.rs @@ -1,7 +1,7 @@ #[macro_use] extern crate prettytable; -use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer; -use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer; +// use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer; +// use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer; use fastfield_codecs::{FastFieldCodecSerializer, FastFieldStats}; use prettytable::{Cell, Row, Table}; @@ -12,11 +12,11 @@ fn main() { table.add_row(row!["", "Compression Ratio", "Compression Estimation"]); for (data, data_set_name) in get_codec_test_data_sets() { - let mut results = vec![]; - let res = serialize_with_codec::(&data); - results.push(res); - let res = serialize_with_codec::(&data); - results.push(res); + let mut results = Vec::new(); + // let res = serialize_with_codec::(&data); + // results.push(res); + // let res = serialize_with_codec::(&data); + // results.push(res); let res = serialize_with_codec::( &data, ); diff --git a/fastfield_codecs/src/multilinearinterpol.rs b/fastfield_codecs/src/multilinearinterpol.rs index 26b7c9e88..31b08040d 100644 --- a/fastfield_codecs/src/multilinearinterpol.rs +++ b/fastfield_codecs/src/multilinearinterpol.rs @@ -146,15 +146,6 @@ fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Functio } impl FastFieldCodecReader for MultiLinearInterpolFastFieldReader { - /// Opens a fast field given a file. 
- fn open_from_bytes(bytes: OwnedBytes) -> io::Result { - let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?; - let footer_offset = bytes.len() - 4 - footer_len as usize; - let (data, mut footer) = bytes.split(footer_offset); - let footer = MultiLinearInterpolFooter::deserialize(&mut footer)?; - Ok(MultiLinearInterpolFastFieldReader { data, footer }) - } - #[inline] fn get_u64(&self, doc: u64) -> u64 { let interpolation = get_interpolation_function(doc, &self.footer.interpolations); @@ -192,7 +183,18 @@ pub struct MultiLinearInterpolFastFieldSerializer {} impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer { const NAME: &'static str = "MultiLinearInterpol"; - const ID: u8 = 3; + + type Reader = MultiLinearInterpolFastFieldReader; + + /// Opens a fast field given a file. + fn open_from_bytes(bytes: OwnedBytes) -> io::Result { + let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?; + let footer_offset = bytes.len() - 4 - footer_len as usize; + let (data, mut footer) = bytes.split(footer_offset); + let footer = MultiLinearInterpolFooter::deserialize(&mut footer)?; + Ok(MultiLinearInterpolFastFieldReader { data, footer }) + } + /// Creates a new fast field serializer. 
fn serialize( write: &mut impl Write, @@ -374,7 +376,6 @@ mod tests { fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { crate::tests::create_and_validate::< MultiLinearInterpolFastFieldSerializer, - MultiLinearInterpolFastFieldReader, >(data, name) } diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 6f28be0aa..3addc19ca 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -5,7 +5,7 @@ use std::path::Path; use fastfield_codecs::bitpacked::{ BitpackedFastFieldReader as BitpackedReader, BitpackedFastFieldSerializer, }; -use fastfield_codecs::gcd::{GCDFastFieldCodec, GCD_CODEC_ID}; +use fastfield_codecs::gcd::{GCDFastFieldCodecReader, GCD_CODEC_ID}; use fastfield_codecs::linearinterpol::{ LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer, }; @@ -73,14 +73,14 @@ pub enum DynamicFastFieldReader { MultiLinearInterpol(FastFieldReaderCodecWrapper), /// GCD and Bitpacked compressed fastfield data. - BitpackedGCD(FastFieldReaderCodecWrapper>), + BitpackedGCD(FastFieldReaderCodecWrapper>), /// GCD and Linear interpolated values + bitpacked LinearInterpolGCD( - FastFieldReaderCodecWrapper>, + FastFieldReaderCodecWrapper>, ), /// GCD and Blockwise linear interpolated values + bitpacked MultiLinearInterpolGCD( - FastFieldReaderCodecWrapper>, + FastFieldReaderCodecWrapper>, ), } @@ -118,7 +118,7 @@ impl DynamicFastFieldReader { BitpackedFastFieldSerializer::ID => { DynamicFastFieldReader::BitpackedGCD(FastFieldReaderCodecWrapper::< Item, - GCDFastFieldCodec, + GCDFastFieldCodecReader, >::open_from_bytes( bytes )?) @@ -126,7 +126,7 @@ impl DynamicFastFieldReader { LinearInterpolFastFieldSerializer::ID => { DynamicFastFieldReader::LinearInterpolGCD(FastFieldReaderCodecWrapper::< Item, - GCDFastFieldCodec, + GCDFastFieldCodecReader, >::open_from_bytes( bytes )?) 
@@ -135,7 +135,7 @@ impl DynamicFastFieldReader { DynamicFastFieldReader::MultiLinearInterpolGCD( FastFieldReaderCodecWrapper::< Item, - GCDFastFieldCodec, + GCDFastFieldCodecReader, >::open_from_bytes(bytes)?, ) } diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs index a4626250b..ec5181e50 100644 --- a/src/fastfield/serializer/mod.rs +++ b/src/fastfield/serializer/mod.rs @@ -1,4 +1,5 @@ use std::io::{self, Write}; +use std::num::NonZeroU64; use common::{BinarySerializable, CountingWriter}; pub use fastfield_codecs::bitpacked::{ @@ -141,7 +142,8 @@ impl CompositeFastFieldSerializer { let field_write = self.composite_write.for_field_with_idx(field, idx); let gcd = find_gcd(iter_gen().map(|val| val - stats.min_value)).unwrap_or(GCD_DEFAULT); - if gcd <= 1 { + if gcd == 1 { + // No GCD opportunity here. return Self::create_auto_detect_u64_fast_field_with_idx_gcd( self.codec_enable_checker.clone(), field, @@ -157,7 +159,7 @@ impl CompositeFastFieldSerializer { struct GCDWrappedFFAccess { fastfield_accessor: T, min_value: u64, - gcd: u64, + gcd: NonZeroU64, } impl FastFieldDataAccess for GCDWrappedFFAccess { fn get_val(&self, position: u64) -> u64 { diff --git a/src/fastfield/wrapper.rs b/src/fastfield/wrapper.rs new file mode 100644 index 000000000..125dd6618 --- /dev/null +++ b/src/fastfield/wrapper.rs @@ -0,0 +1,117 @@ +// Copyright (C) 2022 Quickwit, Inc. +// +// Quickwit is offered under the AGPL v3.0 and as commercial software. +// For commercial licensing, contact us at hello@quickwit.io. +// +// AGPL: +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. 
+// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. +// + +/// Wrapper for accessing a fastfield. +/// +/// Holds the data and the codec to read the data. +#[derive(Clone)] +pub struct FastFieldReaderCodecWrapper { + reader: CodecReader, + _phantom: PhantomData, +} + +impl FastFieldReader + for FastFieldReaderCodecWrapper +{ + /// Return the value associated with the given document. + /// + /// This accessor should return as fast as possible. + /// + /// # Panics + /// + /// May panic if `doc` is greater than the segment + /// `maxdoc`. + fn get(&self, doc: DocId) -> Item { + self.get_u64(u64::from(doc)) + } + + /// Fills an output buffer with the fast field values + /// associated with the `DocId` going from + /// `start` to `start + output.len()`. + /// + /// Regardless of the type of `Item`, this method works + /// - transmuting the output array + /// - extracting the `Item`s as if they were `u64` + /// - possibly converting the `u64` value to the right type. + /// + /// # Panics + /// + /// May panic if `start + output.len()` is greater than + /// the segment's `maxdoc`. + fn get_range(&self, start: u64, output: &mut [Item]) { + self.get_range_u64(start, output); + } + + /// Returns the minimum value for this fast field. + /// + /// The min value does not take into account possible + /// deleted documents, and should be considered as a lower bound + /// of the actual minimum value. + fn min_value(&self) -> Item { + Item::from_u64(self.reader.min_value()) + } + + /// Returns the maximum value for this fast field. 
+ /// + /// The max value does not take into account possible + /// deleted documents, and should be considered as an upper bound + /// of the actual maximum value. + fn max_value(&self) -> Item { + Item::from_u64(self.reader.max_value()) + } +} + +impl From> for DynamicFastFieldReader { + fn from(vals: Vec) -> DynamicFastFieldReader { + let mut schema_builder = Schema::builder(); + let field = schema_builder.add_u64_field("field", FAST); + let schema = schema_builder.build(); + let path = Path::new("__dummy__"); + let directory: RamDirectory = RamDirectory::create(); + { + let write: WritePtr = directory + .open_write(path) + .expect("With a RamDirectory, this should never fail."); + let mut serializer = CompositeFastFieldSerializer::from_write(write) + .expect("With a RamDirectory, this should never fail."); + let mut fast_field_writers = FastFieldsWriter::from_schema(&schema); + { + let fast_field_writer = fast_field_writers + .get_field_writer_mut(field) + .expect("With a RamDirectory, this should never fail."); + for val in vals { + fast_field_writer.add_val(val.to_u64()); + } + } + fast_field_writers + .serialize(&mut serializer, &HashMap::new(), None) + .unwrap(); + serializer.close().unwrap(); + } + + let file = directory.open_read(path).expect("Failed to open the file"); + let composite_file = CompositeFile::open(&file).expect("Failed to read the composite file"); + let field_file = composite_file + .open_read(field) + .expect("File component not found"); + DynamicFastFieldReader::open(field_file).unwrap() + } +} +