From e6acf8f76d0267aa107018a025605e54311f2c36 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 10 Nov 2022 20:52:04 +0800 Subject: [PATCH 1/2] add header with codec type for u128 --- fastfield_codecs/src/compact_space/mod.rs | 10 ++++-- fastfield_codecs/src/lib.rs | 41 +++++++++++++++++++++-- fastfield_codecs/src/serialize.rs | 31 +++++++++++++++-- 3 files changed, 75 insertions(+), 7 deletions(-) diff --git a/fastfield_codecs/src/compact_space/mod.rs b/fastfield_codecs/src/compact_space/mod.rs index c6e2da7ea..de211a5ba 100644 --- a/fastfield_codecs/src/compact_space/mod.rs +++ b/fastfield_codecs/src/compact_space/mod.rs @@ -456,6 +456,7 @@ impl CompactSpaceDecompressor { mod tests { use super::*; + use crate::serialize::U128Header; use crate::{open_u128, serialize_u128}; #[test] @@ -501,7 +502,8 @@ mod tests { assert_eq!(amplitude, 2); } - fn test_all(data: OwnedBytes, expected: &[u128]) { + fn test_all(mut data: OwnedBytes, expected: &[u128]) { + let _header = U128Header::deserialize(&mut data); let decompressor = CompactSpaceDecompressor::open(data).unwrap(); for (idx, expected_val) in expected.iter().cloned().enumerate() { let val = decompressor.get(idx as u32); @@ -556,7 +558,8 @@ mod tests { 4_000_211_222u128, 333u128, ]; - let data = test_aux_vals(vals); + let mut data = test_aux_vals(vals); + let _header = U128Header::deserialize(&mut data); let decomp = CompactSpaceDecompressor::open(data).unwrap(); let complete_range = 0..vals.len() as u32; for (pos, val) in vals.iter().enumerate() { @@ -681,7 +684,8 @@ mod tests { 4_000_211_222u128, 333u128, ]; - let data = test_aux_vals(vals); + let mut data = test_aux_vals(vals); + let _header = U128Header::deserialize(&mut data); let decomp = CompactSpaceDecompressor::open(data).unwrap(); let complete_range = 0..vals.len() as u32; assert_eq!( diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index ef8bd6e0a..b0acc99f4 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -25,7 +25,7 @@ use monotonic_mapping::{ StrictlyMonotonicMappingToInternalBaseval, StrictlyMonotonicMappingToInternalGCDBaseval, }; use ownedbytes::OwnedBytes; -use serialize::Header; +use serialize::{Header, U128Header}; mod bitpacked; mod blockwise_linear; @@ -92,10 +92,47 @@ impl FastFieldCodecType { } } +#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)] +#[repr(u8)] +/// Available codecs to use to encode the u128 (via [`MonotonicallyMappableToU128`]) converted data. +pub enum U128FastFieldCodecType { + /// This codec takes a large number space (u128) and reduces it to a compact number space, by + /// removing the holes. + CompactSpace = 1, +} + +impl BinarySerializable for U128FastFieldCodecType { + fn serialize(&self, wrt: &mut W) -> io::Result<()> { + self.to_code().serialize(wrt) + } + + fn deserialize(reader: &mut R) -> io::Result { + let code = u8::deserialize(reader)?; + let codec_type: Self = Self::from_code(code) + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code `{code}.`"))?; + Ok(codec_type) + } +} + +impl U128FastFieldCodecType { + pub(crate) fn to_code(self) -> u8 { + self as u8 + } + + pub(crate) fn from_code(code: u8) -> Option { + match code { + 1 => Some(Self::CompactSpace), + _ => None, + } + } +} + /// Returns the correct codec reader wrapped in the `Arc` for the data. pub fn open_u128( - bytes: OwnedBytes, + mut bytes: OwnedBytes, ) -> io::Result>> { + let header = U128Header::deserialize(&mut bytes)?; + assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace); let reader = CompactSpaceDecompressor::open(bytes)?; let inverted: StrictlyMonotonicMappingInverter> = StrictlyMonotonicMappingToInternal::::new().into(); diff --git a/fastfield_codecs/src/serialize.rs b/fastfield_codecs/src/serialize.rs index 1f3041403..94b0cc864 100644 --- a/fastfield_codecs/src/serialize.rs +++ b/fastfield_codecs/src/serialize.rs @@ -35,7 +35,7 @@ use crate::monotonic_mapping::{ }; use crate::{ monotonic_map_column, Column, FastFieldCodec, FastFieldCodecType, MonotonicallyMappableToU64, - VecColumn, ALL_CODEC_TYPES, + U128FastFieldCodecType, VecColumn, ALL_CODEC_TYPES, }; /// The normalized header gives some parameters after applying the following @@ -98,6 +98,29 @@ impl Header { } } +#[derive(Debug, Copy, Clone)] +pub(crate) struct U128Header { + pub num_vals: u32, + pub codec_type: U128FastFieldCodecType, +} + +impl BinarySerializable for U128Header { + fn serialize(&self, writer: &mut W) -> io::Result<()> { + VInt(self.num_vals as u64).serialize(writer)?; + self.codec_type.serialize(writer)?; + Ok(()) + } + + fn deserialize(reader: &mut R) -> io::Result { + let num_vals = VInt::deserialize(reader)?.0 as u32; + let codec_type = U128FastFieldCodecType::deserialize(reader)?; + Ok(U128Header { + num_vals, + codec_type, + }) + } +} + pub fn normalize_column( from_column: C, min_value: u64, @@ -167,7 +190,11 @@ pub fn serialize_u128 I, I: Iterator>( num_vals: u32, output: &mut impl io::Write, ) -> io::Result<()> { - // TODO write header, to later support more codecs + let header = U128Header { + num_vals, + codec_type: U128FastFieldCodecType::CompactSpace, + }; + header.serialize(output)?; let compressor = CompactSpaceCompressor::train_from(iter_gen(), num_vals); compressor.compress_into(iter_gen(), output).unwrap(); From 32166682b35a12d77e86deda974fab18619677b5 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 11 Nov 2022 13:28:12 +0800 Subject: [PATCH 2/2] add header deser test --- fastfield_codecs/src/serialize.rs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fastfield_codecs/src/serialize.rs b/fastfield_codecs/src/serialize.rs index 94b0cc864..b0f9e15da 100644 --- a/fastfield_codecs/src/serialize.rs +++ b/fastfield_codecs/src/serialize.rs @@ -98,7 +98,7 @@ impl Header { } } -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub(crate) struct U128Header { pub num_vals: u32, pub codec_type: U128FastFieldCodecType, @@ -285,6 +285,18 @@ pub fn serialize_and_load( mod tests { use super::*; + #[test] + fn test_serialize_deserialize_u128_header() { + let original = U128Header { + num_vals: 11, + codec_type: U128FastFieldCodecType::CompactSpace, + }; + let mut out = Vec::new(); + original.serialize(&mut out).unwrap(); + let restored = U128Header::deserialize(&mut &out[..]).unwrap(); + assert_eq!(restored, original); + } + #[test] fn test_serialize_deserialize() { let original = [1u64, 5u64, 10u64];