diff --git a/fastfield_codecs/src/compact_space.rs b/fastfield_codecs/src/compact_space.rs index 38f39fa1a..63479494b 100644 --- a/fastfield_codecs/src/compact_space.rs +++ b/fastfield_codecs/src/compact_space.rs @@ -36,6 +36,8 @@ pub fn ip_to_u128(ip_addr: IpAddr) -> u128 { u128::from_be_bytes(ip_addr_v6.octets()) } +const NULL_VALUE_COMPACT_SPACE: u64 = 0; + /// The cost per blank is quite hard actually, since blanks are delta encoded, the actual cost of /// blanks depends on the number of blanks. /// @@ -45,7 +47,6 @@ const COST_PER_BLANK_IN_BITS: usize = 36; #[derive(Debug, Clone, Eq, PartialEq)] pub struct CompactSpace { ranges_mapping: Vec, - pub null_value: u128, } /// Maps the range from the original space to compact_start + range.len() @@ -58,11 +59,15 @@ impl RangeMapping { fn range_length(&self) -> u64 { (self.value_range.end() - self.value_range.start()) as u64 + 1 } + + // The last value of the compact space in this range + fn compact_end(&self) -> u64 { + self.compact_start + self.range_length() - 1 + } } impl BinarySerializable for CompactSpace { fn serialize(&self, writer: &mut W) -> io::Result<()> { - VIntU128(self.null_value).serialize(writer)?; VInt(self.ranges_mapping.len() as u64).serialize(writer)?; let mut prev_value = 0; @@ -84,12 +89,11 @@ impl BinarySerializable for CompactSpace { } fn deserialize(reader: &mut R) -> io::Result { - let null_value = VIntU128::deserialize(reader)?.0; - let num_values = VInt::deserialize(reader)?.0; + let num_ranges = VInt::deserialize(reader)?.0; let mut ranges_mapping: Vec = vec![]; let mut value = 0u128; - let mut compact_start = 0u64; - for _ in 0..num_values { + let mut compact_start = 1u64; // 0 is reserved for `null` + for _ in 0..num_ranges { let blank_delta_start = VIntU128::deserialize(reader)?.0; value += blank_delta_start; let blank_start = value; @@ -102,15 +106,12 @@ impl BinarySerializable for CompactSpace { value_range: blank_start..=blank_end, compact_start, }; - let compact_delta = range_mapping.range_length(); + let range_length = range_mapping.range_length(); ranges_mapping.push(range_mapping); - compact_start += compact_delta as u64; + compact_start += range_length as u64; } - Ok(Self { - null_value, - ranges_mapping, - }) + Ok(Self { ranges_mapping }) } } @@ -120,8 +121,10 @@ impl CompactSpace { /// /// It's only used to verify we don't exceed u64 number space, which would indicate a bug. fn amplitude_compact_space(&self) -> u128 { - let last_range = &self.ranges_mapping[self.ranges_mapping.len() - 1]; - last_range.compact_start as u128 + last_range.range_length() as u128 + self.ranges_mapping + .last() + .map(|last_range| last_range.compact_end() as u128 + 1) + .unwrap_or(1) // compact space starts at 1, 0 == null } fn get_range_mapping(&self, pos: usize) -> &RangeMapping { @@ -171,7 +174,6 @@ pub struct CompactSpaceCompressor { pub struct IPCodecParams { compact_space: CompactSpace, bit_unpacker: BitUnpacker, - null_value_compact_space: u64, min_value: u128, max_value: u128, num_vals: u64, @@ -179,10 +181,6 @@ pub struct IPCodecParams { } impl CompactSpaceCompressor { - pub fn null_value_compact_space(&self) -> u64 { - self.params.null_value_compact_space - } - /// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals. pub fn train_from( vals: impl Iterator, @@ -225,7 +223,7 @@ impl CompactSpaceCompressor { ) })? } else { - self.null_value_compact_space() + NULL_VALUE_COMPACT_SPACE }; bitpacker.write(compact, self.params.num_bits, write)?; } @@ -237,9 +235,6 @@ impl CompactSpaceCompressor { fn train(values_sorted: &BTreeSet, total_num_values: usize) -> CompactSpaceCompressor { let compact_space = get_compact_space(values_sorted, total_num_values, COST_PER_BLANK_IN_BITS); - let null_compact_space = compact_space - .to_compact(compact_space.null_value) - .expect("could not convert null_value to compact space"); let amplitude_compact_space = compact_space.amplitude_compact_space(); assert!( @@ -250,10 +245,9 @@ fn train(values_sorted: &BTreeSet, total_num_values: usize) -> CompactSpac let num_bits = tantivy_bitpacker::compute_num_bits(amplitude_compact_space as u64); let min_value = *values_sorted.iter().next().unwrap_or(&0); let max_value = *values_sorted.iter().last().unwrap_or(&0); - let max_value_in_value_space = max_value.max(compact_space.null_value); assert!( compact_space - .to_compact(max_value_in_value_space) + .to_compact(max_value) .expect("could not convert max value to compact space") < amplitude_compact_space as u64 ); @@ -261,7 +255,6 @@ fn train(values_sorted: &BTreeSet, total_num_values: usize) -> CompactSpac params: IPCodecParams { compact_space, bit_unpacker: BitUnpacker::new(num_bits), - null_value_compact_space: null_compact_space, min_value, max_value, num_vals: total_num_values as u64, @@ -282,7 +275,6 @@ impl BinarySerializable for IPCodecParams { let footer_flags = 0u64; footer_flags.serialize(writer)?; - VIntU128(self.null_value_compact_space as u128).serialize(writer)?; VIntU128(self.min_value).serialize(writer)?; VIntU128(self.max_value).serialize(writer)?; VIntU128(self.num_vals as u128).serialize(writer)?; @@ -295,7 +287,6 @@ impl BinarySerializable for IPCodecParams { fn deserialize(reader: &mut R) -> io::Result { let _header_flags = u64::deserialize(reader)?; - let null_value_compact_space = VIntU128::deserialize(reader)?.0 as u64; let min_value = VIntU128::deserialize(reader)?.0; let max_value = VIntU128::deserialize(reader)?.0; let num_vals = VIntU128::deserialize(reader)?.0 as u64; @@ -305,7 +296,6 @@ impl BinarySerializable for IPCodecParams { Ok(Self { compact_space, bit_unpacker: BitUnpacker::new(num_bits), - null_value_compact_space, min_value, max_value, num_vals, @@ -406,7 +396,7 @@ impl CompactSpaceDecompressor { // Get end of previous range let pos = pos - 1; let range_mapping = self.params.compact_space.get_range_mapping(pos); - range_mapping.compact_start + range_mapping.range_length() + range_mapping.compact_end() }); let range = compact_from..=compact_to; @@ -415,8 +405,8 @@ impl CompactSpaceDecompressor { let step_size = 4; let cutoff = self.params.num_vals - self.params.num_vals % step_size; - let mut check_add = |idx, val| { - if range.contains(&val) && val != self.params.null_value_compact_space { + let mut add_if_in_range = |idx, val| { + if range.contains(&val) { positions.push(idx); } }; @@ -431,15 +421,15 @@ impl CompactSpaceDecompressor { let val2 = get_val(idx2); let val3 = get_val(idx3); let val4 = get_val(idx4); - check_add(idx1, val1); - check_add(idx2, val2); - check_add(idx3, val3); - check_add(idx4, val4); + add_if_in_range(idx1, val1); + add_if_in_range(idx2, val2); + add_if_in_range(idx3, val3); + add_if_in_range(idx4, val4); } // handle rest for idx in cutoff..self.params.num_vals { - check_add(idx, get_val(idx)); + add_if_in_range(idx, get_val(idx)); } positions @@ -456,7 +446,7 @@ impl CompactSpaceDecompressor { // TODO: Performance. It would be better to iterate on the ranges and check existence via // the bit_unpacker. self.iter_compact().map(|compact| { - if compact == self.params.null_value_compact_space { + if compact == NULL_VALUE_COMPACT_SPACE { None } else { Some(self.compact_to_u128(compact)) @@ -467,7 +457,7 @@ impl CompactSpaceDecompressor { #[inline] pub fn get(&self, idx: u64) -> Option { let compact = self.params.bit_unpacker.get(idx, &self.data); - if compact == self.params.null_value_compact_space { + if compact == NULL_VALUE_COMPACT_SPACE { None } else { Some(self.compact_to_u128(compact)) @@ -496,13 +486,19 @@ mod tests { .into_iter() .collect(); let compact_space = get_compact_space(ips, ips.len(), 11); - assert_eq!(compact_space.null_value, 5); let amplitude = compact_space.amplitude_compact_space(); assert_eq!(amplitude, 20); - assert_eq!(2, compact_space.to_compact(2).unwrap()); - assert_eq!(3, compact_space.to_compact(3).unwrap()); + assert_eq!(3, compact_space.to_compact(2).unwrap()); + assert_eq!(4, compact_space.to_compact(3).unwrap()); assert_eq!(compact_space.to_compact(100).unwrap_err(), 1); + for (num1, num2) in (0..3).tuple_windows() { + assert_eq!( + compact_space.get_range_mapping(num1).compact_end() + 1, + compact_space.get_range_mapping(num2).compact_start + ); + } + let mut output: Vec = Vec::new(); compact_space.serialize(&mut output).unwrap(); @@ -521,39 +517,50 @@ mod tests { fn compact_space_amplitude_test() { let ips = &[100000u128, 1000000].into_iter().collect(); let compact_space = get_compact_space(ips, ips.len(), 1); - assert_eq!(compact_space.null_value, 100001); let amplitude = compact_space.amplitude_compact_space(); assert_eq!(amplitude, 3); } - fn test_all(data: OwnedBytes, expected: &[u128]) { + fn test_all(data: OwnedBytes, expected: &[Option]) { let decompressor = CompactSpaceDecompressor::open(data).unwrap(); for (idx, expected_val) in expected.iter().cloned().enumerate() { let val = decompressor.get(idx as u64); - assert_eq!(val, Some(expected_val)); - let positions = decompressor.get_range(expected_val.saturating_sub(1)..=expected_val); - assert!(positions.contains(&(idx as u64))); - let positions = decompressor.get_range(expected_val..=expected_val); - assert!(positions.contains(&(idx as u64))); - let positions = decompressor.get_range(expected_val..=expected_val.saturating_add(1)); - assert!(positions.contains(&(idx as u64))); - let positions = decompressor - .get_range(expected_val.saturating_sub(1)..=expected_val.saturating_add(1)); - assert!(positions.contains(&(idx as u64))); + assert_eq!(val, expected_val); + + if let Some(expected_val) = expected_val { + let test_range = |range: RangeInclusive| { + let expected_positions = expected + .iter() + .positions(|val| val.map(|val| range.contains(&val)).unwrap_or(false)) + .map(|pos| pos as u64) + .collect::>(); + let positions = decompressor.get_range(range); + assert_eq!(positions, expected_positions); + }; + + test_range(expected_val.saturating_sub(1)..=expected_val); + test_range(expected_val..=expected_val); + test_range(expected_val..=expected_val.saturating_add(1)); + test_range(expected_val.saturating_sub(1)..=expected_val.saturating_add(1)); + } } } - fn test_aux_vals(u128_vals: &[u128]) -> OwnedBytes { - let compressor = - CompactSpaceCompressor::train_from(u128_vals.iter().cloned(), u128_vals.len()); - let data = compressor - .compress(u128_vals.iter().cloned().map(Some)) - .unwrap(); + fn test_aux_vals_opt(u128_vals: &[Option]) -> OwnedBytes { + let compressor = CompactSpaceCompressor::train_from( + u128_vals.iter().cloned().flatten(), + u128_vals.len(), + ); + let data = compressor.compress(u128_vals.iter().cloned()).unwrap(); let data = OwnedBytes::new(data); test_all(data.clone(), u128_vals); data } + fn test_aux_vals(u128_vals: &[u128]) -> OwnedBytes { + test_aux_vals_opt(&u128_vals.iter().cloned().map(Some).collect::>()) + } + #[test] fn test_range_1() { let vals = &[ @@ -694,6 +701,7 @@ mod tests { let vals = &[1_000_000_000u128; 100]; let _data = test_aux_vals(vals); } + use itertools::Itertools; use proptest::prelude::*; fn num_strategy() -> impl Strategy { diff --git a/fastfield_codecs/src/compact_space/blank_range.rs b/fastfield_codecs/src/compact_space/blank_range.rs index 11a9c7eda..a1f265f00 100644 --- a/fastfield_codecs/src/compact_space/blank_range.rs +++ b/fastfield_codecs/src/compact_space/blank_range.rs @@ -38,6 +38,6 @@ impl Ord for BlankRange { } impl PartialOrd for BlankRange { fn partial_cmp(&self, other: &Self) -> Option { - Some(self.blank_size().cmp(&other.blank_size)) + Some(self.blank_size().cmp(&other.blank_size())) } } diff --git a/fastfield_codecs/src/compact_space/build_compact_space.rs b/fastfield_codecs/src/compact_space/build_compact_space.rs index 238408ac5..b72ea2243 100644 --- a/fastfield_codecs/src/compact_space/build_compact_space.rs +++ b/fastfield_codecs/src/compact_space/build_compact_space.rs @@ -4,7 +4,7 @@ use std::ops::RangeInclusive; use itertools::Itertools; use super::blank_range::BlankRange; -use super::CompactSpace; +use super::{CompactSpace, RangeMapping}; /// Put the blanks for the sorted values into a binary heap fn get_blanks(values_sorted: &BTreeSet) -> BinaryHeap { @@ -78,15 +78,15 @@ pub fn get_compact_space( total_num_values: usize, cost_per_blank: usize, ) -> CompactSpace { - let mut blanks: BinaryHeap = get_blanks(values_deduped_sorted); - let mut amplitude_compact_space = u128::MAX; - let mut amplitude_bits: u8 = num_bits(amplitude_compact_space); - let mut compact_space = CompactSpaceBuilder::new(); if values_deduped_sorted.is_empty() { return compact_space.finish(); } + let mut blanks: BinaryHeap = get_blanks(values_deduped_sorted); + let mut amplitude_compact_space = u128::MAX; + let mut amplitude_bits: u8 = num_bits(amplitude_compact_space); + let mut blank_collector = BlankCollector::new(); // We will stage blanks until they reduce the compact space by 1 bit. // Binary heap to process the gaps by their size @@ -166,49 +166,37 @@ impl CompactSpaceBuilder { // sort by start. ranges are not allowed to overlap self.blanks.sort_unstable_by_key(|blank| *blank.start()); - // Between the blanks - let mut covered_space = self - .blanks - .iter() - .tuple_windows() - .map(|(left, right)| { - assert!( - left.end() < right.start(), - "overlapping or adjacent ranges detected" - ); - *left.end() + 1..=*right.start() - 1 - }) - .collect::>(); + let mut covered_space = Vec::with_capacity(self.blanks.len()); - // Outside the blanks + // begining of the blanks if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start) { if *first_blank_start != 0 { - covered_space.insert(0, 0..=first_blank_start - 1); + covered_space.push(0..=first_blank_start - 1); } } + // Between the blanks + let between_blanks = self.blanks.iter().tuple_windows().map(|(left, right)| { + assert!( + left.end() < right.start(), + "overlapping or adjacent ranges detected" + ); + *left.end() + 1..=*right.start() - 1 + }); + covered_space.extend(between_blanks); + + // end of the blanks if let Some(last_blank_end) = self.blanks.last().map(RangeInclusive::end) { if *last_blank_end != u128::MAX { covered_space.push(last_blank_end + 1..=u128::MAX); } } - // Extend the first range and assign the null value to it. - let null_value = if let Some(first_covered_space) = covered_space.first_mut() { - // in case the first covered space ends at u128::MAX, assign null to the beginning - if *first_covered_space.end() == u128::MAX { - *first_covered_space = first_covered_space.start() - 1..=*first_covered_space.end(); - *first_covered_space.start() - } else { - *first_covered_space = *first_covered_space.start()..=first_covered_space.end() + 1; - *first_covered_space.end() - } - } else { + if covered_space.is_empty() { covered_space.push(0..=0); // empty data case - 0u128 }; - let mut compact_start: u64 = 0; + let mut compact_start: u64 = 1; // 0 is reserved for `null` let mut ranges_mapping: Vec = Vec::with_capacity(covered_space.len()); for cov in covered_space { let range_mapping = super::RangeMapping { @@ -219,10 +207,7 @@ impl CompactSpaceBuilder { ranges_mapping.push(range_mapping); compact_start += covered_range_len as u64; } - CompactSpace { - ranges_mapping, - null_value, - } + CompactSpace { ranges_mapping } } }