use 0 as null in compact space

This commit is contained in:
Pascal Seitz
2022-09-13 15:43:14 +08:00
parent 58af1235e4
commit 61b5110db7
3 changed files with 91 additions and 98 deletions

View File

@@ -36,6 +36,8 @@ pub fn ip_to_u128(ip_addr: IpAddr) -> u128 {
u128::from_be_bytes(ip_addr_v6.octets())
}
const NULL_VALUE_COMPACT_SPACE: u64 = 0;
/// The cost per blank is quite hard actually, since blanks are delta encoded, the actual cost of
/// blanks depends on the number of blanks.
///
@@ -45,7 +47,6 @@ const COST_PER_BLANK_IN_BITS: usize = 36;
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct CompactSpace {
ranges_mapping: Vec<RangeMapping>,
pub null_value: u128,
}
/// Maps the range from the original space to compact_start + range.len()
@@ -58,11 +59,15 @@ impl RangeMapping {
fn range_length(&self) -> u64 {
(self.value_range.end() - self.value_range.start()) as u64 + 1
}
// The last value of the compact space in this range
fn compact_end(&self) -> u64 {
self.compact_start + self.range_length() - 1
}
}
impl BinarySerializable for CompactSpace {
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
VIntU128(self.null_value).serialize(writer)?;
VInt(self.ranges_mapping.len() as u64).serialize(writer)?;
let mut prev_value = 0;
@@ -84,12 +89,11 @@ impl BinarySerializable for CompactSpace {
}
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let null_value = VIntU128::deserialize(reader)?.0;
let num_values = VInt::deserialize(reader)?.0;
let num_ranges = VInt::deserialize(reader)?.0;
let mut ranges_mapping: Vec<RangeMapping> = vec![];
let mut value = 0u128;
let mut compact_start = 0u64;
for _ in 0..num_values {
let mut compact_start = 1u64; // 0 is reserved for `null`
for _ in 0..num_ranges {
let blank_delta_start = VIntU128::deserialize(reader)?.0;
value += blank_delta_start;
let blank_start = value;
@@ -102,15 +106,12 @@ impl BinarySerializable for CompactSpace {
value_range: blank_start..=blank_end,
compact_start,
};
let compact_delta = range_mapping.range_length();
let range_length = range_mapping.range_length();
ranges_mapping.push(range_mapping);
compact_start += compact_delta as u64;
compact_start += range_length as u64;
}
Ok(Self {
null_value,
ranges_mapping,
})
Ok(Self { ranges_mapping })
}
}
@@ -120,8 +121,10 @@ impl CompactSpace {
///
/// It's only used to verify we don't exceed u64 number space, which would indicate a bug.
fn amplitude_compact_space(&self) -> u128 {
let last_range = &self.ranges_mapping[self.ranges_mapping.len() - 1];
last_range.compact_start as u128 + last_range.range_length() as u128
self.ranges_mapping
.last()
.map(|last_range| last_range.compact_end() as u128 + 1)
.unwrap_or(1) // compact space starts at 1, 0 == null
}
fn get_range_mapping(&self, pos: usize) -> &RangeMapping {
@@ -171,7 +174,6 @@ pub struct CompactSpaceCompressor {
pub struct IPCodecParams {
compact_space: CompactSpace,
bit_unpacker: BitUnpacker,
null_value_compact_space: u64,
min_value: u128,
max_value: u128,
num_vals: u64,
@@ -179,10 +181,6 @@ pub struct IPCodecParams {
}
impl CompactSpaceCompressor {
pub fn null_value_compact_space(&self) -> u64 {
self.params.null_value_compact_space
}
/// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals.
pub fn train_from(
vals: impl Iterator<Item = u128>,
@@ -225,7 +223,7 @@ impl CompactSpaceCompressor {
)
})?
} else {
self.null_value_compact_space()
NULL_VALUE_COMPACT_SPACE
};
bitpacker.write(compact, self.params.num_bits, write)?;
}
@@ -237,9 +235,6 @@ impl CompactSpaceCompressor {
fn train(values_sorted: &BTreeSet<u128>, total_num_values: usize) -> CompactSpaceCompressor {
let compact_space = get_compact_space(values_sorted, total_num_values, COST_PER_BLANK_IN_BITS);
let null_compact_space = compact_space
.to_compact(compact_space.null_value)
.expect("could not convert null_value to compact space");
let amplitude_compact_space = compact_space.amplitude_compact_space();
assert!(
@@ -250,10 +245,9 @@ fn train(values_sorted: &BTreeSet<u128>, total_num_values: usize) -> CompactSpac
let num_bits = tantivy_bitpacker::compute_num_bits(amplitude_compact_space as u64);
let min_value = *values_sorted.iter().next().unwrap_or(&0);
let max_value = *values_sorted.iter().last().unwrap_or(&0);
let max_value_in_value_space = max_value.max(compact_space.null_value);
assert!(
compact_space
.to_compact(max_value_in_value_space)
.to_compact(max_value)
.expect("could not convert max value to compact space")
< amplitude_compact_space as u64
);
@@ -261,7 +255,6 @@ fn train(values_sorted: &BTreeSet<u128>, total_num_values: usize) -> CompactSpac
params: IPCodecParams {
compact_space,
bit_unpacker: BitUnpacker::new(num_bits),
null_value_compact_space: null_compact_space,
min_value,
max_value,
num_vals: total_num_values as u64,
@@ -282,7 +275,6 @@ impl BinarySerializable for IPCodecParams {
let footer_flags = 0u64;
footer_flags.serialize(writer)?;
VIntU128(self.null_value_compact_space as u128).serialize(writer)?;
VIntU128(self.min_value).serialize(writer)?;
VIntU128(self.max_value).serialize(writer)?;
VIntU128(self.num_vals as u128).serialize(writer)?;
@@ -295,7 +287,6 @@ impl BinarySerializable for IPCodecParams {
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let _header_flags = u64::deserialize(reader)?;
let null_value_compact_space = VIntU128::deserialize(reader)?.0 as u64;
let min_value = VIntU128::deserialize(reader)?.0;
let max_value = VIntU128::deserialize(reader)?.0;
let num_vals = VIntU128::deserialize(reader)?.0 as u64;
@@ -305,7 +296,6 @@ impl BinarySerializable for IPCodecParams {
Ok(Self {
compact_space,
bit_unpacker: BitUnpacker::new(num_bits),
null_value_compact_space,
min_value,
max_value,
num_vals,
@@ -406,7 +396,7 @@ impl CompactSpaceDecompressor {
// Get end of previous range
let pos = pos - 1;
let range_mapping = self.params.compact_space.get_range_mapping(pos);
range_mapping.compact_start + range_mapping.range_length()
range_mapping.compact_end()
});
let range = compact_from..=compact_to;
@@ -415,8 +405,8 @@ impl CompactSpaceDecompressor {
let step_size = 4;
let cutoff = self.params.num_vals - self.params.num_vals % step_size;
let mut check_add = |idx, val| {
if range.contains(&val) && val != self.params.null_value_compact_space {
let mut add_if_in_range = |idx, val| {
if range.contains(&val) {
positions.push(idx);
}
};
@@ -431,15 +421,15 @@ impl CompactSpaceDecompressor {
let val2 = get_val(idx2);
let val3 = get_val(idx3);
let val4 = get_val(idx4);
check_add(idx1, val1);
check_add(idx2, val2);
check_add(idx3, val3);
check_add(idx4, val4);
add_if_in_range(idx1, val1);
add_if_in_range(idx2, val2);
add_if_in_range(idx3, val3);
add_if_in_range(idx4, val4);
}
// handle rest
for idx in cutoff..self.params.num_vals {
check_add(idx, get_val(idx));
add_if_in_range(idx, get_val(idx));
}
positions
@@ -456,7 +446,7 @@ impl CompactSpaceDecompressor {
// TODO: Performance. It would be better to iterate on the ranges and check existence via
// the bit_unpacker.
self.iter_compact().map(|compact| {
if compact == self.params.null_value_compact_space {
if compact == NULL_VALUE_COMPACT_SPACE {
None
} else {
Some(self.compact_to_u128(compact))
@@ -467,7 +457,7 @@ impl CompactSpaceDecompressor {
#[inline]
pub fn get(&self, idx: u64) -> Option<u128> {
let compact = self.params.bit_unpacker.get(idx, &self.data);
if compact == self.params.null_value_compact_space {
if compact == NULL_VALUE_COMPACT_SPACE {
None
} else {
Some(self.compact_to_u128(compact))
@@ -496,13 +486,19 @@ mod tests {
.into_iter()
.collect();
let compact_space = get_compact_space(ips, ips.len(), 11);
assert_eq!(compact_space.null_value, 5);
let amplitude = compact_space.amplitude_compact_space();
assert_eq!(amplitude, 20);
assert_eq!(2, compact_space.to_compact(2).unwrap());
assert_eq!(3, compact_space.to_compact(3).unwrap());
assert_eq!(3, compact_space.to_compact(2).unwrap());
assert_eq!(4, compact_space.to_compact(3).unwrap());
assert_eq!(compact_space.to_compact(100).unwrap_err(), 1);
for (num1, num2) in (0..3).tuple_windows() {
assert_eq!(
compact_space.get_range_mapping(num1).compact_end() + 1,
compact_space.get_range_mapping(num2).compact_start
);
}
let mut output: Vec<u8> = Vec::new();
compact_space.serialize(&mut output).unwrap();
@@ -521,39 +517,50 @@ mod tests {
fn compact_space_amplitude_test() {
let ips = &[100000u128, 1000000].into_iter().collect();
let compact_space = get_compact_space(ips, ips.len(), 1);
assert_eq!(compact_space.null_value, 100001);
let amplitude = compact_space.amplitude_compact_space();
assert_eq!(amplitude, 3);
}
fn test_all(data: OwnedBytes, expected: &[u128]) {
fn test_all(data: OwnedBytes, expected: &[Option<u128>]) {
let decompressor = CompactSpaceDecompressor::open(data).unwrap();
for (idx, expected_val) in expected.iter().cloned().enumerate() {
let val = decompressor.get(idx as u64);
assert_eq!(val, Some(expected_val));
let positions = decompressor.get_range(expected_val.saturating_sub(1)..=expected_val);
assert!(positions.contains(&(idx as u64)));
let positions = decompressor.get_range(expected_val..=expected_val);
assert!(positions.contains(&(idx as u64)));
let positions = decompressor.get_range(expected_val..=expected_val.saturating_add(1));
assert!(positions.contains(&(idx as u64)));
let positions = decompressor
.get_range(expected_val.saturating_sub(1)..=expected_val.saturating_add(1));
assert!(positions.contains(&(idx as u64)));
assert_eq!(val, expected_val);
if let Some(expected_val) = expected_val {
let test_range = |range: RangeInclusive<u128>| {
let expected_positions = expected
.iter()
.positions(|val| val.map(|val| range.contains(&val)).unwrap_or(false))
.map(|pos| pos as u64)
.collect::<Vec<_>>();
let positions = decompressor.get_range(range);
assert_eq!(positions, expected_positions);
};
test_range(expected_val.saturating_sub(1)..=expected_val);
test_range(expected_val..=expected_val);
test_range(expected_val..=expected_val.saturating_add(1));
test_range(expected_val.saturating_sub(1)..=expected_val.saturating_add(1));
}
}
}
fn test_aux_vals(u128_vals: &[u128]) -> OwnedBytes {
let compressor =
CompactSpaceCompressor::train_from(u128_vals.iter().cloned(), u128_vals.len());
let data = compressor
.compress(u128_vals.iter().cloned().map(Some))
.unwrap();
fn test_aux_vals_opt(u128_vals: &[Option<u128>]) -> OwnedBytes {
let compressor = CompactSpaceCompressor::train_from(
u128_vals.iter().cloned().flatten(),
u128_vals.len(),
);
let data = compressor.compress(u128_vals.iter().cloned()).unwrap();
let data = OwnedBytes::new(data);
test_all(data.clone(), u128_vals);
data
}
fn test_aux_vals(u128_vals: &[u128]) -> OwnedBytes {
test_aux_vals_opt(&u128_vals.iter().cloned().map(Some).collect::<Vec<_>>())
}
#[test]
fn test_range_1() {
let vals = &[
@@ -694,6 +701,7 @@ mod tests {
let vals = &[1_000_000_000u128; 100];
let _data = test_aux_vals(vals);
}
use itertools::Itertools;
use proptest::prelude::*;
fn num_strategy() -> impl Strategy<Value = u128> {

View File

@@ -38,6 +38,6 @@ impl Ord for BlankRange {
}
impl PartialOrd for BlankRange {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.blank_size().cmp(&other.blank_size))
Some(self.blank_size().cmp(&other.blank_size()))
}
}

View File

@@ -4,7 +4,7 @@ use std::ops::RangeInclusive;
use itertools::Itertools;
use super::blank_range::BlankRange;
use super::CompactSpace;
use super::{CompactSpace, RangeMapping};
/// Put the blanks for the sorted values into a binary heap
fn get_blanks(values_sorted: &BTreeSet<u128>) -> BinaryHeap<BlankRange> {
@@ -78,15 +78,15 @@ pub fn get_compact_space(
total_num_values: usize,
cost_per_blank: usize,
) -> CompactSpace {
let mut blanks: BinaryHeap<BlankRange> = get_blanks(values_deduped_sorted);
let mut amplitude_compact_space = u128::MAX;
let mut amplitude_bits: u8 = num_bits(amplitude_compact_space);
let mut compact_space = CompactSpaceBuilder::new();
if values_deduped_sorted.is_empty() {
return compact_space.finish();
}
let mut blanks: BinaryHeap<BlankRange> = get_blanks(values_deduped_sorted);
let mut amplitude_compact_space = u128::MAX;
let mut amplitude_bits: u8 = num_bits(amplitude_compact_space);
let mut blank_collector = BlankCollector::new();
// We will stage blanks until they reduce the compact space by 1 bit.
// Binary heap to process the gaps by their size
@@ -166,49 +166,37 @@ impl CompactSpaceBuilder {
// sort by start. ranges are not allowed to overlap
self.blanks.sort_unstable_by_key(|blank| *blank.start());
// Between the blanks
let mut covered_space = self
.blanks
.iter()
.tuple_windows()
.map(|(left, right)| {
assert!(
left.end() < right.start(),
"overlapping or adjacent ranges detected"
);
*left.end() + 1..=*right.start() - 1
})
.collect::<Vec<_>>();
let mut covered_space = Vec::with_capacity(self.blanks.len());
// Outside the blanks
// begining of the blanks
if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start) {
if *first_blank_start != 0 {
covered_space.insert(0, 0..=first_blank_start - 1);
covered_space.push(0..=first_blank_start - 1);
}
}
// Between the blanks
let between_blanks = self.blanks.iter().tuple_windows().map(|(left, right)| {
assert!(
left.end() < right.start(),
"overlapping or adjacent ranges detected"
);
*left.end() + 1..=*right.start() - 1
});
covered_space.extend(between_blanks);
// end of the blanks
if let Some(last_blank_end) = self.blanks.last().map(RangeInclusive::end) {
if *last_blank_end != u128::MAX {
covered_space.push(last_blank_end + 1..=u128::MAX);
}
}
// Extend the first range and assign the null value to it.
let null_value = if let Some(first_covered_space) = covered_space.first_mut() {
// in case the first covered space ends at u128::MAX, assign null to the beginning
if *first_covered_space.end() == u128::MAX {
*first_covered_space = first_covered_space.start() - 1..=*first_covered_space.end();
*first_covered_space.start()
} else {
*first_covered_space = *first_covered_space.start()..=first_covered_space.end() + 1;
*first_covered_space.end()
}
} else {
if covered_space.is_empty() {
covered_space.push(0..=0); // empty data case
0u128
};
let mut compact_start: u64 = 0;
let mut compact_start: u64 = 1; // 0 is reserved for `null`
let mut ranges_mapping: Vec<RangeMapping> = Vec::with_capacity(covered_space.len());
for cov in covered_space {
let range_mapping = super::RangeMapping {
@@ -219,10 +207,7 @@ impl CompactSpaceBuilder {
ranges_mapping.push(range_mapping);
compact_start += covered_range_len as u64;
}
CompactSpace {
ranges_mapping,
null_value,
}
CompactSpace { ranges_mapping }
}
}