From bc859471050768972ae44c32a8ae21ee5010ade8 Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Thu, 8 Sep 2022 13:11:52 +0800
Subject: [PATCH 01/26] add ip codec

---
 common/src/lib.rs                     |   5 +-
 common/src/vint.rs                    |  90 +++
 fastfield_codecs/src/column.rs        |  42 ++
 fastfield_codecs/src/compact_space.rs | 783 ++++++++++++++++++++++++++
 fastfield_codecs/src/lib.rs           |  26 +
 5 files changed, 945 insertions(+), 1 deletion(-)
 create mode 100644 fastfield_codecs/src/compact_space.rs

diff --git a/common/src/lib.rs b/common/src/lib.rs
index 0a21fbbc5..4463c46e1 100644
--- a/common/src/lib.rs
+++ b/common/src/lib.rs
@@ -11,7 +11,10 @@ mod writer;

 pub use bitset::*;
 pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
-pub use vint::{read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt};
+pub use vint::{
+    deserialize_vint_u128, read_u32_vint, read_u32_vint_no_advance, serialize_vint_u128,
+    serialize_vint_u32, write_u32_vint, VInt, VIntU128,
+};
 pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite};

 /// Has length trait
diff --git a/common/src/vint.rs b/common/src/vint.rs
index 0385a7f6a..de11451df 100644
--- a/common/src/vint.rs
+++ b/common/src/vint.rs
@@ -5,6 +5,75 @@ use byteorder::{ByteOrder, LittleEndian};

 use super::BinarySerializable;

+/// Serializes a u128 as a variable-length int.
+pub fn serialize_vint_u128(mut val: u128, output: &mut Vec<u8>) {
+    loop {
+        let next_byte: u8 = (val % 128u128) as u8;
+        val /= 128u128;
+        if val == 0 {
+            output.push(next_byte | STOP_BIT);
+            return;
+        } else {
+            output.push(next_byte);
+        }
+    }
+}
+
+/// Deserializes a u128 number.
+///
+/// Returns the number and the slice after the vint.
+pub fn deserialize_vint_u128(data: &[u8]) -> io::Result<(u128, &[u8])> {
+    let mut result = 0u128;
+    let mut shift = 0u64;
+    // a u128 vint spans at most 19 bytes (19 * 7 bits >= 128 bits)
+    for i in 0..19 {
+        let b = data[i];
+        result |= u128::from(b % 128u8) << shift;
+        if b >= STOP_BIT {
+            return Ok((result, &data[i + 1..]));
+        }
+        shift += 7;
+    }
+    Err(io::Error::new(
+        io::ErrorKind::InvalidData,
+        "Failed to deserialize u128 vint",
+    ))
+}
+
+/// Wrapper over a `u128` that serializes as a variable int.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub struct VIntU128(pub u128);
+
+impl BinarySerializable for VIntU128 {
+    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
+        let mut buffer = vec![];
+        serialize_vint_u128(self.0, &mut buffer);
+        writer.write_all(&buffer)
+    }
+
+    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
+        let mut bytes = reader.bytes();
+        let mut result = 0u128;
+        let mut shift = 0u64;
+        loop {
+            match bytes.next() {
+                Some(Ok(b)) => {
+                    result |= u128::from(b % 128u8) << shift;
+                    if b >= STOP_BIT {
+                        return Ok(VIntU128(result));
+                    }
+                    shift += 7;
+                }
+                _ => {
+                    return Err(io::Error::new(
+                        io::ErrorKind::InvalidData,
+                        "Reached end of buffer while reading VInt",
+                    ));
+                }
+            }
+        }
+    }
+}
+
 /// Wrapper over a `u64` that serializes as a variable int.
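+// A worked example of the byte layout shared by the vint types in this file
+// (7 payload bits per byte, least significant group first, stop bit 0x80 set
+// on the last byte):
+//
+//   300 = 0b1_0010_1100  ->  [0x2C, 0x82]
+//   0x2C = 300 % 128, 0x82 = (300 / 128) | 0x80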
 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
 pub struct VInt(pub u64);

@@ -176,6 +245,7 @@ impl BinarySerializable for VInt {
 mod tests {

     use super::{serialize_vint_u32, BinarySerializable, VInt};
+    use crate::vint::{deserialize_vint_u128, serialize_vint_u128, VIntU128};

     fn aux_test_vint(val: u64) {
         let mut v = [14u8; 10];
@@ -217,6 +287,26 @@ mod tests {
         assert_eq!(&buffer[..len_vint], res2, "array wrong for {}", val);
     }

+    fn aux_test_vint_u128(val: u128) {
+        let mut data = vec![];
+        serialize_vint_u128(val, &mut data);
+        let (deser_val, _data) = deserialize_vint_u128(&data).unwrap();
+        assert_eq!(val, deser_val);
+
+        let mut out = vec![];
+        VIntU128(val).serialize(&mut out).unwrap();
+        let deser_val = VIntU128::deserialize(&mut &out[..]).unwrap();
+        assert_eq!(val, deser_val.0);
+    }
+
+    #[test]
+    fn test_vint_u128() {
+        aux_test_vint_u128(0);
+        aux_test_vint_u128(1);
+        aux_test_vint_u128(u128::MAX / 3);
+        aux_test_vint_u128(u128::MAX);
+    }
+
     #[test]
     fn test_vint_u32() {
         aux_test_serialize_vint_u32(0);
diff --git a/fastfield_codecs/src/column.rs b/fastfield_codecs/src/column.rs
index e9d2e9db5..00e0c092b 100644
--- a/fastfield_codecs/src/column.rs
+++ b/fastfield_codecs/src/column.rs
@@ -1,4 +1,5 @@
 use std::marker::PhantomData;
+use std::ops::RangeInclusive;
 use std::sync::Mutex;

 use tantivy_bitpacker::minmax;
@@ -52,6 +53,47 @@ pub trait Column<T = u64>: Send + Sync {
     }
 }

+/// Concept of the new Column API, which better accounts for null values.
+pub trait ColumnV2<T = u64> {
+    /// Return the value associated with the given idx.
+    ///
+    /// This accessor should return as fast as possible.
+    ///
+    /// # Panics
+    ///
+    /// May panic if `idx` is greater than the column length.
+    fn get_val(&self, idx: u64) -> Option<T>;
+
+    /// Returns the minimum value for this fast field.
+    ///
+    /// This min_value may not be exact.
+    /// For instance, the min value does not take into account possibly
+    /// deleted documents. All values are however guaranteed to be greater
+    /// than or equal to `.min_value()`.
+    fn min_value(&self) -> T;
+
+    /// Returns the maximum value for this fast field.
+    ///
+    /// This max_value may not be exact.
+    /// For instance, the max value does not take into account possibly
+    /// deleted documents. All values are however guaranteed to be lower
+    /// than or equal to `.max_value()`.
+    fn max_value(&self) -> T;
+
+    fn num_vals(&self) -> u64;
+
+    /// Returns an iterator over the data
+    fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = Option<T>> + 'a> {
+        Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
+    }
+}
+
+/// Extends the ColumnV2 API.
+pub trait ColumnV2Ext<T = u64>: ColumnV2<T> {
+    /// Return the positions of values which are in the provided range.
+    fn get_between_vals(&self, range: RangeInclusive<T>) -> Vec<u64>;
+}
+
 pub struct VecColumn<'a, T = u64> {
     values: &'a [T],
     min_value: T,
diff --git a/fastfield_codecs/src/compact_space.rs b/fastfield_codecs/src/compact_space.rs
new file mode 100644
index 000000000..dbab2473a
--- /dev/null
+++ b/fastfield_codecs/src/compact_space.rs
@@ -0,0 +1,783 @@
+/// This codec takes a large number space (u128) and reduces it to a compact number space.
+///
+/// It will find spaces in the numer range. For example:
+///
+/// 100, 101, 102, 103, 104, 50000, 50001
+/// could be mapped to
+/// 100..104 -> 0..4
+/// 50000..50001 -> 5..6
+///
+/// Compact space 0..=6 requires far fewer bits than 100..=50001.
+///
+/// The codec is created to compress ip addresses, but may be employed in other use cases.
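+///
+/// A sketch of the intended round-trip usage, mirroring the tests below
+/// (doctest is `ignore`d; the API surface may still change):
+///
+/// ```ignore
+/// use ownedbytes::OwnedBytes;
+///
+/// let vals: Vec<u128> = vec![100, 101, 102, 103, 104, 50000, 50001];
+/// let compressor = CompactSpaceCompressor::train_from(vals.clone());
+/// let bytes = compressor.compress(vals.iter().cloned()).unwrap();
+/// let decompressor = CompactSpaceDecompressor::open(OwnedBytes::new(bytes)).unwrap();
+/// assert_eq!(decompressor.get(5), Some(50000));
+/// ```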
+use std::{
+    cmp::Ordering,
+    collections::BinaryHeap,
+    io::{self, Write},
+    net::{IpAddr, Ipv6Addr},
+    ops::RangeInclusive,
+};
+
+use common::{BinarySerializable, CountingWriter, VIntU128};
+use ownedbytes::OwnedBytes;
+use tantivy_bitpacker::{self, BitPacker, BitUnpacker};
+
+use crate::column::{ColumnV2, ColumnV2Ext};
+
+pub fn ip_to_u128(ip_addr: IpAddr) -> u128 {
+    let ip_addr_v6: Ipv6Addr = match ip_addr {
+        IpAddr::V4(v4) => v4.to_ipv6_mapped(),
+        IpAddr::V6(v6) => v6,
+    };
+    u128::from_be_bytes(ip_addr_v6.octets())
+}
+
+const INTERVAL_COST_IN_BITS: usize = 64;
+
+/// Store blank size and position. Order by blank size.
+///
+/// A blank is an unoccupied space in the data.
+/// E.g. [100, 201] would have a `BlankSizeAndPos { blank_size: 101, pos: 1 }`.
+#[derive(Debug, Eq, PartialEq, Clone, Copy)]
+struct BlankSizeAndPos {
+    blank_size: u128,
+    /// Position in the sorted data.
+    pos: usize,
+}
+impl BlankSizeAndPos {
+    fn new(blank: u128, pos: usize) -> Self {
+        BlankSizeAndPos {
+            blank_size: blank,
+            pos,
+        }
+    }
+}
+
+impl Ord for BlankSizeAndPos {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self.blank_size.cmp(&other.blank_size)
+    }
+}
+impl PartialOrd for BlankSizeAndPos {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        self.blank_size.partial_cmp(&other.blank_size)
+    }
+}
+
+#[test]
+fn test_delta_and_pos_sort() {
+    let mut deltas: BinaryHeap<BlankSizeAndPos> = BinaryHeap::new();
+    deltas.push(BlankSizeAndPos::new(10, 1));
+    deltas.push(BlankSizeAndPos::new(100, 10));
+    deltas.push(BlankSizeAndPos::new(1, 10));
+    assert_eq!(deltas.pop().unwrap().blank_size, 100);
+    assert_eq!(deltas.pop().unwrap().blank_size, 10);
+}
+
+/// Put the deltas for the sorted values into a binary heap
+fn get_deltas(values_sorted: &[u128]) -> BinaryHeap<BlankSizeAndPos> {
+    let mut prev_opt = None;
+    let mut deltas: BinaryHeap<BlankSizeAndPos> = BinaryHeap::new();
+    for (pos, value) in values_sorted.iter().cloned().enumerate() {
+        let delta = if let Some(prev) = prev_opt {
+            value - prev
+        } else {
+            value + 1
+        };
+        // skip too small deltas
+        if delta > 2 {
+            deltas.push(BlankSizeAndPos::new(delta, pos));
+        }
+        prev_opt = Some(value);
+    }
+    deltas
+}
+
+struct BlankCollector {
+    blanks: Vec<BlankSizeAndPos>,
+    staged_blanks_sum: u128,
+}
+impl BlankCollector {
+    fn new() -> Self {
+        Self {
+            blanks: vec![],
+            staged_blanks_sum: 0,
+        }
+    }
+    fn stage_blank(&mut self, blank: BlankSizeAndPos) {
+        self.staged_blanks_sum += blank.blank_size - 1;
+        self.blanks.push(blank);
+    }
+    fn drain(&mut self) -> std::vec::Drain<'_, BlankSizeAndPos> {
+        self.staged_blanks_sum = 0;
+        self.blanks.drain(..)
+    }
+    fn staged_blanks_sum(&self) -> u128 {
+        self.staged_blanks_sum
+    }
+    fn num_blanks(&self) -> usize {
+        self.blanks.len()
+    }
+}
+
+/// Will collect blanks and add them to compact space if more bits are saved than cost from
+/// metadata.
+fn get_compact_space(values_sorted: &[u128], cost_per_blank: usize) -> CompactSpace {
+    let max_val = *values_sorted.last().unwrap_or(&0u128) + 1;
+    let mut deltas = get_deltas(values_sorted);
+    let mut amplitude_compact_space = max_val;
+    let mut amplitude_bits: u8 = (amplitude_compact_space as f64).log2().ceil() as u8;
+
+    let mut compact_space = CompactSpaceBuilder::new();
+
+    let mut blank_collector = BlankCollector::new();
+    // We will stage blanks until they reduce the compact space by 1 bit.
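+    // e.g. for the module docs' values (100..=104, 50000..=50001), staging the
+    // single blank 105..=49999 shrinks the candidate amplitude from 50002
+    // (16 bits) to 108 (7 bits).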
+    // Binary heap to process the gaps by their size
+    while let Some(delta_and_pos) = deltas.pop() {
+        blank_collector.stage_blank(delta_and_pos);
+
+        let staged_spaces_sum: u128 = blank_collector.staged_blanks_sum();
+        // +1 for the null value added later
+        let amplitude_new_compact_space = amplitude_compact_space - staged_spaces_sum + 1;
+        let amplitude_new_bits = (amplitude_new_compact_space as f64).log2().ceil() as u8;
+        if amplitude_bits == amplitude_new_bits {
+            continue;
+        }
+        let saved_bits = (amplitude_bits - amplitude_new_bits) as usize * values_sorted.len();
+        let cost = blank_collector.num_blanks() * cost_per_blank;
+        if cost >= saved_bits {
+            // Continue here, since although we walk over the deltas by size,
+            // we can potentially save a lot at the last bits, which are smaller deltas
+            //
+            // E.g. if the first range reduces the compact space by 1000 from 2000 to 1000, which
+            // saves 11-10=1 bit and the next range reduces the compact space by 950 to
+            // 50, which saves 10-6=4 bits
+            continue;
+        }
+
+        amplitude_compact_space = amplitude_new_compact_space;
+        amplitude_bits = amplitude_new_bits;
+        for pos in blank_collector
+            .drain()
+            .map(|blank_and_pos| blank_and_pos.pos)
+        {
+            let blank_end = values_sorted[pos] - 1;
+            let blank_start = if pos == 0 {
+                0
+            } else {
+                values_sorted[pos - 1] + 1
+            };
+            compact_space.add_blank(blank_start..=blank_end);
+        }
+    }
+    compact_space.add_blank(max_val..=u128::MAX);
+
+    compact_space.finish()
+}
+
+#[test]
+fn compact_space_test() {
+    let ips = vec![
+        2u128, 4u128, 1000, 1001, 1002, 1003, 1004, 1005, 1008, 1010, 1012, 1260,
+    ];
+    let compact_space = get_compact_space(&ips, 11);
+    assert_eq!(compact_space.null_value, 5);
+    let amplitude = compact_space.amplitude_compact_space();
+    assert_eq!(amplitude, 20);
+    assert_eq!(2, compact_space.to_compact(2).unwrap());
+    assert_eq!(compact_space.to_compact(100).unwrap_err(), 0);
+
+    let mut output = vec![];
+    compact_space.serialize(&mut output).unwrap();
+
+    assert_eq!(
+        compact_space,
+        CompactSpace::deserialize(&mut &output[..]).unwrap()
+    );
+
+    for ip in &ips {
+        let compact = compact_space.to_compact(*ip).unwrap();
+        assert_eq!(compact_space.unpack(compact), *ip);
+    }
+}
+
+#[derive(Debug, Clone, Eq, PartialEq)]
+struct CompactSpaceBuilder {
+    covered_space: Vec<RangeInclusive<u128>>,
+}
+
+impl CompactSpaceBuilder {
+    /// Creates a new compact space builder which will initially cover the whole space.
+    fn new() -> Self {
+        Self {
+            covered_space: vec![0..=u128::MAX],
+        }
+    }
+
+    /// Will extend the first range and assign the null value to it.
+    fn assign_and_return_null(&mut self) -> u128 {
+        self.covered_space[0] = *self.covered_space[0].start()..=*self.covered_space[0].end() + 1;
+        *self.covered_space[0].end()
+    }
+
+    /// Assumes that repeated add_blank calls don't overlap, which will be the case on sorted
+    /// values.
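+    /// e.g. carving the blank 3..=9 out of a covered range 0..=20 leaves the
+    /// two covered ranges [0..=2, 10..=20].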
+    fn add_blank(&mut self, blank: RangeInclusive<u128>) {
+        let position = self
+            .covered_space
+            .iter()
+            .position(|range| range.start() <= blank.start() && range.end() >= blank.end());
+        if let Some(position) = position {
+            let old_range = self.covered_space.remove(position);
+            // Exact match, just remove
+            if old_range == blank {
+                return;
+            }
+            let new_range_end = blank.end().saturating_add(1)..=*old_range.end();
+            if old_range.start() == blank.start() {
+                self.covered_space.insert(position, new_range_end);
+                return;
+            }
+            let new_range_start = *old_range.start()..=blank.start().saturating_sub(1);
+            if old_range.end() == blank.end() {
+                self.covered_space.insert(position, new_range_start);
+                return;
+            }
+            self.covered_space.insert(position, new_range_end);
+            self.covered_space.insert(position, new_range_start);
+        }
+    }
+    fn finish(mut self) -> CompactSpace {
+        let null_value = self.assign_and_return_null();
+
+        let mut compact_start: u64 = 0;
+        let mut ranges_and_compact_start = vec![];
+        for cov in self.covered_space {
+            let covered_range_len = cov.end() - cov.start();
+            ranges_and_compact_start.push((cov, compact_start));
+            compact_start += covered_range_len as u64 + 1;
+        }
+        CompactSpace {
+            ranges_and_compact_start,
+            null_value,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Eq, PartialEq)]
+struct CompactSpace {
+    ranges_and_compact_start: Vec<(RangeInclusive<u128>, u64)>,
+    pub null_value: u128,
+}
+
+impl BinarySerializable for CompactSpace {
+    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
+        VIntU128(self.null_value).serialize(writer)?;
+        VIntU128(self.ranges_and_compact_start.len() as u128).serialize(writer)?;
+
+        let mut prev_value = 0;
+        for (value_range, _compact) in &self.ranges_and_compact_start {
+            let delta = value_range.start() - prev_value;
+            VIntU128(delta).serialize(writer)?;
+            prev_value = *value_range.start();
+
+            let delta = value_range.end() - prev_value;
+            VIntU128(delta).serialize(writer)?;
+            prev_value = *value_range.end();
+        }
+
+        Ok(())
+    }
+
+    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
+        let null_value = VIntU128::deserialize(reader)?.0;
+        let num_values = VIntU128::deserialize(reader)?.0;
+        let mut ranges_and_compact_start: Vec<(RangeInclusive<u128>, u64)> = vec![];
+        let mut value = 0u128;
+        let mut compact = 0u64;
+        for _ in 0..num_values {
+            let delta = VIntU128::deserialize(reader)?.0;
+            value += delta;
+            let value_start = value;
+
+            let delta = VIntU128::deserialize(reader)?.0;
+            value += delta;
+            let value_end = value;
+
+            let compact_delta = value_end - value_start + 1;
+
+            ranges_and_compact_start.push((value_start..=value_end, compact));
+            compact += compact_delta as u64;
+        }
+
+        Ok(Self {
+            null_value,
+            ranges_and_compact_start,
+        })
+    }
+}
+
+impl CompactSpace {
+    fn amplitude_compact_space(&self) -> u128 {
+        let last_range = &self.ranges_and_compact_start[self.ranges_and_compact_start.len() - 1];
+        last_range.1 as u128 + (last_range.0.end() - last_range.0.start()) + 1
+    }
+
+    fn get_range_and_compact_start(&self, pos: usize) -> &(RangeInclusive<u128>, u64) {
+        &self.ranges_and_compact_start[pos]
+    }
+
+    /// Returns either Ok(the value in the compact space) or if it is outside the compact space the
+    /// Err(position on the next larger range above the value)
+    fn to_compact(&self, value: u128) -> Result<u64, usize> {
+        self.ranges_and_compact_start
+            .binary_search_by(|probe| {
+                let value_range = &probe.0;
+                if *value_range.start() <= value && *value_range.end() >= value {
+                    return Ordering::Equal;
+                } else if value < *value_range.start() {
+                    return Ordering::Greater;
+                } else if value > *value_range.end() {
+                    return Ordering::Less;
+                }
+                panic!("the checks above should cover all cases");
+            })
+            .map(|pos| {
+                let (range, compact_start) = &self.ranges_and_compact_start[pos];
+                compact_start + (value - range.start()) as u64
+            })
+            .map_err(|pos| pos - 1)
+    }
+
+    /// Unpacks a value from compact space u64 to u128 space
+    fn unpack(&self, compact: u64) -> u128 {
+        let pos = self
+            .ranges_and_compact_start
+            .binary_search_by_key(&compact, |probe| probe.1)
+            .map_or_else(|e| e - 1, |v| v);
+
+        let range_and_compact_start = &self.ranges_and_compact_start[pos];
+        let diff = compact - self.ranges_and_compact_start[pos].1;
+        range_and_compact_start.0.start() + diff as u128
+    }
+}
+
+pub struct CompactSpaceCompressor {
+    params: IPCodecParams,
+}
+#[derive(Debug, Clone)]
+pub struct IPCodecParams {
+    compact_space: CompactSpace,
+    bit_unpacker: BitUnpacker,
+    null_value_compact_space: u64,
+    null_value: u128,
+    min_value: u128,
+    max_value: u128,
+    num_vals: u64,
+    num_bits: u8,
+}
+
+impl CompactSpaceCompressor {
+    pub fn null_value(&self) -> u128 {
+        self.params.null_value
+    }
+
+    /// Taking the vals as Vec may cost a lot of memory.
+    /// It is used to sort the vals.
+    ///
+    /// Less memory alternative: We could just store the index (u32), and use that as sorting.
+    ///
+    /// TODO: Should we take Option here? (better api, but 24 bytes instead of 16 per element)
+    pub fn train_from(mut vals: Vec<u128>) -> Self {
+        vals.sort();
+        train(&vals)
+    }
+
+    fn to_compact(&self, value: u128) -> u64 {
+        self.params.compact_space.to_compact(value).unwrap()
+    }
+
+    fn write_footer(mut self, writer: &mut impl Write, num_vals: u64) -> io::Result<()> {
+        let writer = &mut CountingWriter::wrap(writer);
+        self.params.num_vals = num_vals;
+        self.params.serialize(writer)?;
+
+        let footer_len = writer.written_bytes() as u32;
+        footer_len.serialize(writer)?;
+
+        Ok(())
+    }
+
+    pub fn compress(self, vals: impl Iterator<Item = u128>) -> io::Result<Vec<u8>> {
+        let mut output = vec![];
+        self.compress_into(vals, &mut output)?;
+        Ok(output)
+    }
+    /// TODO: Should we take Option here? Other wise the caller has to replace None with
+    /// `self.null_value()`
+    pub fn compress_into(
+        self,
+        vals: impl Iterator<Item = u128>,
+        write: &mut impl Write,
+    ) -> io::Result<()> {
+        let mut bitpacker = BitPacker::default();
+        let mut num_vals = 0;
+        for val in vals {
+            let compact = self.to_compact(val);
+            bitpacker
+                .write(compact, self.params.num_bits, write)
+                .unwrap();
+            num_vals += 1;
+        }
+        bitpacker.close(write).unwrap();
+        self.write_footer(write, num_vals)?;
+        Ok(())
+    }
+}
+
+fn train(values_sorted: &[u128]) -> CompactSpaceCompressor {
+    let compact_space = get_compact_space(values_sorted, INTERVAL_COST_IN_BITS);
+    let null_value = compact_space.null_value;
+    let null_compact_space = compact_space
+        .to_compact(null_value)
+        .expect("could not convert null_value to compact space");
+    let amplitude_compact_space = compact_space.amplitude_compact_space();
+
+    assert!(
+        amplitude_compact_space <= u64::MAX as u128,
+        "case unsupported."
+    );
+
+    let num_bits = tantivy_bitpacker::compute_num_bits(amplitude_compact_space as u64);
+    let min_value = *values_sorted.first().unwrap_or(&0);
+    let max_value = *values_sorted.last().unwrap_or(&0);
+    let compressor = CompactSpaceCompressor {
+        params: IPCodecParams {
+            compact_space,
+            bit_unpacker: BitUnpacker::new(num_bits),
+            null_value_compact_space: null_compact_space,
+            null_value,
+            min_value,
+            max_value,
+            num_vals: 0, // don't use values_sorted.len() here since they don't include null values
+            num_bits,
+        },
+    };
+
+    let max_value = *values_sorted.last().unwrap_or(&0u128).max(&null_value);
+    assert_eq!(
+        compressor.to_compact(max_value) + 1,
+        amplitude_compact_space as u64
+    );
+    compressor
+}
+
+#[derive(Debug, Clone)]
+pub struct CompactSpaceDecompressor {
+    data: OwnedBytes,
+    params: IPCodecParams,
+}
+
+impl BinarySerializable for IPCodecParams {
+    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
+        // header flags for future optional dictionary encoding
+        let footer_flags = 0u64;
+        footer_flags.serialize(writer)?;
+
+        let null_value_compact_space = self
+            .compact_space
+            .to_compact(self.null_value)
+            .expect("could not convert null to compact space");
+        VIntU128(null_value_compact_space as u128).serialize(writer)?;
+        VIntU128(self.min_value).serialize(writer)?;
+        VIntU128(self.max_value).serialize(writer)?;
+        VIntU128(self.num_vals as u128).serialize(writer)?;
+        self.num_bits.serialize(writer)?;
+
+        self.compact_space.serialize(writer)?;
+
+        Ok(())
+    }
+
+    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
+        let _header_flags = u64::deserialize(reader)?;
+        let null_value_compact_space = VIntU128::deserialize(reader)?.0 as u64;
+        let min_value = VIntU128::deserialize(reader)?.0;
+        let max_value = VIntU128::deserialize(reader)?.0;
+        let num_vals = VIntU128::deserialize(reader)?.0 as u64;
+        let num_bits = u8::deserialize(reader)?;
+        let compact_space = CompactSpace::deserialize(reader)?;
+        let null_value = compact_space.unpack(null_value_compact_space);
+
+        Ok(Self {
+            null_value,
+            compact_space,
+            bit_unpacker: BitUnpacker::new(num_bits),
+            null_value_compact_space,
+            min_value,
+            max_value,
+            num_vals,
+            num_bits,
+        })
+    }
+}
+
+impl ColumnV2<u128> for CompactSpaceDecompressor {
+    fn get_val(&self, doc: u64) -> Option<u128> {
+        self.get(doc)
+    }
+
+    fn min_value(&self) -> u128 {
+        self.min_value()
+    }
+
+    fn max_value(&self) -> u128 {
+        self.max_value()
+    }
+
+    fn num_vals(&self) -> u64 {
+        self.params.num_vals
+    }
+
+    fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = Option<u128>> + 'a> {
+        Box::new(self.iter())
+    }
+}
+
+impl ColumnV2Ext<u128> for CompactSpaceDecompressor {
+    fn get_between_vals(&self, range: RangeInclusive<u128>) -> Vec<u64> {
+        self.get_range(range)
+    }
+}
+
+impl CompactSpaceDecompressor {
+    pub fn open(data: OwnedBytes) -> io::Result<CompactSpaceDecompressor> {
+        let (data_slice, footer_len_bytes) = data.split_at(data.len() - 4);
+        let footer_len = u32::deserialize(&mut &footer_len_bytes[..])?;
+
+        let data_footer = &data_slice[data_slice.len() - footer_len as usize..];
+        let params = IPCodecParams::deserialize(&mut &data_footer[..])?;
+        let decompressor = CompactSpaceDecompressor { data, params };
+
+        Ok(decompressor)
+    }
+
+    /// Converting to compact space for the decompressor is more complex, since we may get values
+    /// which are outside the compact space. e.g. if we map
+    /// 1000 => 5
+    /// 2000 => 6
+    ///
+    /// and we want a mapping for 1005, there is no equivalent compact space. We instead return an
+    /// error with the index of the covering range just below the value.
+    fn to_compact(&self, value: u128) -> Result<u64, usize> {
+        self.params.compact_space.to_compact(value)
+    }
+
+    fn compact_to_u128(&self, compact: u64) -> u128 {
+        self.params.compact_space.unpack(compact)
+    }
+
+    /// Comparing on compact space: 1.2 GElements/s
+    ///
+    /// Comparing on original space: .06 GElements/s (not completely optimized)
+    pub fn get_range(&self, range: RangeInclusive<u128>) -> Vec<u64> {
+        let from_value = *range.start();
+        let to_value = *range.end();
+        assert!(to_value >= from_value);
+        let compact_from = self.to_compact(from_value);
+        let compact_to = self.to_compact(to_value);
+        // Quick return, if both ranges fall into the same non-mapped space, the range can't cover
+        // any values, so we can early exit
+        match (compact_to, compact_from) {
+            (Err(pos1), Err(pos2)) if pos1 == pos2 => return vec![],
+            _ => {}
+        }
+
+        let compact_from = compact_from.unwrap_or_else(|pos| {
+            let range_and_compact_start =
+                self.params.compact_space.get_range_and_compact_start(pos);
+            let compact_end = range_and_compact_start.1
+                + (range_and_compact_start.0.end() - range_and_compact_start.0.start()) as u64;
+            compact_end + 1
+        });
+        // If there is no compact space, we go to the closest upper-bound compact space
+        let compact_to = compact_to.unwrap_or_else(|pos| {
+            let range_and_compact_start =
+                self.params.compact_space.get_range_and_compact_start(pos);
+            let compact_end = range_and_compact_start.1
+                + (range_and_compact_start.0.end() - range_and_compact_start.0.start()) as u64;
+            compact_end
+        });
+
+        let range = compact_from..=compact_to;
+        let mut positions = vec![];
+
+        for (pos, compact_value) in self
+            .iter_compact()
+            .enumerate()
+            .filter(|(_pos, val)| *val != self.params.null_value_compact_space)
+        {
+            if range.contains(&compact_value) {
+                positions.push(pos as u64);
+            }
+        }
+
+        positions
+    }
+
+    #[inline]
+    fn iter_compact(&self) -> impl Iterator<Item = u64> + '_ {
+        (0..self.params.num_vals)
+            .map(move |idx| self.params.bit_unpacker.get(idx as u64, &self.data) as u64)
+    }
+
+    #[inline]
+    fn iter(&self) -> impl Iterator<Item = Option<u128>> + '_ {
+        // TODO: Performance. It would be better to iterate on the ranges and check existence via
+        // the bit_unpacker.
+        self.iter_compact().map(|compact| {
+            if compact == self.params.null_value_compact_space {
+                None
+            } else {
+                Some(self.compact_to_u128(compact))
+            }
+        })
+    }
+
+    pub fn get(&self, idx: u64) -> Option<u128> {
+        let compact = self.params.bit_unpacker.get(idx, &self.data);
+        if compact == self.params.null_value_compact_space {
+            None
+        } else {
+            Some(self.compact_to_u128(compact))
+        }
+    }
+
+    pub fn min_value(&self) -> u128 {
+        self.params.min_value
+    }
+
+    pub fn max_value(&self) -> u128 {
+        self.params.max_value
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+
+    fn decode_all(data: OwnedBytes) -> Vec<u128> {
+        let decompressor = CompactSpaceDecompressor::open(data).unwrap();
+        let mut u128_vals = Vec::new();
+        for idx in 0..decompressor.params.num_vals as usize {
+            let val = decompressor.get(idx as u64);
+            if let Some(val) = val {
+                u128_vals.push(val);
+            }
+        }
+        u128_vals
+    }
+
+    fn test_aux_vals(u128_vals: &[u128]) -> OwnedBytes {
+        let compressor = CompactSpaceCompressor::train_from(u128_vals.to_vec());
+        let data = compressor.compress(u128_vals.iter().cloned()).unwrap();
+        let data = OwnedBytes::new(data);
+        let decoded_val = decode_all(data.clone());
+        assert_eq!(&decoded_val, u128_vals);
+        data
+    }
+
+    #[test]
+    fn test_range_1() {
+        let vals = &[
+            1u128,
+            100u128,
+            3u128,
+            99999u128,
+            100000u128,
+            100001u128,
+            4_000_211_221u128,
+            4_000_211_222u128,
+            333u128,
+        ];
+        let data = test_aux_vals(vals);
+        let decomp = CompactSpaceDecompressor::open(data).unwrap();
+        let positions = decomp.get_range(0..=1);
+        assert_eq!(positions, vec![0]);
+        let positions = decomp.get_range(0..=2);
+        assert_eq!(positions, vec![0]);
+        let positions = decomp.get_range(0..=3);
+        assert_eq!(positions, vec![0, 2]);
+        assert_eq!(decomp.get_range(99999u128..=99999u128), vec![3]);
+        assert_eq!(decomp.get_range(99998u128..=100000u128), vec![3, 4]);
+        assert_eq!(decomp.get_range(99998u128..=99999u128), vec![3]);
+        assert_eq!(decomp.get_range(99998u128..=99998u128), vec![]);
+        assert_eq!(decomp.get_range(333u128..=333u128), vec![8]);
+        assert_eq!(decomp.get_range(332u128..=333u128), vec![8]);
+        assert_eq!(decomp.get_range(332u128..=334u128), vec![8]);
+        assert_eq!(decomp.get_range(333u128..=334u128), vec![8]);
+
+        assert_eq!(
+            decomp.get_range(4_000_211_221u128..=5_000_000_000u128),
+            vec![6, 7]
+        );
+    }
+
+    #[test]
+    fn test_empty() {
+        let vals = &[];
+        let data = test_aux_vals(vals);
+        let _decomp = CompactSpaceDecompressor::open(data).unwrap();
+    }
+
+    #[test]
+    fn test_range_2() {
+        let vals = &[
+            100u128,
+            99999u128,
+            100000u128,
+            100001u128,
+            4_000_211_221u128,
+            4_000_211_222u128,
+            333u128,
+        ];
+        let data = test_aux_vals(vals);
+        let decomp = CompactSpaceDecompressor::open(data).unwrap();
+        let positions = decomp.get_range(0..=5);
+        assert_eq!(positions, vec![]);
+        let positions = decomp.get_range(0..=100);
+        assert_eq!(positions, vec![0]);
+        let positions = decomp.get_range(0..=105);
+        assert_eq!(positions, vec![0]);
+    }
+
+    #[test]
+    fn test_null() {
+        let vals = &[2u128];
+        let compressor = CompactSpaceCompressor::train_from(vals.to_vec());
+        let vals = vec![compressor.null_value(), 2u128];
+        let data = compressor.compress(vals.iter().cloned()).unwrap();
+        let decomp = CompactSpaceDecompressor::open(OwnedBytes::new(data)).unwrap();
+        let positions = decomp.get_range(0..=1);
+        assert_eq!(positions, vec![]);
+        let positions = decomp.get_range(2..=2);
+        assert_eq!(positions, vec![1]);
+    }
+
+    #[test]
+    fn test_first_large_gaps() {
+        let vals = &[1_000_000_000u128; 100];
+        let _data =
+            test_aux_vals(vals);
+    }
+    use proptest::prelude::*;
+
+    proptest! {
+
+        #[test]
+        fn compress_decompress_random(vals in proptest::collection::vec(any::<u128>()
+            , 1..1000)) {
+            let _data = test_aux_vals(&vals);
+        }
+    }
+}
diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs
index bf4a1ad34..a1ee70d78 100644
--- a/fastfield_codecs/src/lib.rs
+++ b/fastfield_codecs/src/lib.rs
@@ -17,6 +17,7 @@ use serialize::Header;

 mod bitpacked;
 mod blockwise_linear;
+mod compact_space;
 mod line;
 mod linear;
 mod monotonic_mapping;
@@ -28,6 +29,7 @@ mod serialize;
 use self::bitpacked::BitpackedCodec;
 use self::blockwise_linear::BlockwiseLinearCodec;
 pub use self::column::{monotonic_map_column, Column, VecColumn};
+pub use self::compact_space::{ip_to_u128, CompactSpaceCompressor, CompactSpaceDecompressor};
 use self::linear::LinearCodec;
 pub use self::monotonic_mapping::MonotonicallyMappableToU64;
 use self::serialize::NormalizedHeader;
@@ -338,6 +340,8 @@ mod bench {
     use rand::prelude::*;
     use test::{self, Bencher};

+    use super::*;
+    use crate::column::ColumnV2;
     use crate::Column;

     // Warning: this generates the same permutation at each call
@@ -381,6 +385,28 @@ mod bench {
         });
     }

+    #[bench]
+    fn bench_intfastfield_jumpy_fflookup_u128(b: &mut Bencher) {
+        let permutation = generate_permutation();
+        let n = permutation.len();
+        let permutation = permutation.iter().map(|el| *el as u128).collect::<Vec<_>>();
+
+        let compressor = CompactSpaceCompressor::train_from(permutation.to_vec());
+        let data = compressor.compress(permutation.iter().cloned()).unwrap();
+        let data = OwnedBytes::new(data);
+
+        let column: Arc<dyn ColumnV2<u128>> =
+            Arc::new(CompactSpaceDecompressor::open(data).unwrap());
+
+        b.iter(|| {
+            let mut a = 0u128;
+            for _ in 0..n {
+                a = column.get_val(a as u64).unwrap();
+            }
+            a
+        });
+    }
+
     #[bench]
     fn bench_intfastfield_stride7_vec(b: &mut Bencher) {
         let permutation = generate_permutation();

From ced21b87911e86d43e042db845387d37a1e1cabf Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Fri, 9 Sep 2022 16:35:13 +0800
Subject: [PATCH 02/26] move tests

---
 fastfield_codecs/src/compact_space.rs | 111 ++++++++++++++------------
 1 file changed, 59 insertions(+), 52 deletions(-)

diff --git a/fastfield_codecs/src/compact_space.rs b/fastfield_codecs/src/compact_space.rs
index dbab2473a..25d484a62 100644
--- a/fastfield_codecs/src/compact_space.rs
+++ b/fastfield_codecs/src/compact_space.rs
@@ -44,14 +44,6 @@ struct BlankSizeAndPos {
     /// Position in the sorted data.
     pos: usize,
 }
-impl BlankSizeAndPos {
-    fn new(blank: u128, pos: usize) -> Self {
-        BlankSizeAndPos {
-            blank_size: blank,
-            pos,
-        }
-    }
-}

 impl Ord for BlankSizeAndPos {
     fn cmp(&self, other: &Self) -> std::cmp::Ordering {
         self.blank_size.cmp(&other.blank_size)
@@ -64,29 +56,19 @@ impl PartialOrd for BlankSizeAndPos {
     }
 }

-#[test]
-fn test_delta_and_pos_sort() {
-    let mut deltas: BinaryHeap<BlankSizeAndPos> = BinaryHeap::new();
-    deltas.push(BlankSizeAndPos::new(10, 1));
-    deltas.push(BlankSizeAndPos::new(100, 10));
-    deltas.push(BlankSizeAndPos::new(1, 10));
-    assert_eq!(deltas.pop().unwrap().blank_size, 100);
-    assert_eq!(deltas.pop().unwrap().blank_size, 10);
-}
-
 /// Put the deltas for the sorted values into a binary heap
 fn get_deltas(values_sorted: &[u128]) -> BinaryHeap<BlankSizeAndPos> {
     let mut prev_opt = None;
     let mut deltas: BinaryHeap<BlankSizeAndPos> = BinaryHeap::new();
     for (pos, value) in values_sorted.iter().cloned().enumerate() {
-        let delta = if let Some(prev) = prev_opt {
+        let blank_size = if let Some(prev) = prev_opt {
             value - prev
         } else {
             value + 1
         };
         // skip too small deltas
-        if delta > 2 {
-            deltas.push(BlankSizeAndPos::new(delta, pos));
+        if blank_size > 2 {
+            deltas.push(BlankSizeAndPos { blank_size, pos });
         }
         prev_opt = Some(value);
     }
     deltas
@@ -120,13 +102,17 @@ impl BlankCollector {
     }
 }

+fn num_bits(val: u128) -> u8 {
+    (128u32 - val.leading_zeros()) as u8
+}
+
 /// Will collect blanks and add them to compact space if more bits are saved than cost from
 /// metadata.
 fn get_compact_space(values_sorted: &[u128], cost_per_blank: usize) -> CompactSpace {
-    let max_val = *values_sorted.last().unwrap_or(&0u128) + 1;
+    let max_val_incl_null = *values_sorted.last().unwrap_or(&0u128) + 1;
     let mut deltas = get_deltas(values_sorted);
-    let mut amplitude_compact_space = max_val;
-    let mut amplitude_bits: u8 = (amplitude_compact_space as f64).log2().ceil() as u8;
+    let mut amplitude_compact_space = max_val_incl_null;
+    let mut amplitude_bits: u8 = num_bits(amplitude_compact_space);

     let mut compact_space = CompactSpaceBuilder::new();
@@ -139,7 +125,7 @@ fn get_compact_space(values_sorted: &[u128], cost_per_blank: usize) -> CompactSpace {
         let staged_spaces_sum: u128 = blank_collector.staged_blanks_sum();
         // +1 for the null value added later
         let amplitude_new_compact_space = amplitude_compact_space - staged_spaces_sum + 1;
-        let amplitude_new_bits = (amplitude_new_compact_space as f64).log2().ceil() as u8;
+        let amplitude_new_bits = num_bits(amplitude_new_compact_space);
         if amplitude_bits == amplitude_new_bits {
             continue;
         }
@@ -170,37 +156,13 @@ fn get_compact_space(values_sorted: &[u128], cost_per_blank: usize) -> CompactSpace {
             compact_space.add_blank(blank_start..=blank_end);
         }
     }
-    compact_space.add_blank(max_val..=u128::MAX);
+    if max_val_incl_null != u128::MAX {
+        compact_space.add_blank(max_val_incl_null..=u128::MAX);
+    }

     compact_space.finish()
 }

-#[test]
-fn compact_space_test() {
-    let ips = vec![
-        2u128, 4u128, 1000, 1001, 1002, 1003, 1004, 1005, 1008, 1010, 1012, 1260,
-    ];
-    let compact_space = get_compact_space(&ips, 11);
-    assert_eq!(compact_space.null_value, 5);
-    let amplitude = compact_space.amplitude_compact_space();
-    assert_eq!(amplitude, 20);
-    assert_eq!(2, compact_space.to_compact(2).unwrap());
-    assert_eq!(compact_space.to_compact(100).unwrap_err(), 0);
-
-    let mut output = vec![];
-    compact_space.serialize(&mut output).unwrap();
-
-    assert_eq!(
-        compact_space,
-        CompactSpace::deserialize(&mut &output[..]).unwrap()
-    );
-
-    for ip in &ips {
-        let compact = compact_space.to_compact(*ip).unwrap();
-        assert_eq!(compact_space.unpack(compact), *ip);
-    }
-}
-
 #[derive(Debug, Clone, Eq, PartialEq)]
 struct CompactSpaceBuilder {
     covered_space: Vec<RangeInclusive<u128>>,
@@ -667,6 +629,51 @@ mod tests {

     use super::*;

+    #[test]
+    fn test_binary_heap_pop_order() {
+        let mut deltas: BinaryHeap<BlankSizeAndPos> = BinaryHeap::new();
+        deltas.push(BlankSizeAndPos {
+            blank_size: 10,
+            pos: 1,
+        });
+        deltas.push(BlankSizeAndPos {
+            blank_size: 100,
+            pos: 10,
+        });
+        deltas.push(BlankSizeAndPos {
+            blank_size: 1,
+            pos: 10,
+        });
+        assert_eq!(deltas.pop().unwrap().blank_size, 100);
+        assert_eq!(deltas.pop().unwrap().blank_size, 10);
+    }
+
+    #[test]
+    fn compact_space_test() {
+        let ips = vec![
+            2u128, 4u128, 1000, 1001, 1002, 1003, 1004, 1005, 1008, 1010, 1012, 1260,
+        ];
+        let compact_space = get_compact_space(&ips, 11);
+        assert_eq!(compact_space.null_value, 5);
+        let amplitude = compact_space.amplitude_compact_space();
+        assert_eq!(amplitude, 20);
+        assert_eq!(2, compact_space.to_compact(2).unwrap());
+        assert_eq!(compact_space.to_compact(100).unwrap_err(), 0);
+
+        let mut output = vec![];
+        compact_space.serialize(&mut output).unwrap();
+
+        assert_eq!(
+            compact_space,
+            CompactSpace::deserialize(&mut &output[..]).unwrap()
+        );
+
+        for ip in &ips {
+            let compact = compact_space.to_compact(*ip).unwrap();
+            assert_eq!(compact_space.unpack(compact), *ip);
+        }
+    }
+
     fn decode_all(data: OwnedBytes) -> Vec<u128> {
         let decompressor = CompactSpaceDecompressor::open(data).unwrap();
         let mut u128_vals = Vec::new();

From 63b24200585a632f9b77dc073a2c4431f70d7a87 Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Mon, 12 Sep 2022 10:00:29 +0800
Subject: [PATCH 03/26] fix get_range

change blank handling
optimize blank collection
fix off by one errors
extend tests
fix get_range
dedupe values to save space
add bench
---
 fastfield_codecs/Cargo.toml           |   4 +-
 fastfield_codecs/src/compact_space.rs | 447 +++++++++++++++++---------
 fastfield_codecs/src/main.rs          | 131 +++++++-
 3 files changed, 430 insertions(+), 152 deletions(-)

diff --git a/fastfield_codecs/Cargo.toml b/fastfield_codecs/Cargo.toml
index cddee15de..f66f91c96 100644
--- a/fastfield_codecs/Cargo.toml
+++ b/fastfield_codecs/Cargo.toml
@@ -16,6 +16,8 @@ prettytable-rs = {version="0.9.0", optional= true}
 rand = {version="0.8.3", optional= true}
 fastdivide = "0.4"
 log = "0.4"
+itertools = { version="0.10.3", optional=true}
+measure_time = { version="0.8.2", optional=true}

 [dev-dependencies]
 more-asserts = "0.3.0"
@@ -23,7 +25,7 @@ proptest = "1.0.0"
 rand = "0.8.3"

 [features]
-bin = ["prettytable-rs", "rand"]
+bin = ["prettytable-rs", "rand", "itertools", "measure_time"]
 default = ["bin"]
 unstable = []

diff --git a/fastfield_codecs/src/compact_space.rs b/fastfield_codecs/src/compact_space.rs
index 25d484a62..091c01771 100644
--- a/fastfield_codecs/src/compact_space.rs
+++ b/fastfield_codecs/src/compact_space.rs
@@ -1,6 +1,6 @@
 /// This codec takes a large number space (u128) and reduces it to a compact number space.
 ///
-/// It will find spaces in the numer range. For example:
+/// It will find spaces in the number range. For example:
 ///
 /// 100, 101, 102, 103, 104, 50000, 50001
 /// could be mapped to
@@ -13,6 +13,7 @@
 use std::{
     cmp::Ordering,
     collections::BinaryHeap,
+    convert::{TryFrom, TryInto},
     io::{self, Write},
     net::{IpAddr, Ipv6Addr},
     ops::RangeInclusive,
 };

 use common::{BinarySerializable, CountingWriter, VIntU128};
 use ownedbytes::OwnedBytes;
 use tantivy_bitpacker::{self, BitPacker, BitUnpacker};

 use crate::column::{ColumnV2, ColumnV2Ext};

 pub fn ip_to_u128(ip_addr: IpAddr) -> u128 {
     let ip_addr_v6: Ipv6Addr = match ip_addr {
         IpAddr::V4(v4) => v4.to_ipv6_mapped(),
         IpAddr::V6(v6) => v6,
     };
     u128::from_be_bytes(ip_addr_v6.octets())
 }

-const INTERVAL_COST_IN_BITS: usize = 64;
+/// The cost per blank is hard to pin down exactly: blanks are delta encoded, so the actual cost of
+/// a blank depends on how many blanks there are.
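+/// (Each kept blank adds one covered range, i.e. two vint-encoded deltas, to the
+/// footer; see `CompactSpace::serialize`.)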
+///
+/// The number is taken by looking at a real dataset. It is optimized for larger datasets.
+const COST_PER_BLANK_IN_BITS: usize = 36;

-/// Store blank size and position. Order by blank size.
+/// The range of a blank in value space.
 ///
 /// A blank is an unoccupied space in the data.
-/// E.g. [100, 201] would have a `BlankSizeAndPos { blank_size: 101, pos: 1 }`.
-#[derive(Debug, Eq, PartialEq, Clone, Copy)]
-struct BlankSizeAndPos {
-    blank_size: u128,
-    /// Position in the sorted data.
-    pos: usize,
+/// Ordered by size
+///
+/// Use try_into(); invalid ranges will be rejected.
+///
+/// TODO: move to own module to force try_into
+#[derive(Debug, Eq, PartialEq, Clone)]
+struct BlankRange {
+    blank_range: RangeInclusive<u128>,
 }
+impl TryFrom<RangeInclusive<u128>> for BlankRange {
+    type Error = &'static str;
+    fn try_from(range: RangeInclusive<u128>) -> Result<Self, Self::Error> {
+        let blank_size = range.end().saturating_sub(*range.start());
+        if blank_size < 2 {
+            Err("invalid range")
+        } else {
+            Ok(BlankRange { blank_range: range })
+        }
+    }
+}
+impl BlankRange {
+    fn blank_size(&self) -> u128 {
+        self.blank_range.end() - self.blank_range.start()
+    }
+}

-impl Ord for BlankSizeAndPos {
+impl Ord for BlankRange {
     fn cmp(&self, other: &Self) -> std::cmp::Ordering {
-        self.blank_size.cmp(&other.blank_size)
+        self.blank_size().cmp(&other.blank_size())
     }
 }
-impl PartialOrd for BlankSizeAndPos {
+impl PartialOrd for BlankRange {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        self.blank_size.partial_cmp(&other.blank_size)
+        self.blank_size().partial_cmp(&other.blank_size())
     }
 }

-/// Put the deltas for the sorted values into a binary heap
-fn get_deltas(values_sorted: &[u128]) -> BinaryHeap<BlankSizeAndPos> {
+/// Put the blanks for the sorted values into a binary heap
+fn get_blanks(values_sorted: &[u128]) -> BinaryHeap<BlankRange> {
+    let mut blanks: BinaryHeap<BlankRange> = BinaryHeap::new();
+    let mut add_range = |blank_range: RangeInclusive<u128>| {
+        let blank_range: Result<BlankRange, _> = blank_range.try_into();
+        if let Ok(blank_range) = blank_range {
+            blanks.push(blank_range);
+        }
+    };
+    for values in values_sorted.windows(2) {
+        let blank_range = values[0] + 1..=values[1] - 1;
+        add_range(blank_range);
+    }
+    if let Some(first_val) = values_sorted.first().filter(|first_val| **first_val != 0) {
+        let blank_range = 0..=first_val - 1;
+        add_range(blank_range);
+    }
+
+    if let Some(last_val) = values_sorted
+        .last()
+        .filter(|last_val| **last_val != u128::MAX)
+    {
+        let blank_range = last_val + 1..=u128::MAX;
+        add_range(blank_range);
+    }
+    blanks
 }

 struct BlankCollector {
-    blanks: Vec<BlankSizeAndPos>,
+    blanks: Vec<BlankRange>,
     staged_blanks_sum: u128,
 }
 impl BlankCollector {
     fn new() -> Self {
         Self {
             blanks: vec![],
             staged_blanks_sum: 0,
         }
     }
-    fn stage_blank(&mut self, blank: BlankSizeAndPos) {
-        self.staged_blanks_sum += blank.blank_size - 1;
+    fn stage_blank(&mut self, blank: BlankRange) {
+        self.staged_blanks_sum += blank.blank_size();
         self.blanks.push(blank);
     }
-    fn drain(&mut self) -> std::vec::Drain<'_, BlankSizeAndPos> {
+    fn drain(&mut self) -> std::vec::Drain<'_, BlankRange> {
         self.staged_blanks_sum = 0;
         self.blanks.drain(..)
     }
     fn staged_blanks_sum(&self) -> u128 {
         self.staged_blanks_sum
     }
     fn num_blanks(&self) -> usize {
         self.blanks.len()
     }
 }
-
 fn num_bits(val: u128) -> u8 {
     (128u32 - val.leading_zeros()) as u8
 }

 /// Will collect blanks and add them to compact space if more bits are saved than cost from
 /// metadata.
-fn get_compact_space(values_sorted: &[u128], cost_per_blank: usize) -> CompactSpace {
-    let max_val_incl_null = *values_sorted.last().unwrap_or(&0u128) + 1;
-    let mut deltas = get_deltas(values_sorted);
-    let mut amplitude_compact_space = max_val_incl_null;
+fn get_compact_space(
+    values_deduped_sorted: &[u128],
+    total_num_values: usize,
+    cost_per_blank: usize,
+) -> CompactSpace {
+    let mut blanks = get_blanks(values_deduped_sorted);
+    let mut amplitude_compact_space = u128::MAX;
     let mut amplitude_bits: u8 = num_bits(amplitude_compact_space);

     let mut compact_space = CompactSpaceBuilder::new();
+    if values_deduped_sorted.is_empty() {
+        return compact_space.finish();
+    }

     let mut blank_collector = BlankCollector::new();
     // We will stage blanks until they reduce the compact space by 1 bit.
     // Binary heap to process the gaps by their size
-    while let Some(delta_and_pos) = deltas.pop() {
-        blank_collector.stage_blank(delta_and_pos);
+    while let Some(blank_range) = blanks.pop() {
+        blank_collector.stage_blank(blank_range);

         let staged_spaces_sum: u128 = blank_collector.staged_blanks_sum();
         // +1 for the null value added later
         let amplitude_new_compact_space = amplitude_compact_space - staged_spaces_sum + 1;
         let amplitude_new_bits = num_bits(amplitude_new_compact_space);
         if amplitude_bits == amplitude_new_bits {
             continue;
         }
-        let saved_bits = (amplitude_bits - amplitude_new_bits) as usize * values_sorted.len();
+        let saved_bits = (amplitude_bits - amplitude_new_bits) as usize * total_num_values;
+        // TODO: Maybe calculate exact cost of blanks and run this more expensive computation only,
+        // when amplitude_new_bits changes
         let cost = blank_collector.num_blanks() * cost_per_blank;
         if cost >= saved_bits {
-            // Continue here, since although we walk over the deltas by size,
-            // we can potentially save a lot at the last bits, which are smaller deltas
+            // Continue here, since although we walk over the blanks by size,
+            // we can potentially save a lot at the last bits, which are smaller blanks
             //
             // E.g. if the first range reduces the compact space by 1000 from 2000 to 1000, which
             // saves 11-10=1 bit and the next range reduces the compact space by 950 to
             // 50, which saves 10-6=4 bits
             continue;
         }

         amplitude_compact_space = amplitude_new_compact_space;
         amplitude_bits = amplitude_new_bits;
-        for pos in blank_collector
-            .drain()
-            .map(|blank_and_pos| blank_and_pos.pos)
-        {
-            let blank_end = values_sorted[pos] - 1;
-            let blank_start = if pos == 0 {
-                0
-            } else {
-                values_sorted[pos - 1] + 1
-            };
-            compact_space.add_blank(blank_start..=blank_end);
-        }
+        compact_space.add_blanks(blank_collector.drain().map(|blank| blank.blank_range));
     }
-    if max_val_incl_null != u128::MAX {
-        compact_space.add_blank(max_val_incl_null..=u128::MAX);
-    }

+    // Special case: we didn't collect any blanks because
+    // * the data is empty
+    // * the algorithm decided it's not worth the cost, which can be the case for single values
+    //
+    // We drain one collected blank unconditionally, so the empty case is reserved for empty
+    // data: an empty compact_space then means the data is empty and no data is covered
+    // (conversely to all data) and we can assign null to it.
+    if compact_space.is_empty() {
+        compact_space.add_blanks(
+            blank_collector
+                .drain()
+                .map(|blank| blank.blank_range)
+                .take(1),
+        );
+    }

     compact_space.finish()
 }

 #[derive(Debug, Clone, Eq, PartialEq)]
 struct CompactSpaceBuilder {
-    covered_space: Vec<RangeInclusive<u128>>,
+    blanks: Vec<RangeInclusive<u128>>,
 }

 impl CompactSpaceBuilder {
     /// Creates a new compact space builder which will initially cover the whole space.
     fn new() -> Self {
-        Self {
-            covered_space: vec![0..=u128::MAX],
-        }
+        Self { blanks: vec![] }
     }

-    /// Will extend the first range and assign the null value to it.
-    fn assign_and_return_null(&mut self) -> u128 {
-        self.covered_space[0] = *self.covered_space[0].start()..=*self.covered_space[0].end() + 1;
-        *self.covered_space[0].end()
-    }
+    /// Assumes that repeated add_blank calls don't overlap and are not adjacent,
+    /// e.g. [3..=5, 5..=10] is not allowed
+    ///
+    /// Both of those assumptions are true when blanks are produced from sorted values.
+    fn add_blanks(&mut self, blank: impl Iterator<Item = RangeInclusive<u128>>) {
+        self.blanks.extend(blank);
+    }

+    fn is_empty(&self) -> bool {
+        self.blanks.is_empty()
+    }
+
+    /// Convert blanks to covered space and assign null value
     fn finish(mut self) -> CompactSpace {
+        // sort by start since ranges are not allowed to overlap
+        self.blanks.sort_by_key(|blank| *blank.start());
+
+        // Between the blanks
+        let mut covered_space = self
+            .blanks
+            .windows(2)
+            .map(|blanks| {
+                assert!(
+                    blanks[0].end() < blanks[1].start(),
+                    "overlapping or adjacent ranges detected"
+                );
+                *blanks[0].end() + 1..=*blanks[1].start() - 1
+            })
+            .collect::<Vec<_>>();
+
+        // Outside the blanks
+        if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start) {
+            if *first_blank_start != 0 {
+                covered_space.insert(0, 0..=first_blank_start - 1);
+            }
+        }
+
+        if let Some(last_blank_end) = self.blanks.last().map(RangeInclusive::end) {
+            if *last_blank_end != u128::MAX {
+                covered_space.push(last_blank_end + 1..=u128::MAX);
+            }
+        }
+
+        // Extend the first range and assign the null value to it.
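+        // e.g. a covered space [0..=10, 100..=200] becomes [0..=11, 100..=200]
+        // with 11 as the null value.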
+        let null_value = if let Some(first_covered_space) = covered_space.first_mut() {
+            // in case the first covered space ends at u128::MAX, assign null to the beginning
+            if *first_covered_space.end() == u128::MAX {
+                *first_covered_space = first_covered_space.start() - 1..=*first_covered_space.end();
+                *first_covered_space.start()
+            } else {
+                *first_covered_space = *first_covered_space.start()..=first_covered_space.end() + 1;
+                *first_covered_space.end()
+            }
+        } else {
+            covered_space.push(0..=0); // empty data case
+            0u128
+        };

         let mut compact_start: u64 = 0;
-        let mut ranges_and_compact_start = vec![];
-        for cov in self.covered_space {
-            let covered_range_len = cov.end() - cov.start();
+        let mut ranges_and_compact_start = Vec::with_capacity(covered_space.len());
+        for cov in covered_space {
+            let covered_range_len = cov.end() - cov.start() + 1; // e.g. 0..=1 covers 2 values (1 - 0 + 1)
             ranges_and_compact_start.push((cov, compact_start));
-            compact_start += covered_range_len as u64 + 1;
+            compact_start += covered_range_len as u64;
         }
         CompactSpace {
             ranges_and_compact_start,
             null_value,
         }
     }
 }

 #[derive(Debug, Clone, Eq, PartialEq)]
 struct CompactSpace {
     ranges_and_compact_start: Vec<(RangeInclusive<u128>, u64)>,
     pub null_value: u128,
 }

 impl BinarySerializable for CompactSpace {
     fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
         VIntU128(self.null_value).serialize(writer)?;
         VIntU128(self.ranges_and_compact_start.len() as u128).serialize(writer)?;

         let mut prev_value = 0;
         for (value_range, _compact) in &self.ranges_and_compact_start {
-            let delta = value_range.start() - prev_value;
-            VIntU128(delta).serialize(writer)?;
+            let blank_delta_start = value_range.start() - prev_value;
+            VIntU128(blank_delta_start).serialize(writer)?;
             prev_value = *value_range.start();

-            let delta = value_range.end() - prev_value;
-            VIntU128(delta).serialize(writer)?;
+            let blank_delta_end = value_range.end() - prev_value;
+            VIntU128(blank_delta_end).serialize(writer)?;
             prev_value = *value_range.end();
         }

         Ok(())
     }

     fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
         let null_value = VIntU128::deserialize(reader)?.0;
         let num_values = VIntU128::deserialize(reader)?.0;
         let mut ranges_and_compact_start: Vec<(RangeInclusive<u128>, u64)> = vec![];
         let mut value = 0u128;
         let mut compact = 0u64;
         for _ in 0..num_values {
-            let delta = VIntU128::deserialize(reader)?.0;
-            value += delta;
-            let value_start = value;
+            let blank_delta_start = VIntU128::deserialize(reader)?.0;
+            value += blank_delta_start;
+            let blank_start = value;

-            let delta = VIntU128::deserialize(reader)?.0;
-            value += delta;
-            let value_end = value;
+            let blank_delta_end = VIntU128::deserialize(reader)?.0;
+            value += blank_delta_end;
+            let blank_end = value;

-            let compact_delta = value_end - value_start + 1;
+            let compact_delta = blank_end - blank_start + 1;

-            ranges_and_compact_start.push((value_start..=value_end, compact));
+            ranges_and_compact_start.push((blank_start..=blank_end, compact));
             compact += compact_delta as u64;
         }

         Ok(Self {
             null_value,
             ranges_and_compact_start,
         })
     }
 }

     /// Returns either Ok(the value in the compact space) or if it is outside the compact space the
-    /// Err(position on the next larger range above the value)
+    /// Err(position where it would be inserted)
     fn to_compact(&self, value: u128) -> Result<u64, usize> {
         self.ranges_and_compact_start
             .binary_search_by(|probe| {
                 let value_range = &probe.0;
-                if *value_range.start() <= value && *value_range.end() >= value {
+                if value_range.contains(&value) {
                     return Ordering::Equal;
                 } else if value < *value_range.start() {
                     return Ordering::Greater;
                 }
@@ -308,7 +369,6 @@
             .map(|pos| {
                 let (range, compact_start) = &self.ranges_and_compact_start[pos];
                 compact_start + (value - range.start()) as u64
             })
-            .map_err(|pos| pos - 1)
     }

     /// Unpacks a value from compact space u64 to u128 space
     fn unpack(&self, compact: u64) -> u128 {
         let pos = self
             .ranges_and_compact_start
             .binary_search_by_key(&compact, |probe| probe.1)
             .map_or_else(|e| e - 1, |v| v);
@@ -347,12 +407,17 @@ impl CompactSpaceCompressor {
     /// Taking the vals as Vec may cost a lot of memory.
     /// It is used to sort the vals.
     ///
-    /// Less memory alternative: We could just store the index (u32), and use that as sorting.
+    /// The lower-memory alternative of just storing the index (u32) and using that for sorting
+    /// may be an issue for the merge case, where random access is more expensive.
     ///
     /// TODO: Should we take Option here? (better api, but 24 bytes instead of 16 per element)
     pub fn train_from(mut vals: Vec<u128>) -> Self {
+        let total_num_values = vals.len(); // TODO: Null values should be here too
         vals.sort();
-        train(&vals)
+        // We don't care for duplicates
+        vals.dedup();
+        vals.shrink_to_fit();
+        train(&vals, total_num_values)
     }

     fn to_compact(&self, value: u128) -> u64 {
         self.params.compact_space.to_compact(value).unwrap()
@@ -375,7 +440,7 @@ impl CompactSpaceCompressor {
         self.compress_into(vals, &mut output)?;
         Ok(output)
     }
-    /// TODO: Should we take Option here? Other wise the caller has to replace None with
+    /// TODO: Should we take Option here? Otherwise the caller has to replace None with
     /// `self.null_value()`
     pub fn compress_into(
         self,
@@ -397,8 +462,8 @@ impl CompactSpaceCompressor {
     }
 }

-fn train(values_sorted: &[u128]) -> CompactSpaceCompressor {
-    let compact_space = get_compact_space(values_sorted, INTERVAL_COST_IN_BITS);
+fn train(values_sorted: &[u128], total_num_values: usize) -> CompactSpaceCompressor {
+    let compact_space = get_compact_space(values_sorted, total_num_values, COST_PER_BLANK_IN_BITS);
     let null_value = compact_space.null_value;
     let null_compact_space = compact_space
         .to_compact(null_value)
@@ -427,10 +492,7 @@ fn train(values_sorted: &[u128]) -> CompactSpaceCompressor {
     };

     let max_value = *values_sorted.last().unwrap_or(&0u128).max(&null_value);
-    assert_eq!(
-        compressor.to_compact(max_value) + 1,
-        amplitude_compact_space as u64
-    );
+    assert!(compressor.to_compact(max_value) < amplitude_compact_space as u64);
     compressor
 }

@@ -542,28 +604,37 @@ impl CompactSpaceDecompressor {
     /// Comparing on compact space: 1.2 GElements/s
     ///
     /// Comparing on original space: .06 GElements/s (not completely optimized)
-    pub fn get_range(&self, range: RangeInclusive<u128>) -> Vec<u64> {
+    pub fn get_range(&self, mut range: RangeInclusive<u128>) -> Vec<u64> {
+        if range.start() > range.end() {
+            range = *range.end()..=*range.start();
+        }
         let from_value = *range.start();
         let to_value = *range.end();
         assert!(to_value >= from_value);
         let compact_from = self.to_compact(from_value);
         let compact_to = self.to_compact(to_value);
+
         // Quick return, if both ranges fall into the same non-mapped space, the range can't cover
         // any values, so we can early exit
         match (compact_to, compact_from) {
-            (Err(pos1), Err(pos2)) if pos1 == pos2 => return vec![],
+            (Err(pos1), Err(pos2)) if pos1 == pos2 => return Vec::new(),
             _ => {}
         }

         let compact_from = compact_from.unwrap_or_else(|pos| {
+            // Correctness: Out of bounds, if this value is Err(last_index + 1), we early exit,
+            // since the to_value also maps into the same non-mapped space
             let range_and_compact_start =
                 self.params.compact_space.get_range_and_compact_start(pos);
-            let compact_end = range_and_compact_start.1
-                + (range_and_compact_start.0.end() - range_and_compact_start.0.start()) as u64;
-            compact_end + 1
+            range_and_compact_start.1
         });
         // If there is no compact space, we go to the closest upper-bound compact space
         let compact_to = compact_to.unwrap_or_else(|pos| {
+            // Correctness: Overflow, if this value is Err(0), we early exit,
+            // since the from_value also maps into the same non-mapped space
+
+            // get end of previous range
+            let pos = pos - 1;
             let range_and_compact_start =
                 self.params.compact_space.get_range_and_compact_start(pos);
             let compact_end = range_and_compact_start.1
                 + (range_and_compact_start.0.end() - range_and_compact_start.0.start()) as u64;
             compact_end
         });

         let range = compact_from..=compact_to;
-        let mut positions = vec![];
+        let mut positions = Vec::new();

         for (pos, compact_value) in self
             .iter_compact()
             .enumerate()
             .filter(|(_pos, val)| *val != self.params.null_value_compact_space)
         {
             if range.contains(&compact_value) {
                 positions.push(pos as u64);
             }
         }

         positions
     }
@@ -631,36 +702,34 @@ mod tests {

     #[test]
     fn test_binary_heap_pop_order() {
-        let mut deltas: BinaryHeap<BlankSizeAndPos> = BinaryHeap::new();
-        deltas.push(BlankSizeAndPos {
-            blank_size: 10,
-            pos: 1,
+        let mut blanks: BinaryHeap<BlankRange> = BinaryHeap::new();
+        blanks.push(BlankRange {
+            blank_range: 0..=10,
         });
-        deltas.push(BlankSizeAndPos {
-            blank_size: 100,
-            pos: 10,
+        blanks.push(BlankRange {
+            blank_range: 100..=200,
         });
-        deltas.push(BlankSizeAndPos {
-            blank_size: 1,
-            pos: 10,
+        blanks.push(BlankRange {
+            blank_range: 100..=110,
         });
-        assert_eq!(deltas.pop().unwrap().blank_size, 100);
-        assert_eq!(deltas.pop().unwrap().blank_size, 10);
+        assert_eq!(blanks.pop().unwrap().blank_size(), 100);
+        assert_eq!(blanks.pop().unwrap().blank_size(), 10);
     }

     #[test]
     fn compact_space_test() {
-        let ips = vec![
+        let ips = &[
             2u128, 4u128, 1000, 1001, 1002, 1003, 1004, 1005, 1008, 1010, 1012, 1260,
         ];
-        let compact_space = get_compact_space(&ips, 11);
+        let compact_space = get_compact_space(ips, ips.len(), 11);
         assert_eq!(compact_space.null_value, 5);
         let amplitude = compact_space.amplitude_compact_space();
         assert_eq!(amplitude, 20);
         assert_eq!(2, compact_space.to_compact(2).unwrap());
-        assert_eq!(compact_space.to_compact(100).unwrap_err(), 0);
+        assert_eq!(3, compact_space.to_compact(3).unwrap());
+        assert_eq!(compact_space.to_compact(100).unwrap_err(), 1);

-        let mut output = vec![];
+        let mut output: Vec<u8> = Vec::new();
         compact_space.serialize(&mut output).unwrap();

         assert_eq!(
             compact_space,
             CompactSpace::deserialize(&mut &output[..]).unwrap()
         );

-        for ip in &ips {
+        for ip in ips {
             let compact = compact_space.to_compact(*ip).unwrap();
             assert_eq!(compact_space.unpack(compact), *ip);
         }
     }

+    #[test]
+    fn compact_space_amplitude_test() {
+        let ips = &[100000u128, 1000000];
+        let compact_space = get_compact_space(ips, ips.len(), 1);
+        assert_eq!(compact_space.null_value, 100001);
+        let amplitude = compact_space.amplitude_compact_space();
+        assert_eq!(amplitude, 3);
+    }
+
     fn decode_all(data: OwnedBytes) -> Vec<u128> {
         let decompressor = CompactSpaceDecompressor::open(data).unwrap();
         let mut u128_vals = Vec::new();
@@ -717,6 +795,7 @@ mod tests {
         let positions = decomp.get_range(0..=3);
         assert_eq!(positions, vec![0, 2]);
         assert_eq!(decomp.get_range(99999u128..=99999u128), vec![3]);
+        assert_eq!(decomp.get_range(99999u128..=100000u128), vec![3, 4]);
         assert_eq!(decomp.get_range(99998u128..=100000u128), vec![3, 4]);
         assert_eq!(decomp.get_range(99998u128..=99999u128), vec![3]);
         assert_eq!(decomp.get_range(99998u128..=99998u128), vec![]);
@@ -770,6 +849,64 @@ mod tests {
         assert_eq!(positions, vec![]);
         let positions = decomp.get_range(2..=2);
         assert_eq!(positions, vec![1]);
+
+        let positions = decomp.get_range(2..=3);
+        assert_eq!(positions, vec![1]);
+
+        let positions = decomp.get_range(1..=3);
+        assert_eq!(positions, vec![1]);
+
+        let positions = decomp.get_range(2..=3);
+        assert_eq!(positions, vec![1]);
+
+        let positions = decomp.get_range(3..=3);
+        assert_eq!(positions, vec![]);
+    }
+
+    #[test]
+    fn test_range_3() {
+        let vals = &[
+            200u128,
+            201,
+            202,
+            203,
+            204,
+            204,
+            206,
+            207,
+            208,
+            209,
+            210,
+            1_000_000,
+            5_000_000_000,
+        ];
+        let compressor =
CompactSpaceCompressor::train_from(vals.to_vec()); + // let vals = vec![compressor.null_value(), 2u128]; + let data = compressor.compress(vals.iter().cloned()).unwrap(); + let decomp = CompactSpaceDecompressor::open(OwnedBytes::new(data)).unwrap(); + + assert_eq!(decomp.get_range(199..=200), vec![0]); + assert_eq!(decomp.get_range(199..=201), vec![0, 1]); + assert_eq!(decomp.get_range(200..=200), vec![0]); + assert_eq!(decomp.get_range(1_000_000..=1_000_000), vec![11]); + } + + #[test] + fn test_bug1() { + let vals = &[9223372036854775806]; + let _data = test_aux_vals(vals); + } + + #[test] + fn test_bug2() { + let vals = &[340282366920938463463374607431768211455u128]; + let _data = test_aux_vals(vals); + } + + #[test] + fn test_bug3() { + let vals = &[340282366920938463463374607431768211454]; + let _data = test_aux_vals(vals); } #[test] @@ -779,10 +916,20 @@ mod tests { } use proptest::prelude::*; + fn num_strategy() -> impl Strategy { + prop_oneof![ + 1 => prop::num::u128::ANY.prop_map(|num| u128::MAX - (num % 10) ), + 1 => prop::num::u128::ANY.prop_map(|num| i64::MAX as u128 + 5 - (num % 10) ), + 1 => prop::num::u128::ANY.prop_map(|num| i128::MAX as u128 + 5 - (num % 10) ), + 1 => prop::num::u128::ANY.prop_map(|num| num % 10 ), + 20 => prop::num::u128::ANY, + ] + } + proptest! { #[test] - fn compress_decompress_random(vals in proptest::collection::vec(any::() + fn compress_decompress_random(vals in proptest::collection::vec(num_strategy() , 1..1000)) { let _data = test_aux_vals(&vals); } diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs index 082d2c4bc..439d9ecc6 100644 --- a/fastfield_codecs/src/main.rs +++ b/fastfield_codecs/src/main.rs @@ -1,9 +1,138 @@ #[macro_use] extern crate prettytable; -use fastfield_codecs::{Column, FastFieldCodecType, VecColumn}; +use std::collections::HashSet; +use std::env; +use std::io::BufRead; +use std::net::{IpAddr, Ipv6Addr}; +use std::str::FromStr; + +use fastfield_codecs::{ + Column, CompactSpaceCompressor, FastFieldCodecType, FastFieldStats, VecColumn, +}; +use itertools::Itertools; use prettytable::{Cell, Row, Table}; +fn print_set_stats(ip_addrs: &[u128]) { + println!("NumIps\t{}", ip_addrs.len()); + let ip_addr_set: HashSet = ip_addrs.iter().cloned().collect(); + println!("NumUniqueIps\t{}", ip_addr_set.len()); + let ratio_unique = ip_addr_set.len() as f64 / ip_addrs.len() as f64; + println!("RatioUniqueOverTotal\t{ratio_unique:.4}"); + + // histogram + let mut ip_addrs = ip_addrs.to_vec(); + ip_addrs.sort(); + let mut cnts: Vec = ip_addrs + .into_iter() + .dedup_with_count() + .map(|(cnt, _)| cnt) + .collect(); + cnts.sort(); + + let top_256_cnt: usize = cnts.iter().rev().take(256).sum(); + let top_128_cnt: usize = cnts.iter().rev().take(128).sum(); + let top_64_cnt: usize = cnts.iter().rev().take(64).sum(); + let top_8_cnt: usize = cnts.iter().rev().take(8).sum(); + let total: usize = cnts.iter().sum(); + + println!("{}", total); + println!("{}", top_256_cnt); + println!("{}", top_128_cnt); + println!("Percentage Top8 {:02}", top_8_cnt as f32 / total as f32); + println!("Percentage Top64 {:02}", top_64_cnt as f32 / total as f32); + println!("Percentage Top128 {:02}", top_128_cnt as f32 / total as f32); + println!("Percentage Top256 {:02}", top_256_cnt as f32 / total as f32); + + let mut cnts: Vec<(usize, usize)> = cnts.into_iter().dedup_with_count().collect(); + cnts.sort_by(|a, b| { + if a.1 == b.1 { + a.0.cmp(&b.0) + } else { + b.1.cmp(&a.1) + } + }); + + // println!("\n\n----\nIP Address histogram"); + // 
println!("IPAddrCount\tFrequency"); + // for (ip_addr_count, times) in cnts { + // println!("{}\t{}", ip_addr_count, times); + //} +} + +fn ip_dataset() -> Vec { + let mut ip_addr_v4 = 0; + + let stdin = std::io::stdin(); + let ip_addrs: Vec = stdin + .lock() + .lines() + .flat_map(|line| { + let line = line.unwrap(); + let line = line.trim(); + let ip_addr = IpAddr::from_str(line.trim()).ok()?; + if ip_addr.is_ipv4() { + ip_addr_v4 += 1; + } + let ip_addr_v6: Ipv6Addr = match ip_addr { + IpAddr::V4(v4) => v4.to_ipv6_mapped(), + IpAddr::V6(v6) => v6, + }; + Some(ip_addr_v6) + }) + .map(|ip_v6| u128::from_be_bytes(ip_v6.octets())) + .collect(); + + println!("IpAddrsAny\t{}", ip_addrs.len()); + println!("IpAddrsV4\t{}", ip_addr_v4); + + ip_addrs +} + +fn bench_ip() { + let dataset = ip_dataset(); + print_set_stats(&dataset); + + // Chunks + { + let mut data = vec![]; + for dataset in dataset.chunks(50_000) { + let compressor = CompactSpaceCompressor::train_from(dataset.to_vec()); + compressor + .compress_into(dataset.iter().cloned(), &mut data) + .unwrap(); + } + let compression = data.len() as f64 / (dataset.len() * 16) as f64; + println!("Compression 50_000 chunks {:.4}", compression); + println!( + "Num Bits per elem {:.2}", + (data.len() * 8) as f32 / dataset.len() as f32 + ); + } + + let compressor = CompactSpaceCompressor::train_from(dataset.to_vec()); + let data = compressor.compress(dataset.iter().cloned()).unwrap(); + + let compression = data.len() as f64 / (dataset.len() * 16) as f64; + println!("Compression {:.2}", compression); + println!( + "Num Bits per elem {:.2}", + (data.len() * 8) as f32 / dataset.len() as f32 + ); + + // let decompressor = CompactSpaceDecompressor::open(OwnedBytes::new(data)).unwrap(); + // for i in 11100..11150 { + // print_time!("get range"); + // let doc_values = decompressor.get_range(dataset[i]..=dataset[i]); + // println!("{:?}", doc_values.len()); + //} +} + fn main() { + if env::args().nth(1).unwrap() == "bench_ip" { + bench_ip(); + return; + } + let mut table = Table::new(); // Add a row per time From 762e662bfd2c7224a8e30e18fbe61f5b31ee9f42 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 12 Sep 2022 13:00:25 +0800 Subject: [PATCH 04/26] extend proptest for get_range --- fastfield_codecs/src/compact_space.rs | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/fastfield_codecs/src/compact_space.rs b/fastfield_codecs/src/compact_space.rs index 091c01771..f62b99322 100644 --- a/fastfield_codecs/src/compact_space.rs +++ b/fastfield_codecs/src/compact_space.rs @@ -752,24 +752,29 @@ mod tests { assert_eq!(amplitude, 3); } - fn decode_all(data: OwnedBytes) -> Vec { + fn test_all(data: OwnedBytes, expected: &[u128]) { let decompressor = CompactSpaceDecompressor::open(data).unwrap(); - let mut u128_vals = Vec::new(); for idx in 0..decompressor.params.num_vals as usize { + let expected_val = expected[idx]; let val = decompressor.get(idx as u64); - if let Some(val) = val { - u128_vals.push(val); - } + assert_eq!(val, Some(expected_val)); + let positions = decompressor.get_range(expected_val.saturating_sub(1)..=expected_val); + assert!(positions.contains(&(idx as u64))); + let positions = decompressor.get_range(expected_val..=expected_val); + assert!(positions.contains(&(idx as u64))); + let positions = decompressor.get_range(expected_val..=expected_val.saturating_add(1)); + assert!(positions.contains(&(idx as u64))); + let positions = decompressor + 
.get_range(expected_val.saturating_sub(1)..=expected_val.saturating_add(1)); + assert!(positions.contains(&(idx as u64))); } - u128_vals } fn test_aux_vals(u128_vals: &[u128]) -> OwnedBytes { let compressor = CompactSpaceCompressor::train_from(u128_vals.to_vec()); let data = compressor.compress(u128_vals.iter().cloned()).unwrap(); let data = OwnedBytes::new(data); - let decoded_val = decode_all(data.clone()); - assert_eq!(&decoded_val, u128_vals); + test_all(data.clone(), u128_vals); data } @@ -927,6 +932,7 @@ mod tests { } proptest! { + #![proptest_config(ProptestConfig::with_cases(10))] #[test] fn compress_decompress_random(vals in proptest::collection::vec(num_strategy() From df32ee2df288125c47d3629fa7ada485baecb67c Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 12 Sep 2022 15:10:40 +0800 Subject: [PATCH 05/26] refactor, use BTreeSet for sorted deduped values --- fastfield_codecs/Cargo.toml | 2 +- fastfield_codecs/src/compact_space.rs | 348 +++--------------- .../src/compact_space/blank_range.rs | 42 +++ .../src/compact_space/build_compact_space.rs | 237 ++++++++++++ fastfield_codecs/src/main.rs | 35 +- 5 files changed, 342 insertions(+), 322 deletions(-) create mode 100644 fastfield_codecs/src/compact_space/blank_range.rs create mode 100644 fastfield_codecs/src/compact_space/build_compact_space.rs diff --git a/fastfield_codecs/Cargo.toml b/fastfield_codecs/Cargo.toml index f66f91c96..2ee7e0093 100644 --- a/fastfield_codecs/Cargo.toml +++ b/fastfield_codecs/Cargo.toml @@ -16,7 +16,7 @@ prettytable-rs = {version="0.9.0", optional= true} rand = {version="0.8.3", optional= true} fastdivide = "0.4" log = "0.4" -itertools = { version="0.10.3", optional=true} +itertools = { version = "0.10.3", optional = true } measure_time = { version="0.8.2", optional=true} [dev-dependencies] diff --git a/fastfield_codecs/src/compact_space.rs b/fastfield_codecs/src/compact_space.rs index f62b99322..c8184913d 100644 --- a/fastfield_codecs/src/compact_space.rs +++ b/fastfield_codecs/src/compact_space.rs @@ -12,8 +12,7 @@ /// The codec is created to compress ip addresses, but may be employed in other use cases. use std::{ cmp::Ordering, - collections::BinaryHeap, - convert::{TryFrom, TryInto}, + collections::BTreeSet, io::{self, Write}, net::{IpAddr, Ipv6Addr}, ops::RangeInclusive, @@ -24,6 +23,10 @@ use ownedbytes::OwnedBytes; use tantivy_bitpacker::{self, BitPacker, BitUnpacker}; use crate::column::{ColumnV2, ColumnV2Ext}; +use crate::compact_space::build_compact_space::get_compact_space; + +mod blank_range; +mod build_compact_space; pub fn ip_to_u128(ip_addr: IpAddr) -> u128 { let ip_addr_v6: Ipv6Addr = match ip_addr { @@ -39,256 +42,8 @@ pub fn ip_to_u128(ip_addr: IpAddr) -> u128 { /// The number is taken by looking at a real dataset. It is optimized for larger datasets. const COST_PER_BLANK_IN_BITS: usize = 36; -/// The range of a blank in value space. -/// -/// A blank is an unoccupied space in the data. -/// Ordered by size -/// -/// Use the try_into(), invalid ranges will be rejected. 
-/// -/// TODO: move to own module to force try_into -#[derive(Debug, Eq, PartialEq, Clone)] -struct BlankRange { - blank_range: RangeInclusive, -} -impl TryFrom> for BlankRange { - type Error = &'static str; - fn try_from(range: RangeInclusive) -> Result { - let blank_size = range.end().saturating_sub(*range.start()); - if blank_size < 2 { - Err("invalid range") - } else { - Ok(BlankRange { blank_range: range }) - } - } -} -impl BlankRange { - fn blank_size(&self) -> u128 { - self.blank_range.end() - self.blank_range.start() - } -} - -impl Ord for BlankRange { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - self.blank_size().cmp(&other.blank_size()) - } -} -impl PartialOrd for BlankRange { - fn partial_cmp(&self, other: &Self) -> Option { - self.blank_size().partial_cmp(&other.blank_size()) - } -} - -/// Put the blanks for the sorted values into a binary heap -fn get_blanks(values_sorted: &[u128]) -> BinaryHeap { - let mut blanks: BinaryHeap = BinaryHeap::new(); - let mut add_range = |blank_range: RangeInclusive| { - let blank_range: Result = blank_range.try_into(); - if let Ok(blank_range) = blank_range { - blanks.push(blank_range); - } - }; - for values in values_sorted.windows(2) { - let blank_range = values[0] + 1..=values[1] - 1; - add_range(blank_range); - } - if let Some(first_val) = values_sorted.first().filter(|first_val| **first_val != 0) { - let blank_range = 0..=first_val - 1; - add_range(blank_range); - } - - if let Some(last_val) = values_sorted - .last() - .filter(|last_val| **last_val != u128::MAX) - { - let blank_range = last_val + 1..=u128::MAX; - add_range(blank_range); - } - blanks -} - -struct BlankCollector { - blanks: Vec, - staged_blanks_sum: u128, -} -impl BlankCollector { - fn new() -> Self { - Self { - blanks: vec![], - staged_blanks_sum: 0, - } - } - fn stage_blank(&mut self, blank: BlankRange) { - self.staged_blanks_sum += blank.blank_size(); - self.blanks.push(blank); - } - fn drain(&mut self) -> std::vec::Drain<'_, BlankRange> { - self.staged_blanks_sum = 0; - self.blanks.drain(..) - } - fn staged_blanks_sum(&self) -> u128 { - self.staged_blanks_sum - } - fn num_blanks(&self) -> usize { - self.blanks.len() - } -} -fn num_bits(val: u128) -> u8 { - (128u32 - val.leading_zeros()) as u8 -} - -/// Will collect blanks and add them to compact space if more bits are saved than cost from -/// metadata. -fn get_compact_space( - values_deduped_sorted: &[u128], - total_num_values: usize, - cost_per_blank: usize, -) -> CompactSpace { - let mut blanks = get_blanks(values_deduped_sorted); - let mut amplitude_compact_space = u128::MAX; - let mut amplitude_bits: u8 = num_bits(amplitude_compact_space); - - let mut compact_space = CompactSpaceBuilder::new(); - if values_deduped_sorted.is_empty() { - return compact_space.finish(); - } - - let mut blank_collector = BlankCollector::new(); - // We will stage blanks until they reduce the compact space by 1 bit. 
- // Binary heap to process the gaps by their size - while let Some(blank_range) = blanks.pop() { - blank_collector.stage_blank(blank_range); - - let staged_spaces_sum: u128 = blank_collector.staged_blanks_sum(); - // +1 for later added null value - let amplitude_new_compact_space = amplitude_compact_space - staged_spaces_sum + 1; - let amplitude_new_bits = num_bits(amplitude_new_compact_space); - if amplitude_bits == amplitude_new_bits { - continue; - } - let saved_bits = (amplitude_bits - amplitude_new_bits) as usize * total_num_values; - // TODO: Maybe calculate exact cost of blanks and run this more expensive computation only, - // when amplitude_new_bits changes - let cost = blank_collector.num_blanks() * cost_per_blank; - if cost >= saved_bits { - // Continue here, since although we walk over the blanks by size, - // we can potentially save a lot at the last bits, which are smaller blanks - // - // E.g. if the first range reduces the compact space by 1000 from 2000 to 1000, which - // saves 11-10=1 bit and the next range reduces the compact space by 950 to - // 50, which saves 10-6=4 bit - continue; - } - - amplitude_compact_space = amplitude_new_compact_space; - amplitude_bits = amplitude_new_bits; - compact_space.add_blanks(blank_collector.drain().map(|blank| blank.blank_range)); - } - - // special case, when we don't collected any blanks because: - // * the data is empty - // * the algorithm did decide it's not worth the cost, which can be the case for single values - // - // We drain one collected blank unconditionally, so the empty case is reserved for empty - // data, and therefore empty compact_space means the data is empty and no data is covered - // (conversely to all data) and we can assign null to it. - if compact_space.is_empty() { - compact_space.add_blanks( - blank_collector - .drain() - .map(|blank| blank.blank_range) - .take(1), - ); - } - - compact_space.finish() -} - #[derive(Debug, Clone, Eq, PartialEq)] -struct CompactSpaceBuilder { - blanks: Vec>, -} - -impl CompactSpaceBuilder { - /// Creates a new compact space builder which will initially cover the whole space. - fn new() -> Self { - Self { blanks: vec![] } - } - - /// Assumes that repeated add_blank calls don't overlap and are not adjacent, - /// e.g. [3..=5, 5..=10] is not allowed - /// - /// Both of those assumptions are true when blanks are produced from sorted values. - fn add_blanks(&mut self, blank: impl Iterator>) { - self.blanks.extend(blank); - } - - fn is_empty(&self) -> bool { - self.blanks.is_empty() - } - - /// Convert blanks to covered space and assign null value - fn finish(mut self) -> CompactSpace { - // sort by start since ranges are not allowed to overlap - self.blanks.sort_by_key(|blank| *blank.start()); - - // Between the blanks - let mut covered_space = self - .blanks - .windows(2) - .map(|blanks| { - assert!( - blanks[0].end() < blanks[1].start(), - "overlapping or adjacent ranges detected" - ); - *blanks[0].end() + 1..=*blanks[1].start() - 1 - }) - .collect::>(); - - // Outside the blanks - if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start) { - if *first_blank_start != 0 { - covered_space.insert(0, 0..=first_blank_start - 1); - } - } - - if let Some(last_blank_end) = self.blanks.last().map(RangeInclusive::end) { - if *last_blank_end != u128::MAX { - covered_space.push(last_blank_end + 1..=u128::MAX); - } - } - - // Extend the first range and assign the null value to it. 
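The sentinel assignment below (moved verbatim into `build_compact_space.rs` by this patch) is easiest to see with concrete numbers. A minimal standalone sketch of the same branch logic, with made-up covered ranges and a hypothetical helper name:

use std::ops::RangeInclusive;

// Widen the first covered range by one value; the extra value becomes the null marker.
fn assign_null(covered_space: &mut Vec<RangeInclusive<u128>>) -> u128 {
    if let Some(first) = covered_space.first_mut() {
        if *first.end() == u128::MAX {
            // Cannot extend to the right, so extend to the left instead.
            *first = first.start() - 1..=*first.end();
            *first.start()
        } else {
            *first = *first.start()..=first.end() + 1;
            *first.end()
        }
    } else {
        covered_space.push(0..=0); // empty data: 0 becomes the null value
        0
    }
}

fn main() {
    let mut covered_space = vec![10u128..=20, 400..=410];
    assert_eq!(assign_null(&mut covered_space), 21); // 10..=20 grew to 10..=21
    assert_eq!(covered_space[0], 10..=21);
}
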
- let null_value = if let Some(first_covered_space) = covered_space.first_mut() { - // in case the first covered space ends at u128::MAX, assign null to the beginning - if *first_covered_space.end() == u128::MAX { - *first_covered_space = first_covered_space.start() - 1..=*first_covered_space.end(); - *first_covered_space.start() - } else { - *first_covered_space = *first_covered_space.start()..=first_covered_space.end() + 1; - *first_covered_space.end() - } - } else { - covered_space.push(0..=0); // empty data case - 0u128 - }; - - let mut compact_start: u64 = 0; - let mut ranges_and_compact_start = Vec::with_capacity(covered_space.len()); - for cov in covered_space { - let covered_range_len = cov.end() - cov.start() + 1; // e.g. 0..=1 covered space 1-0+1= 2 - ranges_and_compact_start.push((cov, compact_start)); - compact_start += covered_range_len as u64; - } - CompactSpace { - ranges_and_compact_start, - null_value, - } - } -} - -#[derive(Debug, Clone, Eq, PartialEq)] -struct CompactSpace { +pub struct CompactSpace { ranges_and_compact_start: Vec<(RangeInclusive, u64)>, pub null_value: u128, } @@ -400,24 +155,19 @@ pub struct IPCodecParams { } impl CompactSpaceCompressor { - pub fn null_value(&self) -> u128 { - self.params.null_value + pub fn null_value_compact_space(&self) -> u64 { + self.params.null_value_compact_space } - /// Taking the vals as Vec may cost a lot of memory. - /// It is used to sort the vals. - /// - /// The lower memory alternative to just store the index (u32) and use that as sorting may be an - /// issue for the merge case, where random access is more expensive. - /// - /// TODO: Should we take Option here? (better api, but 24bytes instead 16 per element) - pub fn train_from(mut vals: Vec) -> Self { - let total_num_values = vals.len(); // TODO: Null values should be here too - vals.sort(); - // We don't care for duplicates - vals.dedup(); - vals.shrink_to_fit(); - train(&vals, total_num_values) + /// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals. + pub fn train_from( + vals: impl Iterator, + total_num_values_incl_nulls: usize, + ) -> Self { + let mut tree = BTreeSet::new(); + tree.extend(vals); + assert!(tree.len() <= total_num_values_incl_nulls); + train(&tree, total_num_values_incl_nulls) } fn to_compact(&self, value: u128) -> u64 { @@ -435,22 +185,25 @@ impl CompactSpaceCompressor { Ok(()) } - pub fn compress(self, vals: impl Iterator) -> io::Result> { + pub fn compress(self, vals: impl Iterator>) -> io::Result> { let mut output = vec![]; self.compress_into(vals, &mut output)?; Ok(output) } - /// TODO: Should we take Option here? 
Otherwise the caller has to replace None with - /// `self.null_value()` + pub fn compress_into( self, - vals: impl Iterator, + vals: impl Iterator>, write: &mut impl Write, ) -> io::Result<()> { let mut bitpacker = BitPacker::default(); let mut num_vals = 0; for val in vals { - let compact = self.to_compact(val); + let compact = if let Some(val) = val { + self.to_compact(val) + } else { + self.null_value_compact_space() + }; bitpacker .write(compact, self.params.num_bits, write) .unwrap(); @@ -462,7 +215,7 @@ impl CompactSpaceCompressor { } } -fn train(values_sorted: &[u128], total_num_values: usize) -> CompactSpaceCompressor { +fn train(values_sorted: &BTreeSet, total_num_values: usize) -> CompactSpaceCompressor { let compact_space = get_compact_space(values_sorted, total_num_values, COST_PER_BLANK_IN_BITS); let null_value = compact_space.null_value; let null_compact_space = compact_space @@ -476,8 +229,8 @@ fn train(values_sorted: &[u128], total_num_values: usize) -> CompactSpaceCompres ); let num_bits = tantivy_bitpacker::compute_num_bits(amplitude_compact_space as u64); - let min_value = *values_sorted.first().unwrap_or(&0); - let max_value = *values_sorted.last().unwrap_or(&0); + let min_value = *values_sorted.iter().next().unwrap_or(&0); + let max_value = *values_sorted.iter().last().unwrap_or(&0); let compressor = CompactSpaceCompressor { params: IPCodecParams { compact_space, @@ -491,8 +244,8 @@ fn train(values_sorted: &[u128], total_num_values: usize) -> CompactSpaceCompres }, }; - let max_value = *values_sorted.last().unwrap_or(&0u128).max(&null_value); - assert!(compressor.to_compact(max_value) < amplitude_compact_space as u64); + let max_value_in_value_space = max_value.max(null_value); + assert!(compressor.to_compact(max_value_in_value_space) < amplitude_compact_space as u64); compressor } @@ -700,27 +453,13 @@ mod tests { use super::*; - #[test] - fn test_binary_heap_pop_order() { - let mut blanks: BinaryHeap = BinaryHeap::new(); - blanks.push(BlankRange { - blank_range: 0..=10, - }); - blanks.push(BlankRange { - blank_range: 100..=200, - }); - blanks.push(BlankRange { - blank_range: 100..=110, - }); - assert_eq!(blanks.pop().unwrap().blank_size(), 100); - assert_eq!(blanks.pop().unwrap().blank_size(), 10); - } - #[test] fn compact_space_test() { let ips = &[ 2u128, 4u128, 1000, 1001, 1002, 1003, 1004, 1005, 1008, 1010, 1012, 1260, - ]; + ] + .into_iter() + .collect(); let compact_space = get_compact_space(ips, ips.len(), 11); assert_eq!(compact_space.null_value, 5); let amplitude = compact_space.amplitude_compact_space(); @@ -745,7 +484,7 @@ mod tests { #[test] fn compact_space_amplitude_test() { - let ips = &[100000u128, 1000000]; + let ips = &[100000u128, 1000000].into_iter().collect(); let compact_space = get_compact_space(ips, ips.len(), 1); assert_eq!(compact_space.null_value, 100001); let amplitude = compact_space.amplitude_compact_space(); @@ -754,8 +493,7 @@ mod tests { fn test_all(data: OwnedBytes, expected: &[u128]) { let decompressor = CompactSpaceDecompressor::open(data).unwrap(); - for idx in 0..decompressor.params.num_vals as usize { - let expected_val = expected[idx]; + for (idx, expected_val) in expected.iter().cloned().enumerate() { let val = decompressor.get(idx as u64); assert_eq!(val, Some(expected_val)); let positions = decompressor.get_range(expected_val.saturating_sub(1)..=expected_val); @@ -771,8 +509,11 @@ mod tests { } fn test_aux_vals(u128_vals: &[u128]) -> OwnedBytes { - let compressor = CompactSpaceCompressor::train_from(u128_vals.to_vec()); - 
let data = compressor.compress(u128_vals.iter().cloned()).unwrap(); + let compressor = + CompactSpaceCompressor::train_from(u128_vals.iter().cloned(), u128_vals.len()); + let data = compressor + .compress(u128_vals.iter().cloned().map(Some)) + .unwrap(); let data = OwnedBytes::new(data); test_all(data.clone(), u128_vals); data @@ -846,8 +587,8 @@ mod tests { #[test] fn test_null() { let vals = &[2u128]; - let compressor = CompactSpaceCompressor::train_from(vals.to_vec()); - let vals = vec![compressor.null_value(), 2u128]; + let compressor = CompactSpaceCompressor::train_from(vals.iter().cloned(), vals.len()); + let vals = vec![None, Some(2u128)]; let data = compressor.compress(vals.iter().cloned()).unwrap(); let decomp = CompactSpaceDecompressor::open(OwnedBytes::new(data)).unwrap(); let positions = decomp.get_range(0..=1); @@ -885,9 +626,8 @@ mod tests { 1_000_000, 5_000_000_000, ]; - let compressor = CompactSpaceCompressor::train_from(vals.to_vec()); - // let vals = vec![compressor.null_value(), 2u128]; - let data = compressor.compress(vals.iter().cloned()).unwrap(); + let compressor = CompactSpaceCompressor::train_from(vals.iter().cloned(), vals.len()); + let data = compressor.compress(vals.iter().cloned().map(Some)).unwrap(); let decomp = CompactSpaceDecompressor::open(OwnedBytes::new(data)).unwrap(); assert_eq!(decomp.get_range(199..=200), vec![0]); diff --git a/fastfield_codecs/src/compact_space/blank_range.rs b/fastfield_codecs/src/compact_space/blank_range.rs new file mode 100644 index 000000000..b68508318 --- /dev/null +++ b/fastfield_codecs/src/compact_space/blank_range.rs @@ -0,0 +1,42 @@ +use std::ops::RangeInclusive; + +/// The range of a blank in value space. +/// +/// A blank is an unoccupied space in the data. +/// Ordered by size +/// +/// Use the try_into(), invalid ranges will be rejected. 
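+///
+/// A sketch of the intended use (hypothetical values):
+///
+/// ```ignore
+/// let too_small: Result<BlankRange, _> = (5u128..=6).try_into();
+/// assert!(too_small.is_err()); // spans only two values, so it is rejected
+/// let blank: BlankRange = (1_000u128..=2_000).try_into().unwrap();
+/// ```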
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub(crate) struct BlankRange {
+    blank_range: RangeInclusive<u128>,
+}
+impl TryFrom<RangeInclusive<u128>> for BlankRange {
+    type Error = &'static str;
+    fn try_from(range: RangeInclusive<u128>) -> Result<Self, Self::Error> {
+        let blank_size = range.end().saturating_sub(*range.start());
+        if blank_size < 2 {
+            Err("invalid range")
+        } else {
+            Ok(BlankRange { blank_range: range })
+        }
+    }
+}
+impl BlankRange {
+    pub(crate) fn blank_size(&self) -> u128 {
+        self.blank_range.end() - self.blank_range.start()
+    }
+    pub(crate) fn blank_range(&self) -> RangeInclusive<u128> {
+        self.blank_range.clone()
+    }
+}
+
+impl Ord for BlankRange {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self.blank_size().cmp(&other.blank_size())
+    }
+}
+impl PartialOrd for BlankRange {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        self.blank_size().partial_cmp(&other.blank_size())
+    }
+}
diff --git a/fastfield_codecs/src/compact_space/build_compact_space.rs b/fastfield_codecs/src/compact_space/build_compact_space.rs
new file mode 100644
index 000000000..b088729d4
--- /dev/null
+++ b/fastfield_codecs/src/compact_space/build_compact_space.rs
@@ -0,0 +1,237 @@
+use std::collections::{BTreeSet, BinaryHeap};
+use std::ops::RangeInclusive;
+
+use itertools::Itertools;
+
+use super::blank_range::BlankRange;
+use super::CompactSpace;
+
+/// Puts the blanks for the sorted values into a binary heap
+fn get_blanks(values_sorted: &BTreeSet<u128>) -> BinaryHeap<BlankRange> {
+    let mut blanks: BinaryHeap<BlankRange> = BinaryHeap::new();
+    let mut add_range = |blank_range: RangeInclusive<u128>| {
+        let blank_range: Result<BlankRange, _> = blank_range.try_into();
+        if let Ok(blank_range) = blank_range {
+            blanks.push(blank_range);
+        }
+    };
+    for (first, second) in values_sorted.iter().tuple_windows() {
+        // Correctness (overflow): the values are deduped and sorted (a BTreeSet property),
+        // so there is always space between two values.
+        let blank_range = first + 1..=second - 1;
+        add_range(blank_range);
+    }
+
+    // Replace after stabilization of https://github.com/rust-lang/rust/issues/62924
+    // Add the preceding range if the values don't start at 0
+    if let Some(first_val) = values_sorted.iter().next() {
+        if *first_val != 0 {
+            let blank_range = 0..=first_val - 1;
+            add_range(blank_range);
+        }
+    }
+
+    // Add the succeeding range if the values don't end at u128::MAX
+    if let Some(last_val) = values_sorted.iter().last() {
+        if *last_val != u128::MAX {
+            let blank_range = last_val + 1..=u128::MAX;
+            add_range(blank_range);
+        }
+    }
+    blanks
+}
+
+struct BlankCollector {
+    blanks: Vec<BlankRange>,
+    staged_blanks_sum: u128,
+}
+impl BlankCollector {
+    fn new() -> Self {
+        Self {
+            blanks: vec![],
+            staged_blanks_sum: 0,
+        }
+    }
+    fn stage_blank(&mut self, blank: BlankRange) {
+        self.staged_blanks_sum += blank.blank_size();
+        self.blanks.push(blank);
+    }
+    fn drain(&mut self) -> std::vec::Drain<'_, BlankRange> {
+        self.staged_blanks_sum = 0;
+        self.blanks.drain(..)
+    }
+    fn staged_blanks_sum(&self) -> u128 {
+        self.staged_blanks_sum
+    }
+    fn num_blanks(&self) -> usize {
+        self.blanks.len()
+    }
+}
+fn num_bits(val: u128) -> u8 {
+    (128u32 - val.leading_zeros()) as u8
+}
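Whether staging a blank pays off is pure bit accounting: the amplitude shrink must save more payload bits across all values than the extra range costs in metadata. A self-contained sketch of that trade-off (the counts are made up; 36 mirrors `COST_PER_BLANK_IN_BITS` from the parent module):

fn num_bits(val: u128) -> u8 {
    (128u32 - val.leading_zeros()) as u8
}

fn main() {
    // Hypothetical: 1_000_000 values; staging 4 blanks shrinks the amplitude
    // from roughly 2^40 to roughly 2^33.
    let total_num_values = 1_000_000usize;
    let saved_bits = (num_bits(1u128 << 40) - num_bits(1u128 << 33)) as usize * total_num_values;
    let cost = 4 * 36; // num_blanks * cost_per_blank
    assert!(saved_bits > cost); // 7_000_000 bits saved vs. 144 bits of metadata
}

+/// Collects blanks and adds them to the compact space if more bits are saved than the
+/// metadata costs.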
+pub fn get_compact_space(
+    values_deduped_sorted: &BTreeSet<u128>,
+    total_num_values: usize,
+    cost_per_blank: usize,
+) -> CompactSpace {
+    let mut blanks = get_blanks(values_deduped_sorted);
+    let mut amplitude_compact_space = u128::MAX;
+    let mut amplitude_bits: u8 = num_bits(amplitude_compact_space);
+
+    let mut compact_space = CompactSpaceBuilder::new();
+    if values_deduped_sorted.is_empty() {
+        return compact_space.finish();
+    }
+
+    let mut blank_collector = BlankCollector::new();
+    // We stage blanks until they reduce the compact space by at least 1 bit.
+    // A binary heap is used to process the gaps by their size.
+    while let Some(blank_range) = blanks.pop() {
+        blank_collector.stage_blank(blank_range);
+
+        let staged_spaces_sum: u128 = blank_collector.staged_blanks_sum();
+        // +1 for the null value added later
+        let amplitude_new_compact_space = amplitude_compact_space - staged_spaces_sum + 1;
+        let amplitude_new_bits = num_bits(amplitude_new_compact_space);
+        if amplitude_bits == amplitude_new_bits {
+            continue;
+        }
+        let saved_bits = (amplitude_bits - amplitude_new_bits) as usize * total_num_values;
+        // TODO: Maybe calculate the exact cost of the blanks and run this more expensive
+        // computation only when amplitude_new_bits changes
+        let cost = blank_collector.num_blanks() * cost_per_blank;
+        if cost >= saved_bits {
+            // Continue here: although we walk over the blanks by size, the smaller blanks
+            // towards the end can still save a lot of bits.
+            //
+            // E.g. if the first range reduces the compact space by 1000 from 2000 to 1000, which
+            // saves 11-10=1 bit, the next range may reduce the compact space by 950 to
+            // 50, which saves 10-6=4 bits
+            continue;
+        }
+
+        amplitude_compact_space = amplitude_new_compact_space;
+        amplitude_bits = amplitude_new_bits;
+        compact_space.add_blanks(blank_collector.drain().map(|blank| blank.blank_range()));
+    }
+
+    // Special case: we did not collect any blanks because
+    // * the data is empty (early exit), or
+    // * the algorithm decided it's not worth the cost, which can be the case for single values.
+    //
+    // We drain one collected blank unconditionally, so the empty case is reserved for empty
+    // data; an empty compact_space therefore means the data is empty, no data is covered
+    // (conversely to all data), and we can assign null to it.
+    if compact_space.is_empty() {
+        compact_space.add_blanks(
+            blank_collector
+                .drain()
+                .map(|blank| blank.blank_range())
+                .take(1),
+        );
+    }
+
+    compact_space.finish()
+}
+
+#[derive(Debug, Clone, Eq, PartialEq)]
+struct CompactSpaceBuilder {
+    blanks: Vec<RangeInclusive<u128>>,
+}
+
+impl CompactSpaceBuilder {
+    /// Creates a new compact space builder which will initially cover the whole space.
+    fn new() -> Self {
+        Self { blanks: vec![] }
+    }
+
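For a sense of the end-to-end result, a sketch in the spirit of the unit tests in the parent module (the values and the cost of 11 are arbitrary):

use std::collections::BTreeSet;

fn demo() {
    let values: BTreeSet<u128> = [2u128, 4, 1000, 1012].into_iter().collect();
    let compact_space = get_compact_space(&values, values.len(), 11);
    // Every input value round-trips through the compact space.
    for value in &values {
        let compact = compact_space.to_compact(*value).unwrap();
        assert_eq!(compact_space.unpack(compact), *value);
    }
}

+    /// Assumes that repeated add_blanks calls don't overlap and are not adjacent,
+    /// e.g. [3..=5, 5..=10] is not allowed.
+    ///
+    /// Both of those assumptions hold when the blanks are produced from sorted values.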
+ fn add_blanks(&mut self, blank: impl Iterator>) { + self.blanks.extend(blank); + } + + fn is_empty(&self) -> bool { + self.blanks.is_empty() + } + + /// Convert blanks to covered space and assign null value + fn finish(mut self) -> CompactSpace { + // sort by start since ranges are not allowed to overlap + self.blanks.sort_by_key(|blank| *blank.start()); + + // Between the blanks + let mut covered_space = self + .blanks + .windows(2) + .map(|blanks| { + assert!( + blanks[0].end() < blanks[1].start(), + "overlapping or adjacent ranges detected" + ); + *blanks[0].end() + 1..=*blanks[1].start() - 1 + }) + .collect::>(); + + // Outside the blanks + if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start) { + if *first_blank_start != 0 { + covered_space.insert(0, 0..=first_blank_start - 1); + } + } + + if let Some(last_blank_end) = self.blanks.last().map(RangeInclusive::end) { + if *last_blank_end != u128::MAX { + covered_space.push(last_blank_end + 1..=u128::MAX); + } + } + + // Extend the first range and assign the null value to it. + let null_value = if let Some(first_covered_space) = covered_space.first_mut() { + // in case the first covered space ends at u128::MAX, assign null to the beginning + if *first_covered_space.end() == u128::MAX { + *first_covered_space = first_covered_space.start() - 1..=*first_covered_space.end(); + *first_covered_space.start() + } else { + *first_covered_space = *first_covered_space.start()..=first_covered_space.end() + 1; + *first_covered_space.end() + } + } else { + covered_space.push(0..=0); // empty data case + 0u128 + }; + + let mut compact_start: u64 = 0; + let mut ranges_and_compact_start = Vec::with_capacity(covered_space.len()); + for cov in covered_space { + let covered_range_len = cov.end() - cov.start() + 1; // e.g. 
0..=1 covered space 1-0+1= 2 + ranges_and_compact_start.push((cov, compact_start)); + compact_start += covered_range_len as u64; + } + CompactSpace { + ranges_and_compact_start, + null_value, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_binary_heap_pop_order() { + let mut blanks: BinaryHeap = BinaryHeap::new(); + blanks.push((0..=10).try_into().unwrap()); + blanks.push((100..=200).try_into().unwrap()); + blanks.push((100..=110).try_into().unwrap()); + assert_eq!(blanks.pop().unwrap().blank_size(), 100); + assert_eq!(blanks.pop().unwrap().blank_size(), 10); + } +} diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs index 439d9ecc6..11d166816 100644 --- a/fastfield_codecs/src/main.rs +++ b/fastfield_codecs/src/main.rs @@ -7,9 +7,12 @@ use std::net::{IpAddr, Ipv6Addr}; use std::str::FromStr; use fastfield_codecs::{ - Column, CompactSpaceCompressor, FastFieldCodecType, FastFieldStats, VecColumn, + Column, CompactSpaceCompressor, CompactSpaceDecompressor, FastFieldCodecType, FastFieldStats, + VecColumn, }; use itertools::Itertools; +use measure_time::print_time; +use ownedbytes::OwnedBytes; use prettytable::{Cell, Row, Table}; fn print_set_stats(ip_addrs: &[u128]) { @@ -51,12 +54,6 @@ fn print_set_stats(ip_addrs: &[u128]) { b.1.cmp(&a.1) } }); - - // println!("\n\n----\nIP Address histogram"); - // println!("IPAddrCount\tFrequency"); - // for (ip_addr_count, times) in cnts { - // println!("{}\t{}", ip_addr_count, times); - //} } fn ip_dataset() -> Vec { @@ -96,9 +93,10 @@ fn bench_ip() { { let mut data = vec![]; for dataset in dataset.chunks(50_000) { - let compressor = CompactSpaceCompressor::train_from(dataset.to_vec()); + let compressor = + CompactSpaceCompressor::train_from(dataset.iter().cloned(), dataset.len()); compressor - .compress_into(dataset.iter().cloned(), &mut data) + .compress_into(dataset.iter().cloned().map(Some), &mut data) .unwrap(); } let compression = data.len() as f64 / (dataset.len() * 16) as f64; @@ -109,8 +107,10 @@ fn bench_ip() { ); } - let compressor = CompactSpaceCompressor::train_from(dataset.to_vec()); - let data = compressor.compress(dataset.iter().cloned()).unwrap(); + let compressor = CompactSpaceCompressor::train_from(dataset.iter().cloned(), dataset.len()); + let data = compressor + .compress(dataset.iter().cloned().map(Some)) + .unwrap(); let compression = data.len() as f64 / (dataset.len() * 16) as f64; println!("Compression {:.2}", compression); @@ -119,12 +119,13 @@ fn bench_ip() { (data.len() * 8) as f32 / dataset.len() as f32 ); - // let decompressor = CompactSpaceDecompressor::open(OwnedBytes::new(data)).unwrap(); - // for i in 11100..11150 { - // print_time!("get range"); - // let doc_values = decompressor.get_range(dataset[i]..=dataset[i]); - // println!("{:?}", doc_values.len()); - //} + let decompressor = CompactSpaceDecompressor::open(OwnedBytes::new(data)).unwrap(); + // Sample some ranges + for value in dataset.iter().take(1110).skip(1100).cloned() { + print_time!("get range"); + let doc_values = decompressor.get_range(value..=value); + println!("{:?}", doc_values.len()); + } } fn main() { From 3aeb026970a4935f257ab4fe9093d69204001e7b Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 12 Sep 2022 15:39:28 +0800 Subject: [PATCH 06/26] fix blank_size, add comments --- fastfield_codecs/src/compact_space/blank_range.rs | 7 ++++--- .../src/compact_space/build_compact_space.rs | 13 +++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git 
a/fastfield_codecs/src/compact_space/blank_range.rs b/fastfield_codecs/src/compact_space/blank_range.rs index b68508318..f7843dcd1 100644 --- a/fastfield_codecs/src/compact_space/blank_range.rs +++ b/fastfield_codecs/src/compact_space/blank_range.rs @@ -3,9 +3,10 @@ use std::ops::RangeInclusive; /// The range of a blank in value space. /// /// A blank is an unoccupied space in the data. -/// Ordered by size +/// Use try_into() to construct. +/// A range has to have at least length of 3. Invalid ranges will be rejected. /// -/// Use the try_into(), invalid ranges will be rejected. +/// Ordered by range length. #[derive(Debug, Eq, PartialEq, Clone)] pub(crate) struct BlankRange { blank_range: RangeInclusive, @@ -23,7 +24,7 @@ impl TryFrom> for BlankRange { } impl BlankRange { pub(crate) fn blank_size(&self) -> u128 { - self.blank_range.end() - self.blank_range.start() + self.blank_range.end() - self.blank_range.start() + 1 } pub(crate) fn blank_range(&self) -> RangeInclusive { self.blank_range.clone() diff --git a/fastfield_codecs/src/compact_space/build_compact_space.rs b/fastfield_codecs/src/compact_space/build_compact_space.rs index b088729d4..255e409b8 100644 --- a/fastfield_codecs/src/compact_space/build_compact_space.rs +++ b/fastfield_codecs/src/compact_space/build_compact_space.rs @@ -169,13 +169,14 @@ impl CompactSpaceBuilder { // Between the blanks let mut covered_space = self .blanks - .windows(2) - .map(|blanks| { + .iter() + .tuple_windows() + .map(|(left, right)| { assert!( - blanks[0].end() < blanks[1].start(), + left.end() < right.start(), "overlapping or adjacent ranges detected" ); - *blanks[0].end() + 1..=*blanks[1].start() - 1 + *left.end() + 1..=*right.start() - 1 }) .collect::>(); @@ -231,7 +232,7 @@ mod tests { blanks.push((0..=10).try_into().unwrap()); blanks.push((100..=200).try_into().unwrap()); blanks.push((100..=110).try_into().unwrap()); - assert_eq!(blanks.pop().unwrap().blank_size(), 100); - assert_eq!(blanks.pop().unwrap().blank_size(), 10); + assert_eq!(blanks.pop().unwrap().blank_size(), 101); + assert_eq!(blanks.pop().unwrap().blank_size(), 11); } } From 584394db1eedd88765331609a1a78ce19f466c20 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 12 Sep 2022 15:41:11 +0800 Subject: [PATCH 07/26] fix Cargo.toml --- fastfield_codecs/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fastfield_codecs/Cargo.toml b/fastfield_codecs/Cargo.toml index 2ee7e0093..a53a8ceeb 100644 --- a/fastfield_codecs/Cargo.toml +++ b/fastfield_codecs/Cargo.toml @@ -16,7 +16,7 @@ prettytable-rs = {version="0.9.0", optional= true} rand = {version="0.8.3", optional= true} fastdivide = "0.4" log = "0.4" -itertools = { version = "0.10.3", optional = true } +itertools = { version = "0.10.3" } measure_time = { version="0.8.2", optional=true} [dev-dependencies] @@ -25,7 +25,7 @@ proptest = "1.0.0" rand = "0.8.3" [features] -bin = ["prettytable-rs", "rand", "itertools", "measure_time"] +bin = ["prettytable-rs", "rand", "measure_time"] default = ["bin"] unstable = [] From 57570b38a229317cb728e571188d60584eb93ce9 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 12 Sep 2022 17:07:34 +0800 Subject: [PATCH 08/26] use vint, forward errors, removed unused var --- fastfield_codecs/src/compact_space.rs | 56 +++++++++---------- .../src/compact_space/build_compact_space.rs | 2 +- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/fastfield_codecs/src/compact_space.rs b/fastfield_codecs/src/compact_space.rs index c8184913d..2d2accb02 100644 --- 
a/fastfield_codecs/src/compact_space.rs +++ b/fastfield_codecs/src/compact_space.rs @@ -18,7 +18,7 @@ use std::{ ops::RangeInclusive, }; -use common::{BinarySerializable, CountingWriter, VIntU128}; +use common::{BinarySerializable, CountingWriter, VInt, VIntU128}; use ownedbytes::OwnedBytes; use tantivy_bitpacker::{self, BitPacker, BitUnpacker}; @@ -51,7 +51,7 @@ pub struct CompactSpace { impl BinarySerializable for CompactSpace { fn serialize(&self, writer: &mut W) -> io::Result<()> { VIntU128(self.null_value).serialize(writer)?; - VIntU128(self.ranges_and_compact_start.len() as u128).serialize(writer)?; + VInt(self.ranges_and_compact_start.len() as u64).serialize(writer)?; let mut prev_value = 0; for (value_range, _compact) in &self.ranges_and_compact_start { @@ -69,7 +69,7 @@ impl BinarySerializable for CompactSpace { fn deserialize(reader: &mut R) -> io::Result { let null_value = VIntU128::deserialize(reader)?.0; - let num_values = VIntU128::deserialize(reader)?.0; + let num_values = VInt::deserialize(reader)?.0; let mut ranges_and_compact_start: Vec<(RangeInclusive, u64)> = vec![]; let mut value = 0u128; let mut compact = 0u64; @@ -96,9 +96,14 @@ impl BinarySerializable for CompactSpace { } impl CompactSpace { + /// Amplitude is the value range of the compact space including the sentinel value used to identify null values. + /// The compact space is 0..=amplitude . + /// + /// It's only used to verify we don't exceed u64 number space, which would indicate a bug. fn amplitude_compact_space(&self) -> u128 { let last_range = &self.ranges_and_compact_start[self.ranges_and_compact_start.len() - 1]; - last_range.1 as u128 + (last_range.0.end() - last_range.0.start()) + 1 + let last_range_len = last_range.0.end() - last_range.0.start() + 1; + last_range.1 as u128 + last_range_len } fn get_range_and_compact_start(&self, pos: usize) -> &(RangeInclusive, u64) { @@ -147,7 +152,6 @@ pub struct IPCodecParams { compact_space: CompactSpace, bit_unpacker: BitUnpacker, null_value_compact_space: u64, - null_value: u128, min_value: u128, max_value: u128, num_vals: u64, @@ -170,10 +174,6 @@ impl CompactSpaceCompressor { train(&tree, total_num_values_incl_nulls) } - fn to_compact(&self, value: u128) -> u64 { - self.params.compact_space.to_compact(value).unwrap() - } - fn write_footer(mut self, writer: &mut impl Write, num_vals: u64) -> io::Result<()> { let writer = &mut CountingWriter::wrap(writer); self.params.num_vals = num_vals; @@ -200,16 +200,19 @@ impl CompactSpaceCompressor { let mut num_vals = 0; for val in vals { let compact = if let Some(val) = val { - self.to_compact(val) + self.params.compact_space.to_compact(val).map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidData, + "Could not convert value to compact_space. This is a bug.", + ) + })? 
} else { self.null_value_compact_space() }; - bitpacker - .write(compact, self.params.num_bits, write) - .unwrap(); + bitpacker.write(compact, self.params.num_bits, write)?; num_vals += 1; } - bitpacker.close(write).unwrap(); + bitpacker.close(write)?; self.write_footer(write, num_vals)?; Ok(()) } @@ -217,9 +220,8 @@ impl CompactSpaceCompressor { fn train(values_sorted: &BTreeSet, total_num_values: usize) -> CompactSpaceCompressor { let compact_space = get_compact_space(values_sorted, total_num_values, COST_PER_BLANK_IN_BITS); - let null_value = compact_space.null_value; let null_compact_space = compact_space - .to_compact(null_value) + .to_compact(compact_space.null_value) .expect("could not convert null_value to compact space"); let amplitude_compact_space = compact_space.amplitude_compact_space(); @@ -231,12 +233,18 @@ fn train(values_sorted: &BTreeSet, total_num_values: usize) -> CompactSpac let num_bits = tantivy_bitpacker::compute_num_bits(amplitude_compact_space as u64); let min_value = *values_sorted.iter().next().unwrap_or(&0); let max_value = *values_sorted.iter().last().unwrap_or(&0); + let max_value_in_value_space = max_value.max(compact_space.null_value); + assert!( + compact_space + .to_compact(max_value_in_value_space) + .expect("could not convert max value to compact space") + < amplitude_compact_space as u64 + ); let compressor = CompactSpaceCompressor { params: IPCodecParams { compact_space, bit_unpacker: BitUnpacker::new(num_bits), null_value_compact_space: null_compact_space, - null_value, min_value, max_value, num_vals: 0, // don't use values_sorted.len() here since they don't include null values @@ -244,8 +252,6 @@ fn train(values_sorted: &BTreeSet, total_num_values: usize) -> CompactSpac }, }; - let max_value_in_value_space = max_value.max(null_value); - assert!(compressor.to_compact(max_value_in_value_space) < amplitude_compact_space as u64); compressor } @@ -261,11 +267,7 @@ impl BinarySerializable for IPCodecParams { let footer_flags = 0u64; footer_flags.serialize(writer)?; - let null_value_compact_space = self - .compact_space - .to_compact(self.null_value) - .expect("could not convert null to compact space"); - VIntU128(null_value_compact_space as u128).serialize(writer)?; + VIntU128(self.null_value_compact_space as u128).serialize(writer)?; VIntU128(self.min_value).serialize(writer)?; VIntU128(self.max_value).serialize(writer)?; VIntU128(self.num_vals as u128).serialize(writer)?; @@ -284,10 +286,8 @@ impl BinarySerializable for IPCodecParams { let num_vals = VIntU128::deserialize(reader)?.0 as u64; let num_bits = u8::deserialize(reader)?; let compact_space = CompactSpace::deserialize(reader)?; - let null_value = compact_space.unpack(null_value_compact_space); Ok(Self { - null_value, compact_space, bit_unpacker: BitUnpacker::new(num_bits), null_value_compact_space, @@ -357,9 +357,9 @@ impl CompactSpaceDecompressor { /// Comparing on compact space: 1.2 GElements/s /// /// Comparing on original space: .06 GElements/s (not completely optimized) - pub fn get_range(&self, mut range: RangeInclusive) -> Vec { + pub fn get_range(&self, range: RangeInclusive) -> Vec { if range.start() > range.end() { - range = *range.end()..=*range.start(); + return Vec::new(); } let from_value = *range.start(); let to_value = *range.end(); diff --git a/fastfield_codecs/src/compact_space/build_compact_space.rs b/fastfield_codecs/src/compact_space/build_compact_space.rs index 255e409b8..b83fd3e4c 100644 --- a/fastfield_codecs/src/compact_space/build_compact_space.rs +++ 
b/fastfield_codecs/src/compact_space/build_compact_space.rs @@ -56,7 +56,7 @@ impl BlankCollector { self.staged_blanks_sum += blank.blank_size(); self.blanks.push(blank); } - fn drain(&mut self) -> std::vec::Drain<'_, BlankRange> { + fn drain(&mut self) -> impl Iterator + '_ { self.staged_blanks_sum = 0; self.blanks.drain(..) } From 9aa9efe2a4257de98788544ee995d7bc3e69f8c5 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 12 Sep 2022 17:10:31 +0800 Subject: [PATCH 09/26] fix bench --- fastfield_codecs/src/lib.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index a1ee70d78..8e62c610e 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -391,8 +391,11 @@ mod bench { let n = permutation.len(); let permutation = permutation.iter().map(|el| *el as u128).collect::>(); - let compressor = CompactSpaceCompressor::train_from(permutation.to_vec()); - let data = compressor.compress(permutation.iter().cloned()).unwrap(); + let compressor = + CompactSpaceCompressor::train_from(permutation.iter().cloned(), permutation.len()); + let data = compressor + .compress(permutation.iter().cloned().map(Some)) + .unwrap(); let data = OwnedBytes::new(data); let column: Arc> = From cae6b28a8ffbac59323b74050b2638ccaf22b376 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 12 Sep 2022 17:16:31 +0800 Subject: [PATCH 10/26] remove num_vals param --- fastfield_codecs/src/compact_space.rs | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/fastfield_codecs/src/compact_space.rs b/fastfield_codecs/src/compact_space.rs index 2d2accb02..7e0ac88eb 100644 --- a/fastfield_codecs/src/compact_space.rs +++ b/fastfield_codecs/src/compact_space.rs @@ -96,8 +96,8 @@ impl BinarySerializable for CompactSpace { } impl CompactSpace { - /// Amplitude is the value range of the compact space including the sentinel value used to identify null values. - /// The compact space is 0..=amplitude . + /// Amplitude is the value range of the compact space including the sentinel value used to + /// identify null values. The compact space is 0..=amplitude . /// /// It's only used to verify we don't exceed u64 number space, which would indicate a bug. 
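     ///
     /// E.g. for ranges_and_compact_start [(0..=5, 0), (1000..=1012, 6)] (hypothetical values),
     /// the last range covers 13 values starting at compact position 6, so the amplitude is
     /// 6 + 13 = 19.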
fn amplitude_compact_space(&self) -> u128 { @@ -174,9 +174,8 @@ impl CompactSpaceCompressor { train(&tree, total_num_values_incl_nulls) } - fn write_footer(mut self, writer: &mut impl Write, num_vals: u64) -> io::Result<()> { + fn write_footer(self, writer: &mut impl Write) -> io::Result<()> { let writer = &mut CountingWriter::wrap(writer); - self.params.num_vals = num_vals; self.params.serialize(writer)?; let footer_len = writer.written_bytes() as u32; @@ -197,7 +196,6 @@ impl CompactSpaceCompressor { write: &mut impl Write, ) -> io::Result<()> { let mut bitpacker = BitPacker::default(); - let mut num_vals = 0; for val in vals { let compact = if let Some(val) = val { self.params.compact_space.to_compact(val).map_err(|_| { @@ -210,10 +208,9 @@ impl CompactSpaceCompressor { self.null_value_compact_space() }; bitpacker.write(compact, self.params.num_bits, write)?; - num_vals += 1; } bitpacker.close(write)?; - self.write_footer(write, num_vals)?; + self.write_footer(write)?; Ok(()) } } @@ -240,19 +237,17 @@ fn train(values_sorted: &BTreeSet, total_num_values: usize) -> CompactSpac .expect("could not convert max value to compact space") < amplitude_compact_space as u64 ); - let compressor = CompactSpaceCompressor { + CompactSpaceCompressor { params: IPCodecParams { compact_space, bit_unpacker: BitUnpacker::new(num_bits), null_value_compact_space: null_compact_space, min_value, max_value, - num_vals: 0, // don't use values_sorted.len() here since they don't include null values + num_vals: total_num_values as u64, num_bits, }, - }; - - compressor + } } #[derive(Debug, Clone)] From 47dc511733bb78c202c82951d7e0b84544fe078d Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 12 Sep 2022 17:19:17 +0800 Subject: [PATCH 11/26] add inline --- fastfield_codecs/src/compact_space.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fastfield_codecs/src/compact_space.rs b/fastfield_codecs/src/compact_space.rs index 7e0ac88eb..195f77d37 100644 --- a/fastfield_codecs/src/compact_space.rs +++ b/fastfield_codecs/src/compact_space.rs @@ -295,6 +295,7 @@ impl BinarySerializable for IPCodecParams { } impl ColumnV2 for CompactSpaceDecompressor { + #[inline] fn get_val(&self, doc: u64) -> Option { self.get(doc) } @@ -311,6 +312,7 @@ impl ColumnV2 for CompactSpaceDecompressor { self.params.num_vals } + #[inline] fn iter<'a>(&'a self) -> Box> + 'a> { Box::new(self.iter()) } @@ -425,6 +427,7 @@ impl CompactSpaceDecompressor { }) } + #[inline] pub fn get(&self, idx: u64) -> Option { let compact = self.params.bit_unpacker.get(idx, &self.data); if compact == self.params.null_value_compact_space { From 3ca48cd826a03157d599fd19933c3b8278e7bc2d Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 12 Sep 2022 17:43:12 +0800 Subject: [PATCH 12/26] fix test --- fastfield_codecs/src/compact_space.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastfield_codecs/src/compact_space.rs b/fastfield_codecs/src/compact_space.rs index 195f77d37..553a3853d 100644 --- a/fastfield_codecs/src/compact_space.rs +++ b/fastfield_codecs/src/compact_space.rs @@ -585,7 +585,7 @@ mod tests { #[test] fn test_null() { let vals = &[2u128]; - let compressor = CompactSpaceCompressor::train_from(vals.iter().cloned(), vals.len()); + let compressor = CompactSpaceCompressor::train_from(vals.iter().cloned(), 2); let vals = vec![None, Some(2u128)]; let data = compressor.compress(vals.iter().cloned()).unwrap(); let decomp = CompactSpaceDecompressor::open(OwnedBytes::new(data)).unwrap(); From 11275854ca499e5d95ae38fd5af7888de1e0021c 
Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 12 Sep 2022 18:03:03 +0800 Subject: [PATCH 13/26] unroll get range iteration --- fastfield_codecs/src/compact_space.rs | 37 +++++++++++++++++++++------ 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/fastfield_codecs/src/compact_space.rs b/fastfield_codecs/src/compact_space.rs index 553a3853d..dd711dddd 100644 --- a/fastfield_codecs/src/compact_space.rs +++ b/fastfield_codecs/src/compact_space.rs @@ -351,7 +351,8 @@ impl CompactSpaceDecompressor { self.params.compact_space.unpack(compact) } - /// Comparing on compact space: 1.2 GElements/s + /// Comparing on compact space: 1.08 GElements/s, which equals a throughput of 17,3 Gb/s + /// (based on u128 = 16byte) /// /// Comparing on original space: .06 GElements/s (not completely optimized) pub fn get_range(&self, range: RangeInclusive) -> Vec { @@ -395,14 +396,34 @@ impl CompactSpaceDecompressor { let range = compact_from..=compact_to; let mut positions = Vec::new(); - for (pos, compact_value) in self - .iter_compact() - .enumerate() - .filter(|(_pos, val)| *val != self.params.null_value_compact_space) - { - if range.contains(&compact_value) { - positions.push(pos as u64); + let step_size = 4; + let cutoff = self.params.num_vals - self.params.num_vals % step_size; + + let mut check_add = |idx, val| { + if range.contains(&val) && val != self.params.null_value_compact_space { + positions.push(idx); } + }; + let get_val = |idx| self.params.bit_unpacker.get(idx as u64, &self.data) as u64; + // unrolled loop + for idx in (0..cutoff).step_by(step_size as usize) { + let idx1 = idx; + let idx2 = idx + 1; + let idx3 = idx + 2; + let idx4 = idx + 3; + let val1 = get_val(idx1); + let val2 = get_val(idx2); + let val3 = get_val(idx3); + let val4 = get_val(idx4); + check_add(idx1, val1); + check_add(idx2, val2); + check_add(idx3, val3); + check_add(idx4, val4); + } + + // handle rest + for idx in cutoff..self.params.num_vals { + check_add(idx, get_val(idx)); } positions From d3e7c41a1f57e025671b00af86db3ed5ba3176be Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 12 Sep 2022 20:27:31 +0800 Subject: [PATCH 14/26] refactor to range_mapping --- fastfield_codecs/src/compact_space.rs | 85 +++++++++++-------- .../src/compact_space/build_compact_space.rs | 14 +-- 2 files changed, 60 insertions(+), 39 deletions(-) diff --git a/fastfield_codecs/src/compact_space.rs b/fastfield_codecs/src/compact_space.rs index dd711dddd..5ec33eb48 100644 --- a/fastfield_codecs/src/compact_space.rs +++ b/fastfield_codecs/src/compact_space.rs @@ -44,17 +44,33 @@ const COST_PER_BLANK_IN_BITS: usize = 36; #[derive(Debug, Clone, Eq, PartialEq)] pub struct CompactSpace { - ranges_and_compact_start: Vec<(RangeInclusive, u64)>, + ranges_mapping: Vec, pub null_value: u128, } +/// Maps the range from the original space to compact_start + range.len() +#[derive(Debug, Clone, Eq, PartialEq)] +struct RangeMapping { + value_range: RangeInclusive, + compact_start: u64, +} +impl RangeMapping { + fn range_length(&self) -> u64 { + (self.value_range.end() - self.value_range.start()) as u64 + 1 + } +} + impl BinarySerializable for CompactSpace { fn serialize(&self, writer: &mut W) -> io::Result<()> { VIntU128(self.null_value).serialize(writer)?; - VInt(self.ranges_and_compact_start.len() as u64).serialize(writer)?; + VInt(self.ranges_mapping.len() as u64).serialize(writer)?; let mut prev_value = 0; - for (value_range, _compact) in &self.ranges_and_compact_start { + for value_range in self + .ranges_mapping + .iter() + 
.map(|range_mapping| &range_mapping.value_range) + { let blank_delta_start = value_range.start() - prev_value; VIntU128(blank_delta_start).serialize(writer)?; prev_value = *value_range.start(); @@ -70,9 +86,9 @@ impl BinarySerializable for CompactSpace { fn deserialize(reader: &mut R) -> io::Result { let null_value = VIntU128::deserialize(reader)?.0; let num_values = VInt::deserialize(reader)?.0; - let mut ranges_and_compact_start: Vec<(RangeInclusive, u64)> = vec![]; + let mut ranges_mapping: Vec = vec![]; let mut value = 0u128; - let mut compact = 0u64; + let mut compact_start = 0u64; for _ in 0..num_values { let blank_delta_start = VIntU128::deserialize(reader)?.0; value += blank_delta_start; @@ -82,15 +98,18 @@ impl BinarySerializable for CompactSpace { value += blank_delta_end; let blank_end = value; - let compact_delta = blank_end - blank_start + 1; - - ranges_and_compact_start.push((blank_start..=blank_end, compact)); - compact += compact_delta as u64; + let range_mapping = RangeMapping { + value_range: blank_start..=blank_end, + compact_start, + }; + let compact_delta = range_mapping.range_length(); + ranges_mapping.push(range_mapping); + compact_start += compact_delta as u64; } Ok(Self { null_value, - ranges_and_compact_start, + ranges_mapping, }) } } @@ -101,21 +120,20 @@ impl CompactSpace { /// /// It's only used to verify we don't exceed u64 number space, which would indicate a bug. fn amplitude_compact_space(&self) -> u128 { - let last_range = &self.ranges_and_compact_start[self.ranges_and_compact_start.len() - 1]; - let last_range_len = last_range.0.end() - last_range.0.start() + 1; - last_range.1 as u128 + last_range_len + let last_range = &self.ranges_mapping[self.ranges_mapping.len() - 1]; + last_range.compact_start as u128 + last_range.range_length() as u128 } - fn get_range_and_compact_start(&self, pos: usize) -> &(RangeInclusive, u64) { - &self.ranges_and_compact_start[pos] + fn get_range_mapping(&self, pos: usize) -> &RangeMapping { + &self.ranges_mapping[pos] } /// Returns either Ok(the value in the compact space) or if it is outside the compact space the /// Err(position where it would be inserted) fn to_compact(&self, value: u128) -> Result { - self.ranges_and_compact_start + self.ranges_mapping .binary_search_by(|probe| { - let value_range = &probe.0; + let value_range = &probe.value_range; if value_range.contains(&value) { return Ordering::Equal; } else if value < *value_range.start() { @@ -126,21 +144,24 @@ impl CompactSpace { panic!("not covered all ranges in check"); }) .map(|pos| { - let (range, compact_start) = &self.ranges_and_compact_start[pos]; - compact_start + (value - range.start()) as u64 + let range_mapping = &self.ranges_mapping[pos]; + let pos_in_range = (value - range_mapping.value_range.start()) as u64; + range_mapping.compact_start + pos_in_range }) } /// Unpacks a value from compact space u64 to u128 space fn unpack(&self, compact: u64) -> u128 { let pos = self - .ranges_and_compact_start - .binary_search_by_key(&compact, |probe| probe.1) + .ranges_mapping + .binary_search_by_key(&compact, |range_mapping| range_mapping.compact_start) + // Correctness: Overflow. 
The first range starts at compact space 0, the error from
+            // binary search can never be 0
             .map_or_else(|e| e - 1, |v| v);
-        let range_and_compact_start = &self.ranges_and_compact_start[pos];
-        let diff = compact - self.ranges_and_compact_start[pos].1;
-        range_and_compact_start.0.start() + diff as u128
+        let range_mapping = &self.ranges_mapping[pos];
+        let diff = compact - range_mapping.compact_start;
+        range_mapping.value_range.start() + diff as u128
     }
 }

@@ -375,22 +396,18 @@ impl CompactSpaceDecompressor {
         let compact_from = compact_from.unwrap_or_else(|pos| {
             // Correctness: Out of bounds, if this value is Err(last_index + 1), we early exit,
             // since the to_value also maps into the same non-mapped space
-            let range_and_compact_start =
-                self.params.compact_space.get_range_and_compact_start(pos);
-            range_and_compact_start.1
+            let range_mapping = self.params.compact_space.get_range_mapping(pos);
+            range_mapping.compact_start
         });
         // If there is no compact space, we go to the closest upper bound compact space
         let compact_to = compact_to.unwrap_or_else(|pos| {
             // Correctness: Overflow, if this value is Err(0), we early exit,
             // since the from_value also maps into the same non-mapped space

-            // get end of previous range
+            // Get end of previous range
             let pos = pos - 1;
-            let range_and_compact_start =
-                self.params.compact_space.get_range_and_compact_start(pos);
-            let compact_end = range_and_compact_start.1
-                + (range_and_compact_start.0.end() - range_and_compact_start.0.start()) as u64;
-            compact_end
+            let range_mapping = self.params.compact_space.get_range_mapping(pos);
+            range_mapping.compact_start + range_mapping.range_length()
         });

         let range = compact_from..=compact_to;
@@ -404,7 +421,7 @@ impl CompactSpaceDecompressor {
                 positions.push(idx);
             }
         };
-        let get_val = |idx| self.params.bit_unpacker.get(idx as u64, &self.data) as u64;
+        let get_val = |idx| self.params.bit_unpacker.get(idx as u64, &self.data);
         // unrolled loop
         for idx in (0..cutoff).step_by(step_size as usize) {
             let idx1 = idx;
diff --git a/fastfield_codecs/src/compact_space/build_compact_space.rs b/fastfield_codecs/src/compact_space/build_compact_space.rs
index b83fd3e4c..0a3386ce1 100644
--- a/fastfield_codecs/src/compact_space/build_compact_space.rs
+++ b/fastfield_codecs/src/compact_space/build_compact_space.rs
@@ -163,7 +163,7 @@ impl CompactSpaceBuilder {
     /// Convert blanks to covered space and assign null value
     fn finish(mut self) -> CompactSpace {
-        // sort by start since ranges are not allowed to overlap
+        // sort by start. ranges are not allowed to overlap
         self.blanks.sort_by_key(|blank| *blank.start());

         // Between the blanks
@@ -209,14 +209,18 @@ impl CompactSpaceBuilder {
         };

         let mut compact_start: u64 = 0;
-        let mut ranges_and_compact_start = Vec::with_capacity(covered_space.len());
+        let mut ranges_mapping = Vec::with_capacity(covered_space.len());
         for cov in covered_space {
-            let covered_range_len = cov.end() - cov.start() + 1; // e.g.
0..=1 covered space 1-0+1= 2 - ranges_and_compact_start.push((cov, compact_start)); + let range_mapping = super::RangeMapping { + value_range: cov, + compact_start, + }; + let covered_range_len = range_mapping.range_length(); + ranges_mapping.push(range_mapping); compact_start += covered_range_len as u64; } CompactSpace { - ranges_and_compact_start, + ranges_mapping, null_value, } } From 58af1235e4c38d4b8ce5659faca7a66115845d13 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Tue, 13 Sep 2022 05:40:40 +0200 Subject: [PATCH 15/26] Apply suggestions from code review Co-authored-by: Paul Masurel --- fastfield_codecs/src/compact_space.rs | 11 +++++------ fastfield_codecs/src/compact_space/blank_range.rs | 2 +- .../src/compact_space/build_compact_space.rs | 6 +++--- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fastfield_codecs/src/compact_space.rs b/fastfield_codecs/src/compact_space.rs index 5ec33eb48..38f39fa1a 100644 --- a/fastfield_codecs/src/compact_space.rs +++ b/fastfield_codecs/src/compact_space.rs @@ -134,14 +134,13 @@ impl CompactSpace { self.ranges_mapping .binary_search_by(|probe| { let value_range = &probe.value_range; - if value_range.contains(&value) { - return Ordering::Equal; - } else if value < *value_range.start() { - return Ordering::Greater; + if value < *value_range.start() { + Ordering::Greater } else if value > *value_range.end() { - return Ordering::Less; + Ordering::Less + } else { + Ordering::Equal } - panic!("not covered all ranges in check"); }) .map(|pos| { let range_mapping = &self.ranges_mapping[pos]; diff --git a/fastfield_codecs/src/compact_space/blank_range.rs b/fastfield_codecs/src/compact_space/blank_range.rs index f7843dcd1..11a9c7eda 100644 --- a/fastfield_codecs/src/compact_space/blank_range.rs +++ b/fastfield_codecs/src/compact_space/blank_range.rs @@ -38,6 +38,6 @@ impl Ord for BlankRange { } impl PartialOrd for BlankRange { fn partial_cmp(&self, other: &Self) -> Option { - self.blank_size().partial_cmp(&other.blank_size()) + Some(self.blank_size().cmp(&other.blank_size)) } } diff --git a/fastfield_codecs/src/compact_space/build_compact_space.rs b/fastfield_codecs/src/compact_space/build_compact_space.rs index 0a3386ce1..238408ac5 100644 --- a/fastfield_codecs/src/compact_space/build_compact_space.rs +++ b/fastfield_codecs/src/compact_space/build_compact_space.rs @@ -78,7 +78,7 @@ pub fn get_compact_space( total_num_values: usize, cost_per_blank: usize, ) -> CompactSpace { - let mut blanks = get_blanks(values_deduped_sorted); + let mut blanks: BinaryHeap = get_blanks(values_deduped_sorted); let mut amplitude_compact_space = u128::MAX; let mut amplitude_bits: u8 = num_bits(amplitude_compact_space); @@ -164,7 +164,7 @@ impl CompactSpaceBuilder { /// Convert blanks to covered space and assign null value fn finish(mut self) -> CompactSpace { // sort by start. 
ranges are not allowed to overlap - self.blanks.sort_by_key(|blank| *blank.start()); + self.blanks.sort_unstable_by_key(|blank| *blank.start()); // Between the blanks let mut covered_space = self @@ -209,7 +209,7 @@ impl CompactSpaceBuilder { }; let mut compact_start: u64 = 0; - let mut ranges_mapping = Vec::with_capacity(covered_space.len()); + let mut ranges_mapping: Vec = Vec::with_capacity(covered_space.len()); for cov in covered_space { let range_mapping = super::RangeMapping { value_range: cov, From 61b5110db76be4028967fd9f3ed3a67c9dff2a51 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 13 Sep 2022 15:43:14 +0800 Subject: [PATCH 16/26] use 0 as null in compact space --- fastfield_codecs/src/compact_space.rs | 128 ++++++++++-------- .../src/compact_space/blank_range.rs | 2 +- .../src/compact_space/build_compact_space.rs | 59 +++----- 3 files changed, 91 insertions(+), 98 deletions(-) diff --git a/fastfield_codecs/src/compact_space.rs b/fastfield_codecs/src/compact_space.rs index 38f39fa1a..63479494b 100644 --- a/fastfield_codecs/src/compact_space.rs +++ b/fastfield_codecs/src/compact_space.rs @@ -36,6 +36,8 @@ pub fn ip_to_u128(ip_addr: IpAddr) -> u128 { u128::from_be_bytes(ip_addr_v6.octets()) } +const NULL_VALUE_COMPACT_SPACE: u64 = 0; + /// The cost per blank is quite hard actually, since blanks are delta encoded, the actual cost of /// blanks depends on the number of blanks. /// @@ -45,7 +47,6 @@ const COST_PER_BLANK_IN_BITS: usize = 36; #[derive(Debug, Clone, Eq, PartialEq)] pub struct CompactSpace { ranges_mapping: Vec, - pub null_value: u128, } /// Maps the range from the original space to compact_start + range.len() @@ -58,11 +59,15 @@ impl RangeMapping { fn range_length(&self) -> u64 { (self.value_range.end() - self.value_range.start()) as u64 + 1 } + + // The last value of the compact space in this range + fn compact_end(&self) -> u64 { + self.compact_start + self.range_length() - 1 + } } impl BinarySerializable for CompactSpace { fn serialize(&self, writer: &mut W) -> io::Result<()> { - VIntU128(self.null_value).serialize(writer)?; VInt(self.ranges_mapping.len() as u64).serialize(writer)?; let mut prev_value = 0; @@ -84,12 +89,11 @@ impl BinarySerializable for CompactSpace { } fn deserialize(reader: &mut R) -> io::Result { - let null_value = VIntU128::deserialize(reader)?.0; - let num_values = VInt::deserialize(reader)?.0; + let num_ranges = VInt::deserialize(reader)?.0; let mut ranges_mapping: Vec = vec![]; let mut value = 0u128; - let mut compact_start = 0u64; - for _ in 0..num_values { + let mut compact_start = 1u64; // 0 is reserved for `null` + for _ in 0..num_ranges { let blank_delta_start = VIntU128::deserialize(reader)?.0; value += blank_delta_start; let blank_start = value; @@ -102,15 +106,12 @@ impl BinarySerializable for CompactSpace { value_range: blank_start..=blank_end, compact_start, }; - let compact_delta = range_mapping.range_length(); + let range_length = range_mapping.range_length(); ranges_mapping.push(range_mapping); - compact_start += compact_delta as u64; + compact_start += range_length as u64; } - Ok(Self { - null_value, - ranges_mapping, - }) + Ok(Self { ranges_mapping }) } } @@ -120,8 +121,10 @@ impl CompactSpace { /// /// It's only used to verify we don't exceed u64 number space, which would indicate a bug. 
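    /// Illustrative example (numbers chosen here for exposition, not taken from the source):
    /// with covered ranges 10..=13 mapped to compact 1..=4 and 100..=101 mapped to compact
    /// 5..=6, the amplitude is 7, since the compact space starts at 1 and 0 encodes null.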
fn amplitude_compact_space(&self) -> u128 { - let last_range = &self.ranges_mapping[self.ranges_mapping.len() - 1]; - last_range.compact_start as u128 + last_range.range_length() as u128 + self.ranges_mapping + .last() + .map(|last_range| last_range.compact_end() as u128 + 1) + .unwrap_or(1) // compact space starts at 1, 0 == null } fn get_range_mapping(&self, pos: usize) -> &RangeMapping { @@ -171,7 +174,6 @@ pub struct CompactSpaceCompressor { pub struct IPCodecParams { compact_space: CompactSpace, bit_unpacker: BitUnpacker, - null_value_compact_space: u64, min_value: u128, max_value: u128, num_vals: u64, @@ -179,10 +181,6 @@ pub struct IPCodecParams { } impl CompactSpaceCompressor { - pub fn null_value_compact_space(&self) -> u64 { - self.params.null_value_compact_space - } - /// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals. pub fn train_from( vals: impl Iterator, @@ -225,7 +223,7 @@ impl CompactSpaceCompressor { ) })? } else { - self.null_value_compact_space() + NULL_VALUE_COMPACT_SPACE }; bitpacker.write(compact, self.params.num_bits, write)?; } @@ -237,9 +235,6 @@ impl CompactSpaceCompressor { fn train(values_sorted: &BTreeSet, total_num_values: usize) -> CompactSpaceCompressor { let compact_space = get_compact_space(values_sorted, total_num_values, COST_PER_BLANK_IN_BITS); - let null_compact_space = compact_space - .to_compact(compact_space.null_value) - .expect("could not convert null_value to compact space"); let amplitude_compact_space = compact_space.amplitude_compact_space(); assert!( @@ -250,10 +245,9 @@ fn train(values_sorted: &BTreeSet, total_num_values: usize) -> CompactSpac let num_bits = tantivy_bitpacker::compute_num_bits(amplitude_compact_space as u64); let min_value = *values_sorted.iter().next().unwrap_or(&0); let max_value = *values_sorted.iter().last().unwrap_or(&0); - let max_value_in_value_space = max_value.max(compact_space.null_value); assert!( compact_space - .to_compact(max_value_in_value_space) + .to_compact(max_value) .expect("could not convert max value to compact space") < amplitude_compact_space as u64 ); @@ -261,7 +255,6 @@ fn train(values_sorted: &BTreeSet, total_num_values: usize) -> CompactSpac params: IPCodecParams { compact_space, bit_unpacker: BitUnpacker::new(num_bits), - null_value_compact_space: null_compact_space, min_value, max_value, num_vals: total_num_values as u64, @@ -282,7 +275,6 @@ impl BinarySerializable for IPCodecParams { let footer_flags = 0u64; footer_flags.serialize(writer)?; - VIntU128(self.null_value_compact_space as u128).serialize(writer)?; VIntU128(self.min_value).serialize(writer)?; VIntU128(self.max_value).serialize(writer)?; VIntU128(self.num_vals as u128).serialize(writer)?; @@ -295,7 +287,6 @@ impl BinarySerializable for IPCodecParams { fn deserialize(reader: &mut R) -> io::Result { let _header_flags = u64::deserialize(reader)?; - let null_value_compact_space = VIntU128::deserialize(reader)?.0 as u64; let min_value = VIntU128::deserialize(reader)?.0; let max_value = VIntU128::deserialize(reader)?.0; let num_vals = VIntU128::deserialize(reader)?.0 as u64; @@ -305,7 +296,6 @@ impl BinarySerializable for IPCodecParams { Ok(Self { compact_space, bit_unpacker: BitUnpacker::new(num_bits), - null_value_compact_space, min_value, max_value, num_vals, @@ -406,7 +396,7 @@ impl CompactSpaceDecompressor { // Get end of previous range let pos = pos - 1; let range_mapping = self.params.compact_space.get_range_mapping(pos); - range_mapping.compact_start + range_mapping.range_length() + 
range_mapping.compact_end() }); let range = compact_from..=compact_to; @@ -415,8 +405,8 @@ impl CompactSpaceDecompressor { let step_size = 4; let cutoff = self.params.num_vals - self.params.num_vals % step_size; - let mut check_add = |idx, val| { - if range.contains(&val) && val != self.params.null_value_compact_space { + let mut add_if_in_range = |idx, val| { + if range.contains(&val) { positions.push(idx); } }; @@ -431,15 +421,15 @@ impl CompactSpaceDecompressor { let val2 = get_val(idx2); let val3 = get_val(idx3); let val4 = get_val(idx4); - check_add(idx1, val1); - check_add(idx2, val2); - check_add(idx3, val3); - check_add(idx4, val4); + add_if_in_range(idx1, val1); + add_if_in_range(idx2, val2); + add_if_in_range(idx3, val3); + add_if_in_range(idx4, val4); } // handle rest for idx in cutoff..self.params.num_vals { - check_add(idx, get_val(idx)); + add_if_in_range(idx, get_val(idx)); } positions @@ -456,7 +446,7 @@ impl CompactSpaceDecompressor { // TODO: Performance. It would be better to iterate on the ranges and check existence via // the bit_unpacker. self.iter_compact().map(|compact| { - if compact == self.params.null_value_compact_space { + if compact == NULL_VALUE_COMPACT_SPACE { None } else { Some(self.compact_to_u128(compact)) @@ -467,7 +457,7 @@ impl CompactSpaceDecompressor { #[inline] pub fn get(&self, idx: u64) -> Option { let compact = self.params.bit_unpacker.get(idx, &self.data); - if compact == self.params.null_value_compact_space { + if compact == NULL_VALUE_COMPACT_SPACE { None } else { Some(self.compact_to_u128(compact)) @@ -496,13 +486,19 @@ mod tests { .into_iter() .collect(); let compact_space = get_compact_space(ips, ips.len(), 11); - assert_eq!(compact_space.null_value, 5); let amplitude = compact_space.amplitude_compact_space(); assert_eq!(amplitude, 20); - assert_eq!(2, compact_space.to_compact(2).unwrap()); - assert_eq!(3, compact_space.to_compact(3).unwrap()); + assert_eq!(3, compact_space.to_compact(2).unwrap()); + assert_eq!(4, compact_space.to_compact(3).unwrap()); assert_eq!(compact_space.to_compact(100).unwrap_err(), 1); + for (num1, num2) in (0..3).tuple_windows() { + assert_eq!( + compact_space.get_range_mapping(num1).compact_end() + 1, + compact_space.get_range_mapping(num2).compact_start + ); + } + let mut output: Vec = Vec::new(); compact_space.serialize(&mut output).unwrap(); @@ -521,39 +517,50 @@ mod tests { fn compact_space_amplitude_test() { let ips = &[100000u128, 1000000].into_iter().collect(); let compact_space = get_compact_space(ips, ips.len(), 1); - assert_eq!(compact_space.null_value, 100001); let amplitude = compact_space.amplitude_compact_space(); assert_eq!(amplitude, 3); } - fn test_all(data: OwnedBytes, expected: &[u128]) { + fn test_all(data: OwnedBytes, expected: &[Option]) { let decompressor = CompactSpaceDecompressor::open(data).unwrap(); for (idx, expected_val) in expected.iter().cloned().enumerate() { let val = decompressor.get(idx as u64); - assert_eq!(val, Some(expected_val)); - let positions = decompressor.get_range(expected_val.saturating_sub(1)..=expected_val); - assert!(positions.contains(&(idx as u64))); - let positions = decompressor.get_range(expected_val..=expected_val); - assert!(positions.contains(&(idx as u64))); - let positions = decompressor.get_range(expected_val..=expected_val.saturating_add(1)); - assert!(positions.contains(&(idx as u64))); - let positions = decompressor - .get_range(expected_val.saturating_sub(1)..=expected_val.saturating_add(1)); - assert!(positions.contains(&(idx as u64))); + 
assert_eq!(val, expected_val); + + if let Some(expected_val) = expected_val { + let test_range = |range: RangeInclusive| { + let expected_positions = expected + .iter() + .positions(|val| val.map(|val| range.contains(&val)).unwrap_or(false)) + .map(|pos| pos as u64) + .collect::>(); + let positions = decompressor.get_range(range); + assert_eq!(positions, expected_positions); + }; + + test_range(expected_val.saturating_sub(1)..=expected_val); + test_range(expected_val..=expected_val); + test_range(expected_val..=expected_val.saturating_add(1)); + test_range(expected_val.saturating_sub(1)..=expected_val.saturating_add(1)); + } } } - fn test_aux_vals(u128_vals: &[u128]) -> OwnedBytes { - let compressor = - CompactSpaceCompressor::train_from(u128_vals.iter().cloned(), u128_vals.len()); - let data = compressor - .compress(u128_vals.iter().cloned().map(Some)) - .unwrap(); + fn test_aux_vals_opt(u128_vals: &[Option]) -> OwnedBytes { + let compressor = CompactSpaceCompressor::train_from( + u128_vals.iter().cloned().flatten(), + u128_vals.len(), + ); + let data = compressor.compress(u128_vals.iter().cloned()).unwrap(); let data = OwnedBytes::new(data); test_all(data.clone(), u128_vals); data } + fn test_aux_vals(u128_vals: &[u128]) -> OwnedBytes { + test_aux_vals_opt(&u128_vals.iter().cloned().map(Some).collect::>()) + } + #[test] fn test_range_1() { let vals = &[ @@ -694,6 +701,7 @@ mod tests { let vals = &[1_000_000_000u128; 100]; let _data = test_aux_vals(vals); } + use itertools::Itertools; use proptest::prelude::*; fn num_strategy() -> impl Strategy { diff --git a/fastfield_codecs/src/compact_space/blank_range.rs b/fastfield_codecs/src/compact_space/blank_range.rs index 11a9c7eda..a1f265f00 100644 --- a/fastfield_codecs/src/compact_space/blank_range.rs +++ b/fastfield_codecs/src/compact_space/blank_range.rs @@ -38,6 +38,6 @@ impl Ord for BlankRange { } impl PartialOrd for BlankRange { fn partial_cmp(&self, other: &Self) -> Option { - Some(self.blank_size().cmp(&other.blank_size)) + Some(self.blank_size().cmp(&other.blank_size())) } } diff --git a/fastfield_codecs/src/compact_space/build_compact_space.rs b/fastfield_codecs/src/compact_space/build_compact_space.rs index 238408ac5..b72ea2243 100644 --- a/fastfield_codecs/src/compact_space/build_compact_space.rs +++ b/fastfield_codecs/src/compact_space/build_compact_space.rs @@ -4,7 +4,7 @@ use std::ops::RangeInclusive; use itertools::Itertools; use super::blank_range::BlankRange; -use super::CompactSpace; +use super::{CompactSpace, RangeMapping}; /// Put the blanks for the sorted values into a binary heap fn get_blanks(values_sorted: &BTreeSet) -> BinaryHeap { @@ -78,15 +78,15 @@ pub fn get_compact_space( total_num_values: usize, cost_per_blank: usize, ) -> CompactSpace { - let mut blanks: BinaryHeap = get_blanks(values_deduped_sorted); - let mut amplitude_compact_space = u128::MAX; - let mut amplitude_bits: u8 = num_bits(amplitude_compact_space); - let mut compact_space = CompactSpaceBuilder::new(); if values_deduped_sorted.is_empty() { return compact_space.finish(); } + let mut blanks: BinaryHeap = get_blanks(values_deduped_sorted); + let mut amplitude_compact_space = u128::MAX; + let mut amplitude_bits: u8 = num_bits(amplitude_compact_space); + let mut blank_collector = BlankCollector::new(); // We will stage blanks until they reduce the compact space by 1 bit. // Binary heap to process the gaps by their size @@ -166,49 +166,37 @@ impl CompactSpaceBuilder { // sort by start. 
ranges are not allowed to overlap
         self.blanks.sort_unstable_by_key(|blank| *blank.start());

-        // Between the blanks
-        let mut covered_space = self
-            .blanks
-            .iter()
-            .tuple_windows()
-            .map(|(left, right)| {
-                assert!(
-                    left.end() < right.start(),
-                    "overlapping or adjacent ranges detected"
-                );
-                *left.end() + 1..=*right.start() - 1
-            })
-            .collect::<Vec<_>>();
+        let mut covered_space = Vec::with_capacity(self.blanks.len());

-        // Outside the blanks
+        // beginning of the blanks
         if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start) {
             if *first_blank_start != 0 {
-                covered_space.insert(0, 0..=first_blank_start - 1);
+                covered_space.push(0..=first_blank_start - 1);
             }
         }

+        // Between the blanks
+        let between_blanks = self.blanks.iter().tuple_windows().map(|(left, right)| {
+            assert!(
+                left.end() < right.start(),
+                "overlapping or adjacent ranges detected"
+            );
+            *left.end() + 1..=*right.start() - 1
+        });
+        covered_space.extend(between_blanks);
+
+        // end of the blanks
         if let Some(last_blank_end) = self.blanks.last().map(RangeInclusive::end) {
             if *last_blank_end != u128::MAX {
                 covered_space.push(last_blank_end + 1..=u128::MAX);
             }
         }

-        // Extend the first range and assign the null value to it.
-        let null_value = if let Some(first_covered_space) = covered_space.first_mut() {
-            // in case the first covered space ends at u128::MAX, assign null to the beginning
-            if *first_covered_space.end() == u128::MAX {
-                *first_covered_space = first_covered_space.start() - 1..=*first_covered_space.end();
-                *first_covered_space.start()
-            } else {
-                *first_covered_space = *first_covered_space.start()..=first_covered_space.end() + 1;
-                *first_covered_space.end()
-            }
-        } else {
+        if covered_space.is_empty() {
             covered_space.push(0..=0); // empty data case
-            0u128
         };

-        let mut compact_start: u64 = 0;
+        let mut compact_start: u64 = 1; // 0 is reserved for `null`
         let mut ranges_mapping: Vec<RangeMapping> = Vec::with_capacity(covered_space.len());
         for cov in covered_space {
             let range_mapping = super::RangeMapping {
                 value_range: cov,
                 compact_start,
             };
             let covered_range_len = range_mapping.range_length();
             ranges_mapping.push(range_mapping);
             compact_start += covered_range_len as u64;
         }
-        CompactSpace {
-            ranges_mapping,
-            null_value,
-        }
+        CompactSpace { ranges_mapping }
     }
 }

From 570009b5b1343d20f1da414a36f8ecd2ec490948 Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Tue, 13 Sep 2022 16:45:35 +0800
Subject: [PATCH 17/26] move to mod.rs

---
 fastfield_codecs/src/{compact_space.rs => compact_space/mod.rs} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename fastfield_codecs/src/{compact_space.rs => compact_space/mod.rs} (100%)

diff --git a/fastfield_codecs/src/compact_space.rs b/fastfield_codecs/src/compact_space/mod.rs
similarity index 100%
rename from fastfield_codecs/src/compact_space.rs
rename to fastfield_codecs/src/compact_space/mod.rs

From 592caeefa053c8e8d59943f76d6f30b66e8b6075 Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Wed, 14 Sep 2022 18:34:37 +0800
Subject: [PATCH 18/26] renames

---
 .../src/compact_space/build_compact_space.rs |  4 ++--
 fastfield_codecs/src/compact_space/mod.rs    | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fastfield_codecs/src/compact_space/build_compact_space.rs b/fastfield_codecs/src/compact_space/build_compact_space.rs
index b72ea2243..8cdf589f7 100644
--- a/fastfield_codecs/src/compact_space/build_compact_space.rs
+++ b/fastfield_codecs/src/compact_space/build_compact_space.rs
@@ -63,7 +63,7 @@ impl BlankCollector {
     fn staged_blanks_sum(&self) -> u128 {
         self.staged_blanks_sum
     }
-
fn num_blanks(&self) -> usize { + fn num_staged_blanks(&self) -> usize { self.blanks.len() } } @@ -103,7 +103,7 @@ pub fn get_compact_space( let saved_bits = (amplitude_bits - amplitude_new_bits) as usize * total_num_values; // TODO: Maybe calculate exact cost of blanks and run this more expensive computation only, // when amplitude_new_bits changes - let cost = blank_collector.num_blanks() * cost_per_blank; + let cost = blank_collector.num_staged_blanks() * cost_per_blank; if cost >= saved_bits { // Continue here, since although we walk over the blanks by size, // we can potentially save a lot at the last bits, which are smaller blanks diff --git a/fastfield_codecs/src/compact_space/mod.rs b/fastfield_codecs/src/compact_space/mod.rs index 63479494b..d64bd923e 100644 --- a/fastfield_codecs/src/compact_space/mod.rs +++ b/fastfield_codecs/src/compact_space/mod.rs @@ -405,7 +405,7 @@ impl CompactSpaceDecompressor { let step_size = 4; let cutoff = self.params.num_vals - self.params.num_vals % step_size; - let mut add_if_in_range = |idx, val| { + let mut push_if_in_range = |idx, val| { if range.contains(&val) { positions.push(idx); } @@ -421,15 +421,15 @@ impl CompactSpaceDecompressor { let val2 = get_val(idx2); let val3 = get_val(idx3); let val4 = get_val(idx4); - add_if_in_range(idx1, val1); - add_if_in_range(idx2, val2); - add_if_in_range(idx3, val3); - add_if_in_range(idx4, val4); + push_if_in_range(idx1, val1); + push_if_in_range(idx2, val2); + push_if_in_range(idx3, val3); + push_if_in_range(idx4, val4); } // handle rest for idx in cutoff..self.params.num_vals { - add_if_in_range(idx, get_val(idx)); + push_if_in_range(idx, get_val(idx)); } positions From 237b64025eebf3da343e2000a674f7e3448056fc Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 15 Sep 2022 17:10:37 +0800 Subject: [PATCH 19/26] take ColumnV2 as parameter improve algorithm stricter assertions improve names --- fastfield_codecs/src/column.rs | 36 +++++ .../src/compact_space/build_compact_space.rs | 73 +++++----- fastfield_codecs/src/compact_space/mod.rs | 137 +++++++++--------- fastfield_codecs/src/main.rs | 5 +- 4 files changed, 145 insertions(+), 106 deletions(-) diff --git a/fastfield_codecs/src/column.rs b/fastfield_codecs/src/column.rs index 00e0c092b..02dbb804f 100644 --- a/fastfield_codecs/src/column.rs +++ b/fastfield_codecs/src/column.rs @@ -152,6 +152,42 @@ impl<'a, T: Copy + PartialOrd + Send + Sync> Column for VecColumn<'a, T> { } } +impl<'a, T: Copy + PartialOrd> ColumnV2 for VecColumn<'a, T> { + fn get_val(&self, position: u64) -> Option { + Some(self.values[position as usize]) + } + + fn min_value(&self) -> T { + self.min_value + } + + fn max_value(&self) -> T { + self.max_value + } + + fn num_vals(&self) -> u64 { + self.values.len() as u64 + } +} + +impl<'a, T: Copy + PartialOrd> ColumnV2 for VecColumn<'a, Option> { + fn get_val(&self, position: u64) -> Option { + self.values[position as usize] + } + + fn min_value(&self) -> T { + self.min_value.unwrap() + } + + fn max_value(&self) -> T { + self.max_value.unwrap() + } + + fn num_vals(&self) -> u64 { + self.values.len() as u64 + } +} + impl<'a, T: Copy + Ord + Default, V> From<&'a V> for VecColumn<'a, T> where V: AsRef<[T]> + ?Sized { diff --git a/fastfield_codecs/src/compact_space/build_compact_space.rs b/fastfield_codecs/src/compact_space/build_compact_space.rs index 8cdf589f7..3b7006500 100644 --- a/fastfield_codecs/src/compact_space/build_compact_space.rs +++ b/fastfield_codecs/src/compact_space/build_compact_space.rs @@ -1,4 +1,5 @@ use 
std::collections::{BTreeSet, BinaryHeap}; +use std::iter; use std::ops::RangeInclusive; use itertools::Itertools; @@ -9,35 +10,16 @@ use super::{CompactSpace, RangeMapping}; /// Put the blanks for the sorted values into a binary heap fn get_blanks(values_sorted: &BTreeSet) -> BinaryHeap { let mut blanks: BinaryHeap = BinaryHeap::new(); - let mut add_range = |blank_range: RangeInclusive| { - let blank_range: Result = blank_range.try_into(); - if let Ok(blank_range) = blank_range { - blanks.push(blank_range); - } - }; for (first, second) in values_sorted.iter().tuple_windows() { // Correctness Overflow: the values are deduped and sorted (BTreeSet property), that means // there's always space between two values. let blank_range = first + 1..=second - 1; - add_range(blank_range); - } - - // Replace after stabilization of https://github.com/rust-lang/rust/issues/62924 - // Add preceeding range if values don't start at 0 - if let Some(first_val) = values_sorted.iter().next() { - if *first_val != 0 { - let blank_range = 0..=first_val - 1; - add_range(blank_range); + let blank_range: Result = blank_range.try_into(); + if let Ok(blank_range) = blank_range { + blanks.push(blank_range); } } - // Add succeeding range if values don't end at u128::MAX - if let Some(last_val) = values_sorted.iter().last() { - if *last_val != u128::MAX { - let blank_range = last_val + 1..=u128::MAX; - add_range(blank_range); - } - } blanks } @@ -75,32 +57,46 @@ fn num_bits(val: u128) -> u8 { /// metadata. pub fn get_compact_space( values_deduped_sorted: &BTreeSet, - total_num_values: usize, + total_num_values: u64, cost_per_blank: usize, ) -> CompactSpace { - let mut compact_space = CompactSpaceBuilder::new(); + let mut compact_space_builder = CompactSpaceBuilder::new(); if values_deduped_sorted.is_empty() { - return compact_space.finish(); + return compact_space_builder.finish(); } let mut blanks: BinaryHeap = get_blanks(values_deduped_sorted); - let mut amplitude_compact_space = u128::MAX; + // Replace after stabilization of https://github.com/rust-lang/rust/issues/62924 + + // We start by space that's limited to min_value..=max_value + let min_value = *values_deduped_sorted.iter().next().unwrap_or(&0); + let max_value = *values_deduped_sorted.iter().last().unwrap_or(&0); + + // +1 for null, in case min and max covers the whole space, we are off by one. + let mut amplitude_compact_space = (max_value - min_value).saturating_add(1); + if min_value != 0 { + compact_space_builder.add_blanks(iter::once(0..=min_value - 1)); + } + if max_value != u128::MAX { + compact_space_builder.add_blanks(iter::once(max_value + 1..=u128::MAX)); + } + let mut amplitude_bits: u8 = num_bits(amplitude_compact_space); let mut blank_collector = BlankCollector::new(); - // We will stage blanks until they reduce the compact space by 1 bit. + // We will stage blanks until they reduce the compact space by at least 1 bit and then flush + // them if the metadata cost is lower than the total number of saved bits. 
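+    // Rough worked example (assumed numbers, for illustration only): with 1_000_000 values,
+    // shrinking the amplitude from 33 bits to 32 bits saves 1_000_000 bits in total, while
+    // staging e.g. 10 blanks costs 10 * COST_PER_BLANK_IN_BITS = 360 bits of metadata, so
+    // in that case the staged blanks are flushed.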
    // Binary heap to process the gaps by their size
     while let Some(blank_range) = blanks.pop() {
         blank_collector.stage_blank(blank_range);

         let staged_spaces_sum: u128 = blank_collector.staged_blanks_sum();
-        // +1 for later added null value
-        let amplitude_new_compact_space = amplitude_compact_space - staged_spaces_sum + 1;
+        let amplitude_new_compact_space = amplitude_compact_space - staged_spaces_sum;
         let amplitude_new_bits = num_bits(amplitude_new_compact_space);
         if amplitude_bits == amplitude_new_bits {
             continue;
         }
-        let saved_bits = (amplitude_bits - amplitude_new_bits) as usize * total_num_values;
+        let saved_bits = (amplitude_bits - amplitude_new_bits) as usize * total_num_values as usize;
         // TODO: Maybe calculate exact cost of blanks and run this more expensive computation only,
         // when amplitude_new_bits changes
         let cost = blank_collector.num_staged_blanks() * cost_per_blank;
         if cost >= saved_bits {
             // Continue here, since although we walk over the blanks by size,
             // we can potentially save a lot at the last bits, which are smaller blanks
@@ -116,7 +112,7 @@ pub fn get_compact_space(
         amplitude_compact_space = amplitude_new_compact_space;
         amplitude_bits = amplitude_new_bits;
-        compact_space.add_blanks(blank_collector.drain().map(|blank| blank.blank_range()));
+        compact_space_builder.add_blanks(blank_collector.drain().map(|blank| blank.blank_range()));
     }

     // special case, when we didn't collect any blanks because:
@@ -126,8 +122,8 @@ pub fn get_compact_space(
     // We drain one collected blank unconditionally, so the empty case is reserved for empty
     // data, and therefore empty compact_space means the data is empty and no data is covered
     // (conversely to all data) and we can assign null to it.
-    if compact_space.is_empty() {
-        compact_space.add_blanks(
+    if compact_space_builder.is_empty() {
+        compact_space_builder.add_blanks(
             blank_collector
                 .drain()
                 .map(|blank| blank.blank_range())
@@ -135,7 +131,14 @@ pub fn get_compact_space(
         );
     }

-    compact_space.finish()
+    let compact_space = compact_space_builder.finish();
+    if max_value - min_value != u128::MAX {
+        debug_assert_eq!(
+            compact_space.amplitude_compact_space(),
+            amplitude_compact_space
+        );
+    }
+    compact_space
 }

 #[derive(Debug, Clone, Eq, PartialEq)]
@@ -146,7 +149,7 @@ struct CompactSpaceBuilder {

 impl CompactSpaceBuilder {
     /// Creates a new compact space builder which will initially cover the whole space.
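    /// Blanks added via `add_blanks` are carved out of that space; `finish` then inverts
    /// the accumulated blanks into the covered ranges that make up the compact space.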
fn new() -> Self { - Self { blanks: vec![] } + Self { blanks: Vec::new() } } /// Assumes that repeated add_blank calls don't overlap and are not adjacent, diff --git a/fastfield_codecs/src/compact_space/mod.rs b/fastfield_codecs/src/compact_space/mod.rs index d64bd923e..7428ca8e7 100644 --- a/fastfield_codecs/src/compact_space/mod.rs +++ b/fastfield_codecs/src/compact_space/mod.rs @@ -123,7 +123,7 @@ impl CompactSpace { fn amplitude_compact_space(&self) -> u128 { self.ranges_mapping .last() - .map(|last_range| last_range.compact_end() as u128 + 1) + .map(|last_range| last_range.compact_end() as u128) .unwrap_or(1) // compact space starts at 1, 0 == null } @@ -133,7 +133,7 @@ impl CompactSpace { /// Returns either Ok(the value in the compact space) or if it is outside the compact space the /// Err(position where it would be inserted) - fn to_compact(&self, value: u128) -> Result { + fn u128_to_compact(&self, value: u128) -> Result { self.ranges_mapping .binary_search_by(|probe| { let value_range = &probe.value_range; @@ -153,7 +153,7 @@ impl CompactSpace { } /// Unpacks a value from compact space u64 to u128 space - fn unpack(&self, compact: u64) -> u128 { + fn compact_to_u128(&self, compact: u64) -> u128 { let pos = self .ranges_mapping .binary_search_by_key(&compact, |range_mapping| range_mapping.compact_start) @@ -182,14 +182,39 @@ pub struct IPCodecParams { impl CompactSpaceCompressor { /// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals. - pub fn train_from( - vals: impl Iterator, - total_num_values_incl_nulls: usize, - ) -> Self { - let mut tree = BTreeSet::new(); - tree.extend(vals); - assert!(tree.len() <= total_num_values_incl_nulls); - train(&tree, total_num_values_incl_nulls) + pub fn train_from(column: impl ColumnV2) -> Self { + let mut values_sorted = BTreeSet::new(); + values_sorted.extend(column.iter().flatten()); + let total_num_values = column.num_vals(); + + let compact_space = + get_compact_space(&values_sorted, total_num_values, COST_PER_BLANK_IN_BITS); + let amplitude_compact_space = compact_space.amplitude_compact_space(); + + assert!( + amplitude_compact_space <= u64::MAX as u128, + "case unsupported." + ); + + let num_bits = tantivy_bitpacker::compute_num_bits(amplitude_compact_space as u64); + let min_value = *values_sorted.iter().next().unwrap_or(&0); + let max_value = *values_sorted.iter().last().unwrap_or(&0); + assert_eq!( + compact_space + .u128_to_compact(max_value) + .expect("could not convert max value to compact space"), + amplitude_compact_space as u64 + ); + CompactSpaceCompressor { + params: IPCodecParams { + compact_space, + bit_unpacker: BitUnpacker::new(num_bits), + min_value, + max_value, + num_vals: total_num_values as u64, + num_bits, + }, + } } fn write_footer(self, writer: &mut impl Write) -> io::Result<()> { @@ -216,12 +241,15 @@ impl CompactSpaceCompressor { let mut bitpacker = BitPacker::default(); for val in vals { let compact = if let Some(val) = val { - self.params.compact_space.to_compact(val).map_err(|_| { - io::Error::new( - io::ErrorKind::InvalidData, - "Could not convert value to compact_space. This is a bug.", - ) - })? + self.params + .compact_space + .u128_to_compact(val) + .map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidData, + "Could not convert value to compact_space. This is a bug.", + ) + })? 
} else { NULL_VALUE_COMPACT_SPACE }; @@ -233,36 +261,6 @@ impl CompactSpaceCompressor { } } -fn train(values_sorted: &BTreeSet, total_num_values: usize) -> CompactSpaceCompressor { - let compact_space = get_compact_space(values_sorted, total_num_values, COST_PER_BLANK_IN_BITS); - let amplitude_compact_space = compact_space.amplitude_compact_space(); - - assert!( - amplitude_compact_space <= u64::MAX as u128, - "case unsupported." - ); - - let num_bits = tantivy_bitpacker::compute_num_bits(amplitude_compact_space as u64); - let min_value = *values_sorted.iter().next().unwrap_or(&0); - let max_value = *values_sorted.iter().last().unwrap_or(&0); - assert!( - compact_space - .to_compact(max_value) - .expect("could not convert max value to compact space") - < amplitude_compact_space as u64 - ); - CompactSpaceCompressor { - params: IPCodecParams { - compact_space, - bit_unpacker: BitUnpacker::new(num_bits), - min_value, - max_value, - num_vals: total_num_values as u64, - num_bits, - }, - } -} - #[derive(Debug, Clone)] pub struct CompactSpaceDecompressor { data: OwnedBytes, @@ -353,12 +351,12 @@ impl CompactSpaceDecompressor { /// /// and we want a mapping for 1005, there is no equivalent compact space. We instead return an /// error with the index of the next range. - fn to_compact(&self, value: u128) -> Result { - self.params.compact_space.to_compact(value) + fn u128_to_compact(&self, value: u128) -> Result { + self.params.compact_space.u128_to_compact(value) } fn compact_to_u128(&self, compact: u64) -> u128 { - self.params.compact_space.unpack(compact) + self.params.compact_space.compact_to_u128(compact) } /// Comparing on compact space: 1.08 GElements/s, which equals a throughput of 17,3 Gb/s @@ -372,8 +370,8 @@ impl CompactSpaceDecompressor { let from_value = *range.start(); let to_value = *range.end(); assert!(to_value >= from_value); - let compact_from = self.to_compact(from_value); - let compact_to = self.to_compact(to_value); + let compact_from = self.u128_to_compact(from_value); + let compact_to = self.u128_to_compact(to_value); // Quick return, if both ranges fall into the same non-mapped space, the range can't cover // any values, so we can early exit @@ -477,6 +475,7 @@ impl CompactSpaceDecompressor { mod tests { use super::*; + use crate::VecColumn; #[test] fn compact_space_test() { @@ -485,12 +484,12 @@ mod tests { ] .into_iter() .collect(); - let compact_space = get_compact_space(ips, ips.len(), 11); + let compact_space = get_compact_space(ips, ips.len() as u64, 11); let amplitude = compact_space.amplitude_compact_space(); - assert_eq!(amplitude, 20); - assert_eq!(3, compact_space.to_compact(2).unwrap()); - assert_eq!(4, compact_space.to_compact(3).unwrap()); - assert_eq!(compact_space.to_compact(100).unwrap_err(), 1); + assert_eq!(amplitude, 17); + assert_eq!(1, compact_space.u128_to_compact(2).unwrap()); + assert_eq!(2, compact_space.u128_to_compact(3).unwrap()); + assert_eq!(compact_space.u128_to_compact(100).unwrap_err(), 1); for (num1, num2) in (0..3).tuple_windows() { assert_eq!( @@ -508,17 +507,17 @@ mod tests { ); for ip in ips { - let compact = compact_space.to_compact(*ip).unwrap(); - assert_eq!(compact_space.unpack(compact), *ip); + let compact = compact_space.u128_to_compact(*ip).unwrap(); + assert_eq!(compact_space.compact_to_u128(compact), *ip); } } #[test] fn compact_space_amplitude_test() { let ips = &[100000u128, 1000000].into_iter().collect(); - let compact_space = get_compact_space(ips, ips.len(), 1); + let compact_space = get_compact_space(ips, ips.len() as 
u64, 1); let amplitude = compact_space.amplitude_compact_space(); - assert_eq!(amplitude, 3); + assert_eq!(amplitude, 2); } fn test_all(data: OwnedBytes, expected: &[Option]) { @@ -547,10 +546,7 @@ mod tests { } fn test_aux_vals_opt(u128_vals: &[Option]) -> OwnedBytes { - let compressor = CompactSpaceCompressor::train_from( - u128_vals.iter().cloned().flatten(), - u128_vals.len(), - ); + let compressor = CompactSpaceCompressor::train_from(VecColumn::from(u128_vals)); let data = compressor.compress(u128_vals.iter().cloned()).unwrap(); let data = OwnedBytes::new(data); test_all(data.clone(), u128_vals); @@ -628,9 +624,8 @@ mod tests { #[test] fn test_null() { - let vals = &[2u128]; - let compressor = CompactSpaceCompressor::train_from(vals.iter().cloned(), 2); let vals = vec![None, Some(2u128)]; + let compressor = CompactSpaceCompressor::train_from(VecColumn::from(&vals)); let data = compressor.compress(vals.iter().cloned()).unwrap(); let decomp = CompactSpaceDecompressor::open(OwnedBytes::new(data)).unwrap(); let positions = decomp.get_range(0..=1); @@ -668,7 +663,7 @@ mod tests { 1_000_000, 5_000_000_000, ]; - let compressor = CompactSpaceCompressor::train_from(vals.iter().cloned(), vals.len()); + let compressor = CompactSpaceCompressor::train_from(VecColumn::from(vals)); let data = compressor.compress(vals.iter().cloned().map(Some)).unwrap(); let decomp = CompactSpaceDecompressor::open(OwnedBytes::new(data)).unwrap(); @@ -696,6 +691,12 @@ mod tests { let _data = test_aux_vals(vals); } + #[test] + fn test_bug4() { + let vals = &[340282366920938463463374607431768211455, 0]; + let _data = test_aux_vals(vals); + } + #[test] fn test_first_large_gaps() { let vals = &[1_000_000_000u128; 100]; diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs index 11d166816..ef39df0b9 100644 --- a/fastfield_codecs/src/main.rs +++ b/fastfield_codecs/src/main.rs @@ -93,8 +93,7 @@ fn bench_ip() { { let mut data = vec![]; for dataset in dataset.chunks(50_000) { - let compressor = - CompactSpaceCompressor::train_from(dataset.iter().cloned(), dataset.len()); + let compressor = CompactSpaceCompressor::train_from(VecColumn::from(dataset)); compressor .compress_into(dataset.iter().cloned().map(Some), &mut data) .unwrap(); @@ -107,7 +106,7 @@ fn bench_ip() { ); } - let compressor = CompactSpaceCompressor::train_from(dataset.iter().cloned(), dataset.len()); + let compressor = CompactSpaceCompressor::train_from(VecColumn::from(&dataset)); let data = compressor .compress(dataset.iter().cloned().map(Some)) .unwrap(); From 9f610b25af308db5613eb1316920b570f1d0859e Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 15 Sep 2022 20:37:47 +0800 Subject: [PATCH 20/26] fix benches, add benches --- fastfield_codecs/benches/bench.rs | 9 +++- fastfield_codecs/src/lib.rs | 84 ++++++++++++++++++++++++++----- fastfield_codecs/src/serialize.rs | 2 +- 3 files changed, 79 insertions(+), 16 deletions(-) diff --git a/fastfield_codecs/benches/bench.rs b/fastfield_codecs/benches/bench.rs index c30df44e5..5a4227dca 100644 --- a/fastfield_codecs/benches/bench.rs +++ b/fastfield_codecs/benches/bench.rs @@ -28,12 +28,14 @@ mod tests { } fn get_reader_for_bench(data: &[u64]) -> Codec::Reader { let mut bytes = Vec::new(); + let min_value = *data.iter().min().unwrap(); + let data = data.iter().map(|el| *el - min_value).collect::>(); let col = VecColumn::from(&data); let normalized_header = fastfield_codecs::NormalizedHeader { num_vals: col.num_vals(), max_value: col.max_value(), }; - Codec::serialize(&VecColumn::from(data), 
&mut bytes).unwrap(); + Codec::serialize(&VecColumn::from(&data), &mut bytes).unwrap(); Codec::open_from_bytes(OwnedBytes::new(bytes), normalized_header).unwrap() } fn bench_get(b: &mut Bencher, data: &[u64]) { @@ -65,10 +67,13 @@ mod tests { bench_get_dynamic_helper(b, col); } fn bench_create(b: &mut Bencher, data: &[u64]) { + let min_value = *data.iter().min().unwrap(); + let data = data.iter().map(|el| *el - min_value).collect::>(); + let mut bytes = Vec::new(); b.iter(|| { bytes.clear(); - Codec::serialize(&VecColumn::from(data), &mut bytes).unwrap(); + Codec::serialize(&VecColumn::from(&data), &mut bytes).unwrap(); }); } diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index 8e62c610e..d2adf6500 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -335,13 +335,14 @@ mod tests { #[cfg(all(test, feature = "unstable"))] mod bench { + use std::iter; use std::sync::Arc; + use column::ColumnV2Ext; use rand::prelude::*; use test::{self, Bencher}; use super::*; - use crate::column::ColumnV2; use crate::Column; // Warning: this generates the same permutation at each call @@ -385,31 +386,88 @@ mod bench { }); } - #[bench] - fn bench_intfastfield_jumpy_fflookup_u128(b: &mut Bencher) { + fn get_u128_column_permutation() -> Arc> { let permutation = generate_permutation(); - let n = permutation.len(); - let permutation = permutation.iter().map(|el| *el as u128).collect::>(); - - let compressor = - CompactSpaceCompressor::train_from(permutation.iter().cloned(), permutation.len()); - let data = compressor - .compress(permutation.iter().cloned().map(Some)) - .unwrap(); + let permutation = permutation + .iter() + .map(|el| *el as u128) + .map(Some) + .collect::>(); + get_u128_column(&permutation) + } + fn get_data_50percent_item() -> (u128, u128, Vec>) { + let mut permutation = generate_permutation(); + let major_item = permutation[0]; + let minor_item = permutation[1]; + permutation.extend(iter::repeat(major_item).take(permutation.len())); + permutation.shuffle(&mut StdRng::from_seed([1u8; 32])); + let permutation = permutation + .iter() + .map(|el| Some(*el as u128)) + .collect::>(); + (major_item as u128, minor_item as u128, permutation) + } + fn get_u128_column(data: &[Option]) -> Arc> { + let compressor = CompactSpaceCompressor::train_from(VecColumn::from(&data)); + let data = compressor.compress(data.iter().cloned()).unwrap(); let data = OwnedBytes::new(data); - let column: Arc> = + let column: Arc> = Arc::new(CompactSpaceDecompressor::open(data).unwrap()); + column + } + + #[bench] + fn bench_intfastfield_getrange_u128_50percent_hit(b: &mut Bencher) { + let (major_item, _minor_item, data) = get_data_50percent_item(); + let column = get_u128_column(&data); + + b.iter(|| column.get_between_vals(major_item..=major_item)); + } + + #[bench] + fn bench_intfastfield_getrange_u128_single_hit(b: &mut Bencher) { + let (_major_item, minor_item, data) = get_data_50percent_item(); + let column = get_u128_column(&data); + + b.iter(|| column.get_between_vals(minor_item..=minor_item)); + } + + #[bench] + fn bench_intfastfield_getrange_u128_hit_all(b: &mut Bencher) { + let (_major_item, _minor_item, data) = get_data_50percent_item(); + let column = get_u128_column(&data); + + b.iter(|| column.get_between_vals(0..=u128::MAX)); + } + + #[bench] + fn bench_intfastfield_jumpy_fflookup_u128(b: &mut Bencher) { + let column = get_u128_column_permutation(); b.iter(|| { let mut a = 0u128; - for _ in 0..n { + for _ in 0..column.num_vals() { a = column.get_val(a as 
u64).unwrap(); } a }); } + #[bench] + fn bench_intfastfield_jumpy_stride5_u128(b: &mut Bencher) { + let column = get_u128_column_permutation(); + + b.iter(|| { + let n = column.num_vals(); + let mut a = 0u128; + for i in (0..n / 5).map(|val| val * 5) { + a += column.get_val(i as u64).unwrap(); + } + a + }); + } + #[bench] fn bench_intfastfield_stride7_vec(b: &mut Bencher) { let permutation = generate_permutation(); diff --git a/fastfield_codecs/src/serialize.rs b/fastfield_codecs/src/serialize.rs index 4753bb443..abb82eeb1 100644 --- a/fastfield_codecs/src/serialize.rs +++ b/fastfield_codecs/src/serialize.rs @@ -215,7 +215,7 @@ pub fn serialize_and_load( column: &[T], ) -> Arc> { let mut buffer = Vec::new(); - super::serialize(VecColumn::from(column), &mut buffer, &ALL_CODEC_TYPES).unwrap(); + super::serialize(VecColumn::from(&column), &mut buffer, &ALL_CODEC_TYPES).unwrap(); super::open(OwnedBytes::new(buffer)).unwrap() } From e2e6c94ba81b9619f980952a45aa96e27d310c77 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 16 Sep 2022 13:49:02 +0800 Subject: [PATCH 21/26] remove ColumnV2 --- fastfield_codecs/src/column.rs | 81 ++------------ fastfield_codecs/src/compact_space/mod.rs | 124 +++++++--------------- fastfield_codecs/src/lib.rs | 25 ++--- fastfield_codecs/src/main.rs | 6 +- 4 files changed, 57 insertions(+), 179 deletions(-) diff --git a/fastfield_codecs/src/column.rs b/fastfield_codecs/src/column.rs index 02dbb804f..b20d39c97 100644 --- a/fastfield_codecs/src/column.rs +++ b/fastfield_codecs/src/column.rs @@ -53,43 +53,8 @@ pub trait Column: Send + Sync { } } -/// Concept of new Column API, which better accounts for null values. -pub trait ColumnV2 { - /// Return the value associated to the given idx. - /// - /// This accessor should return as fast as possible. - /// - /// # Panics - /// - /// May panic if `idx` is greater than the column length. - fn get_val(&self, idx: u64) -> Option; - - /// Returns the minimum value for this fast field. - /// - /// This min_value may not be exact. - /// For instance, the min value does not take in account of possible - /// deleted document. All values are however guaranteed to be higher than - /// `.min_value()`. - fn min_value(&self) -> T; - - /// Returns the maximum value for this fast field. - /// - /// This max_value may not be exact. - /// For instance, the max value does not take in account of possible - /// deleted document. All values are however guaranteed to be higher than - /// `.max_value()`. - fn max_value(&self) -> T; - - fn num_vals(&self) -> u64; - - /// Returns a iterator over the data - fn iter<'a>(&'a self) -> Box> + 'a> { - Box::new((0..self.num_vals()).map(|idx| self.get_val(idx))) - } -} - -/// Extend ColumnV2 Api -pub trait ColumnV2Ext: ColumnV2 { +/// Extend Column Api +pub trait ColumnExt: Column { /// Return the positions of values which are in the provided range. 
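    /// For example, for a column with values `[5, 7, 9]` and the range `6..=9`, the
    /// returned positions would be `[1, 2]`.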
fn get_between_vals(&self, range: RangeInclusive) -> Vec; } @@ -152,44 +117,9 @@ impl<'a, T: Copy + PartialOrd + Send + Sync> Column for VecColumn<'a, T> { } } -impl<'a, T: Copy + PartialOrd> ColumnV2 for VecColumn<'a, T> { - fn get_val(&self, position: u64) -> Option { - Some(self.values[position as usize]) - } - - fn min_value(&self) -> T { - self.min_value - } - - fn max_value(&self) -> T { - self.max_value - } - - fn num_vals(&self) -> u64 { - self.values.len() as u64 - } -} - -impl<'a, T: Copy + PartialOrd> ColumnV2 for VecColumn<'a, Option> { - fn get_val(&self, position: u64) -> Option { - self.values[position as usize] - } - - fn min_value(&self) -> T { - self.min_value.unwrap() - } - - fn max_value(&self) -> T { - self.max_value.unwrap() - } - - fn num_vals(&self) -> u64 { - self.values.len() as u64 - } -} - impl<'a, T: Copy + Ord + Default, V> From<&'a V> for VecColumn<'a, T> -where V: AsRef<[T]> + ?Sized +where + V: AsRef<[T]> + ?Sized, { fn from(values: &'a V) -> Self { let values = values.as_ref(); @@ -287,7 +217,8 @@ where pub struct IterColumn(T); impl From for IterColumn -where T: Iterator + Clone + ExactSizeIterator +where + T: Iterator + Clone + ExactSizeIterator, { fn from(iter: T) -> Self { IterColumn(iter) diff --git a/fastfield_codecs/src/compact_space/mod.rs b/fastfield_codecs/src/compact_space/mod.rs index 7428ca8e7..2c13e508a 100644 --- a/fastfield_codecs/src/compact_space/mod.rs +++ b/fastfield_codecs/src/compact_space/mod.rs @@ -22,8 +22,8 @@ use common::{BinarySerializable, CountingWriter, VInt, VIntU128}; use ownedbytes::OwnedBytes; use tantivy_bitpacker::{self, BitPacker, BitUnpacker}; -use crate::column::{ColumnV2, ColumnV2Ext}; use crate::compact_space::build_compact_space::get_compact_space; +use crate::{column::ColumnExt, Column}; mod blank_range; mod build_compact_space; @@ -36,8 +36,6 @@ pub fn ip_to_u128(ip_addr: IpAddr) -> u128 { u128::from_be_bytes(ip_addr_v6.octets()) } -const NULL_VALUE_COMPACT_SPACE: u64 = 0; - /// The cost per blank is quite hard actually, since blanks are delta encoded, the actual cost of /// blanks depends on the number of blanks. /// @@ -182,9 +180,9 @@ pub struct IPCodecParams { impl CompactSpaceCompressor { /// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals. - pub fn train_from(column: impl ColumnV2) -> Self { + pub fn train_from(column: impl Column) -> Self { let mut values_sorted = BTreeSet::new(); - values_sorted.extend(column.iter().flatten()); + values_sorted.extend(column.iter()); let total_num_values = column.num_vals(); let compact_space = @@ -227,7 +225,7 @@ impl CompactSpaceCompressor { Ok(()) } - pub fn compress(self, vals: impl Iterator>) -> io::Result> { + pub fn compress(self, vals: impl Iterator) -> io::Result> { let mut output = vec![]; self.compress_into(vals, &mut output)?; Ok(output) @@ -235,24 +233,21 @@ impl CompactSpaceCompressor { pub fn compress_into( self, - vals: impl Iterator>, + vals: impl Iterator, write: &mut impl Write, ) -> io::Result<()> { let mut bitpacker = BitPacker::default(); for val in vals { - let compact = if let Some(val) = val { - self.params - .compact_space - .u128_to_compact(val) - .map_err(|_| { - io::Error::new( - io::ErrorKind::InvalidData, - "Could not convert value to compact_space. This is a bug.", - ) - })? - } else { - NULL_VALUE_COMPACT_SPACE - }; + let compact = self + .params + .compact_space + .u128_to_compact(val) + .map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidData, + "Could not convert value to compact_space. 
This is a bug.", + ) + })?; bitpacker.write(compact, self.params.num_bits, write)?; } bitpacker.close(write)?; @@ -302,9 +297,9 @@ impl BinarySerializable for IPCodecParams { } } -impl ColumnV2 for CompactSpaceDecompressor { +impl Column for CompactSpaceDecompressor { #[inline] - fn get_val(&self, doc: u64) -> Option { + fn get_val(&self, doc: u64) -> u128 { self.get(doc) } @@ -321,12 +316,12 @@ impl ColumnV2 for CompactSpaceDecompressor { } #[inline] - fn iter<'a>(&'a self) -> Box> + 'a> { + fn iter<'a>(&'a self) -> Box + 'a> { Box::new(self.iter()) } } -impl ColumnV2Ext for CompactSpaceDecompressor { +impl ColumnExt for CompactSpaceDecompressor { fn get_between_vals(&self, range: RangeInclusive) -> Vec { self.get_range(range) } @@ -440,26 +435,17 @@ impl CompactSpaceDecompressor { } #[inline] - fn iter(&self) -> impl Iterator> + '_ { + fn iter(&self) -> impl Iterator + '_ { // TODO: Performance. It would be better to iterate on the ranges and check existence via // the bit_unpacker. - self.iter_compact().map(|compact| { - if compact == NULL_VALUE_COMPACT_SPACE { - None - } else { - Some(self.compact_to_u128(compact)) - } - }) + self.iter_compact() + .map(|compact| self.compact_to_u128(compact)) } #[inline] - pub fn get(&self, idx: u64) -> Option { + pub fn get(&self, idx: u64) -> u128 { let compact = self.params.bit_unpacker.get(idx, &self.data); - if compact == NULL_VALUE_COMPACT_SPACE { - None - } else { - Some(self.compact_to_u128(compact)) - } + self.compact_to_u128(compact) } pub fn min_value(&self) -> u128 { @@ -520,32 +506,30 @@ mod tests { assert_eq!(amplitude, 2); } - fn test_all(data: OwnedBytes, expected: &[Option]) { + fn test_all(data: OwnedBytes, expected: &[u128]) { let decompressor = CompactSpaceDecompressor::open(data).unwrap(); for (idx, expected_val) in expected.iter().cloned().enumerate() { let val = decompressor.get(idx as u64); assert_eq!(val, expected_val); - if let Some(expected_val) = expected_val { - let test_range = |range: RangeInclusive| { - let expected_positions = expected - .iter() - .positions(|val| val.map(|val| range.contains(&val)).unwrap_or(false)) - .map(|pos| pos as u64) - .collect::>(); - let positions = decompressor.get_range(range); - assert_eq!(positions, expected_positions); - }; + let test_range = |range: RangeInclusive| { + let expected_positions = expected + .iter() + .positions(|val| range.contains(val)) + .map(|pos| pos as u64) + .collect::>(); + let positions = decompressor.get_range(range); + assert_eq!(positions, expected_positions); + }; - test_range(expected_val.saturating_sub(1)..=expected_val); - test_range(expected_val..=expected_val); - test_range(expected_val..=expected_val.saturating_add(1)); - test_range(expected_val.saturating_sub(1)..=expected_val.saturating_add(1)); - } + test_range(expected_val.saturating_sub(1)..=expected_val); + test_range(expected_val..=expected_val); + test_range(expected_val..=expected_val.saturating_add(1)); + test_range(expected_val.saturating_sub(1)..=expected_val.saturating_add(1)); } } - fn test_aux_vals_opt(u128_vals: &[Option]) -> OwnedBytes { + fn test_aux_vals(u128_vals: &[u128]) -> OwnedBytes { let compressor = CompactSpaceCompressor::train_from(VecColumn::from(u128_vals)); let data = compressor.compress(u128_vals.iter().cloned()).unwrap(); let data = OwnedBytes::new(data); @@ -553,10 +537,6 @@ mod tests { data } - fn test_aux_vals(u128_vals: &[u128]) -> OwnedBytes { - test_aux_vals_opt(&u128_vals.iter().cloned().map(Some).collect::>()) - } - #[test] fn test_range_1() { let vals = &[ @@ 
-622,30 +602,6 @@ mod tests { assert_eq!(positions, vec![0]); } - #[test] - fn test_null() { - let vals = vec![None, Some(2u128)]; - let compressor = CompactSpaceCompressor::train_from(VecColumn::from(&vals)); - let data = compressor.compress(vals.iter().cloned()).unwrap(); - let decomp = CompactSpaceDecompressor::open(OwnedBytes::new(data)).unwrap(); - let positions = decomp.get_range(0..=1); - assert_eq!(positions, vec![]); - let positions = decomp.get_range(2..=2); - assert_eq!(positions, vec![1]); - - let positions = decomp.get_range(2..=3); - assert_eq!(positions, vec![1]); - - let positions = decomp.get_range(1..=3); - assert_eq!(positions, vec![1]); - - let positions = decomp.get_range(2..=3); - assert_eq!(positions, vec![1]); - - let positions = decomp.get_range(3..=3); - assert_eq!(positions, vec![]); - } - #[test] fn test_range_3() { let vals = &[ @@ -664,7 +620,7 @@ mod tests { 5_000_000_000, ]; let compressor = CompactSpaceCompressor::train_from(VecColumn::from(vals)); - let data = compressor.compress(vals.iter().cloned().map(Some)).unwrap(); + let data = compressor.compress(vals.iter().cloned()).unwrap(); let decomp = CompactSpaceDecompressor::open(OwnedBytes::new(data)).unwrap(); assert_eq!(decomp.get_range(199..=200), vec![0]); diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index d2adf6500..ab3842cb4 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -338,7 +338,7 @@ mod bench { use std::iter; use std::sync::Arc; - use column::ColumnV2Ext; + use column::ColumnExt; use rand::prelude::*; use test::{self, Bencher}; @@ -386,33 +386,26 @@ mod bench { }); } - fn get_u128_column_permutation() -> Arc> { + fn get_u128_column_permutation() -> Arc> { let permutation = generate_permutation(); - let permutation = permutation - .iter() - .map(|el| *el as u128) - .map(Some) - .collect::>(); + let permutation = permutation.iter().map(|el| *el as u128).collect::>(); get_u128_column(&permutation) } - fn get_data_50percent_item() -> (u128, u128, Vec>) { + fn get_data_50percent_item() -> (u128, u128, Vec) { let mut permutation = generate_permutation(); let major_item = permutation[0]; let minor_item = permutation[1]; permutation.extend(iter::repeat(major_item).take(permutation.len())); permutation.shuffle(&mut StdRng::from_seed([1u8; 32])); - let permutation = permutation - .iter() - .map(|el| Some(*el as u128)) - .collect::>(); + let permutation = permutation.iter().map(|el| *el as u128).collect::>(); (major_item as u128, minor_item as u128, permutation) } - fn get_u128_column(data: &[Option]) -> Arc> { + fn get_u128_column(data: &[u128]) -> Arc> { let compressor = CompactSpaceCompressor::train_from(VecColumn::from(&data)); let data = compressor.compress(data.iter().cloned()).unwrap(); let data = OwnedBytes::new(data); - let column: Arc> = + let column: Arc> = Arc::new(CompactSpaceDecompressor::open(data).unwrap()); column } @@ -448,7 +441,7 @@ mod bench { b.iter(|| { let mut a = 0u128; for _ in 0..column.num_vals() { - a = column.get_val(a as u64).unwrap(); + a = column.get_val(a as u64); } a }); @@ -462,7 +455,7 @@ mod bench { let n = column.num_vals(); let mut a = 0u128; for i in (0..n / 5).map(|val| val * 5) { - a += column.get_val(i as u64).unwrap(); + a += column.get_val(i as u64); } a }); diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs index ef39df0b9..3ec4cf15d 100644 --- a/fastfield_codecs/src/main.rs +++ b/fastfield_codecs/src/main.rs @@ -95,7 +95,7 @@ fn bench_ip() { for dataset in dataset.chunks(50_000) { 
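            // One compressor per 50_000-value chunk: per-chunk training keeps the sorted
            // training set small, at the cost of one compact space per chunk; the ratio is
            // compared against whole-dataset training below.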
            let compressor = CompactSpaceCompressor::train_from(VecColumn::from(dataset));
            compressor
-                .compress_into(dataset.iter().cloned().map(Some), &mut data)
+                .compress_into(dataset.iter().cloned(), &mut data)
                .unwrap();
        }
        let compression = data.len() as f64 / (dataset.len() * 16) as f64;
@@ -107,9 +107,7 @@ fn bench_ip() {
     }
 
     let compressor = CompactSpaceCompressor::train_from(VecColumn::from(&dataset));
-    let data = compressor
-        .compress(dataset.iter().cloned().map(Some))
-        .unwrap();
+    let data = compressor.compress(dataset.iter().cloned()).unwrap();
 
     let compression = data.len() as f64 / (dataset.len() * 16) as f64;
     println!("Compression {:.2}", compression);

From e75472ec9a58ea474790872f64c9c7c4d935c03a Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Fri, 16 Sep 2022 14:53:38 +0800
Subject: [PATCH 22/26] add serialize_u128, open_u128, refactor

---
 fastfield_codecs/src/column.rs                |  6 +-
 .../src/compact_space/build_compact_space.rs  |  1 +
 fastfield_codecs/src/compact_space/mod.rs     | 72 +++++++++----------
 fastfield_codecs/src/lib.rs                   | 25 ++++---
 fastfield_codecs/src/main.rs                  | 18 ++---
 fastfield_codecs/src/serialize.rs             | 14 ++++
 6 files changed, 73 insertions(+), 63 deletions(-)

diff --git a/fastfield_codecs/src/column.rs b/fastfield_codecs/src/column.rs
index b20d39c97..424b29f61 100644
--- a/fastfield_codecs/src/column.rs
+++ b/fastfield_codecs/src/column.rs
@@ -118,8 +118,7 @@ impl<'a, T: Copy + PartialOrd + Send + Sync> Column<T> for VecColumn<'a, T> {
 }
 
 impl<'a, T: Copy + Ord + Default, V> From<&'a V> for VecColumn<'a, T>
-where
-    V: AsRef<[T]> + ?Sized,
+where V: AsRef<[T]> + ?Sized
 {
     fn from(values: &'a V) -> Self {
         let values = values.as_ref();
@@ -217,8 +216,7 @@ where
 pub struct IterColumn<T>(T);
 
 impl<T> From<T> for IterColumn<T>
-where
-    T: Iterator + Clone + ExactSizeIterator,
+where T: Iterator + Clone + ExactSizeIterator
 {
     fn from(iter: T) -> Self {
         IterColumn(iter)
diff --git a/fastfield_codecs/src/compact_space/build_compact_space.rs b/fastfield_codecs/src/compact_space/build_compact_space.rs
index 3b7006500..02bbe97f8 100644
--- a/fastfield_codecs/src/compact_space/build_compact_space.rs
+++ b/fastfield_codecs/src/compact_space/build_compact_space.rs
@@ -210,6 +210,7 @@ impl CompactSpaceBuilder {
             ranges_mapping.push(range_mapping);
             compact_start += covered_range_len as u64;
         }
+        // println!("num ranges {}", ranges_mapping.len());
         CompactSpace { ranges_mapping }
     }
 }
diff --git a/fastfield_codecs/src/compact_space/mod.rs b/fastfield_codecs/src/compact_space/mod.rs
index 2c13e508a..aea1933f6 100644
--- a/fastfield_codecs/src/compact_space/mod.rs
+++ b/fastfield_codecs/src/compact_space/mod.rs
@@ -22,8 +22,9 @@ use common::{BinarySerializable, CountingWriter, VInt, VIntU128};
 use ownedbytes::OwnedBytes;
 use tantivy_bitpacker::{self, BitPacker, BitUnpacker};
 
+use crate::column::ColumnExt;
 use crate::compact_space::build_compact_space::get_compact_space;
-use crate::{column::ColumnExt, Column};
+use crate::Column;
 
 mod blank_range;
 mod build_compact_space;
@@ -180,7 +181,7 @@ pub struct IPCodecParams {
 
 impl CompactSpaceCompressor {
     /// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals.
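
To make the training step concrete before the signature change below: a minimal sketch of what `train_from` first does with the column's values, using a plain iterator as a stand-in for the `Column` trait. The helper name `training_values` and the `main` driver are illustrative only, and the blank-range search that follows in the real builder is elided.

    use std::collections::BTreeSet;

    // Collect the distinct values in sorted order. A BTreeSet dedups and
    // sorts in one pass; the builder then scans consecutive entries for
    // large gaps ("blanks") that can be cut out of the compact space.
    fn training_values(vals: impl Iterator<Item = u128>) -> BTreeSet<u128> {
        vals.collect()
    }

    fn main() {
        let sorted = training_values([100u128, 101, 50_000, 50_001].into_iter());
        // The gap between 101 and 50_000 is a blank-range candidate.
        assert_eq!(
            sorted.into_iter().collect::<Vec<_>>(),
            vec![100, 101, 50_000, 50_001]
        );
    }
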
-    pub fn train_from(column: impl Column<u128>) -> Self {
+    pub fn train_from(column: &impl Column<u128>) -> Self {
         let mut values_sorted = BTreeSet::new();
         values_sorted.extend(column.iter());
         let total_num_values = column.num_vals();
@@ -225,12 +226,6 @@ impl CompactSpaceCompressor {
         Ok(())
     }
 
-    pub fn compress(self, vals: impl Iterator<Item = u128>) -> io::Result<Vec<u8>> {
-        let mut output = vec![];
-        self.compress_into(vals, &mut output)?;
-        Ok(output)
-    }
-
     pub fn compress_into(
         self,
         vals: impl Iterator<Item = u128>,
@@ -323,7 +318,7 @@ impl Column<u128> for CompactSpaceDecompressor {
 
 impl ColumnExt<u128> for CompactSpaceDecompressor {
     fn get_between_vals(&self, range: RangeInclusive<u128>) -> Vec<u64> {
-        self.get_range(range)
+        self.get_between_vals(range)
     }
 }
 
@@ -358,7 +353,7 @@ impl CompactSpaceDecompressor {
     /// (based on u128 = 16byte)
     ///
     /// Comparing on original space: .06 GElements/s (not completely optimized)
-    pub fn get_range(&self, range: RangeInclusive<u128>) -> Vec<u64> {
+    pub fn get_between_vals(&self, range: RangeInclusive<u128>) -> Vec<u64> {
         if range.start() > range.end() {
             return Vec::new();
         }
@@ -461,7 +456,7 @@ mod tests {
 
     use super::*;
-    use crate::VecColumn;
+    use crate::{open_u128, serialize_u128, VecColumn};
 
     #[test]
     fn compact_space_test() {
@@ -518,7 +513,7 @@ mod tests {
                 .positions(|val| range.contains(val))
                 .map(|pos| pos as u64)
                 .collect::<Vec<_>>();
-            let positions = decompressor.get_range(range);
+            let positions = decompressor.get_between_vals(range);
             assert_eq!(positions, expected_positions);
         };
 
@@ -530,9 +525,10 @@ mod tests {
     }
 
     fn test_aux_vals(u128_vals: &[u128]) -> OwnedBytes {
-        let compressor = CompactSpaceCompressor::train_from(VecColumn::from(u128_vals));
-        let data = compressor.compress(u128_vals.iter().cloned()).unwrap();
-        let data = OwnedBytes::new(data);
+        let mut out = Vec::new();
+        serialize_u128(VecColumn::from(u128_vals), &mut out).unwrap();
+
+        let data = OwnedBytes::new(out);
         test_all(data.clone(), u128_vals);
         data
     }
@@ -552,24 +548,24 @@ mod tests {
         ];
         let data = test_aux_vals(vals);
         let decomp = CompactSpaceDecompressor::open(data).unwrap();
-        let positions = decomp.get_range(0..=1);
+        let positions = decomp.get_between_vals(0..=1);
         assert_eq!(positions, vec![0]);
-        let positions = decomp.get_range(0..=2);
+        let positions = decomp.get_between_vals(0..=2);
         assert_eq!(positions, vec![0]);
-        let positions = decomp.get_range(0..=3);
+        let positions = decomp.get_between_vals(0..=3);
         assert_eq!(positions, vec![0, 2]);
-        assert_eq!(decomp.get_range(99999u128..=99999u128), vec![3]);
-        assert_eq!(decomp.get_range(99999u128..=100000u128), vec![3, 4]);
-        assert_eq!(decomp.get_range(99998u128..=100000u128), vec![3, 4]);
-        assert_eq!(decomp.get_range(99998u128..=99999u128), vec![3]);
-        assert_eq!(decomp.get_range(99998u128..=99998u128), vec![]);
-        assert_eq!(decomp.get_range(333u128..=333u128), vec![8]);
-        assert_eq!(decomp.get_range(332u128..=333u128), vec![8]);
-        assert_eq!(decomp.get_range(332u128..=334u128), vec![8]);
-        assert_eq!(decomp.get_range(333u128..=334u128), vec![8]);
+        assert_eq!(decomp.get_between_vals(99999u128..=99999u128), vec![3]);
+        assert_eq!(decomp.get_between_vals(99999u128..=100000u128), vec![3, 4]);
+        assert_eq!(decomp.get_between_vals(99998u128..=100000u128), vec![3, 4]);
+        assert_eq!(decomp.get_between_vals(99998u128..=99999u128), vec![3]);
+        assert_eq!(decomp.get_between_vals(99998u128..=99998u128), vec![]);
+        assert_eq!(decomp.get_between_vals(333u128..=333u128), vec![8]);
+        assert_eq!(decomp.get_between_vals(332u128..=333u128), vec![8]);
+        assert_eq!(decomp.get_between_vals(332u128..=334u128), vec![8]);
+        assert_eq!(decomp.get_between_vals(333u128..=334u128), vec![8]);
         assert_eq!(
-            decomp.get_range(4_000_211_221u128..=5_000_000_000u128),
+            decomp.get_between_vals(4_000_211_221u128..=5_000_000_000u128),
             vec![6, 7]
         );
     }
@@ -594,11 +590,11 @@ mod tests {
         ];
         let data = test_aux_vals(vals);
         let decomp = CompactSpaceDecompressor::open(data).unwrap();
-        let positions = decomp.get_range(0..=5);
+        let positions = decomp.get_between_vals(0..=5);
         assert_eq!(positions, vec![]);
-        let positions = decomp.get_range(0..=100);
+        let positions = decomp.get_between_vals(0..=100);
         assert_eq!(positions, vec![0]);
-        let positions = decomp.get_range(0..=105);
+        let positions = decomp.get_between_vals(0..=105);
         assert_eq!(positions, vec![0]);
     }
 
@@ -619,14 +615,14 @@ mod tests {
             1_000_000,
             5_000_000_000,
         ];
-        let compressor = CompactSpaceCompressor::train_from(VecColumn::from(vals));
-        let data = compressor.compress(vals.iter().cloned()).unwrap();
-        let decomp = CompactSpaceDecompressor::open(OwnedBytes::new(data)).unwrap();
+        let mut out = Vec::new();
+        serialize_u128(VecColumn::from(vals), &mut out).unwrap();
+        let decomp = open_u128(OwnedBytes::new(out)).unwrap();
 
-        assert_eq!(decomp.get_range(199..=200), vec![0]);
-        assert_eq!(decomp.get_range(199..=201), vec![0, 1]);
-        assert_eq!(decomp.get_range(200..=200), vec![0]);
-        assert_eq!(decomp.get_range(1_000_000..=1_000_000), vec![11]);
+        assert_eq!(decomp.get_between_vals(199..=200), vec![0]);
+        assert_eq!(decomp.get_between_vals(199..=201), vec![0, 1]);
+        assert_eq!(decomp.get_between_vals(200..=200), vec![0]);
+        assert_eq!(decomp.get_between_vals(1_000_000..=1_000_000), vec![11]);
     }
 
     #[test]
diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs
index ab3842cb4..dc1e601fb 100644
--- a/fastfield_codecs/src/lib.rs
+++ b/fastfield_codecs/src/lib.rs
@@ -11,7 +11,9 @@ use std::io;
 use std::io::Write;
 use std::sync::Arc;
 
+use column::ColumnExt;
 use common::BinarySerializable;
+use compact_space::CompactSpaceDecompressor;
 use ownedbytes::OwnedBytes;
 use serialize::Header;
 
@@ -29,11 +31,12 @@ mod serialize;
 
 use self::bitpacked::BitpackedCodec;
 use self::blockwise_linear::BlockwiseLinearCodec;
 pub use self::column::{monotonic_map_column, Column, VecColumn};
-pub use self::compact_space::{ip_to_u128, CompactSpaceCompressor, CompactSpaceDecompressor};
+pub use self::compact_space::ip_to_u128;
 use self::linear::LinearCodec;
 pub use self::monotonic_mapping::MonotonicallyMappableToU64;
-use self::serialize::NormalizedHeader;
-pub use self::serialize::{estimate, serialize, serialize_and_load};
+pub use self::serialize::{
+    estimate, serialize, serialize_and_load, serialize_u128, NormalizedHeader,
+};
 
@@ -71,6 +74,11 @@ impl FastFieldCodecType {
     }
 }
 
+/// Returns the u128 codec reader (currently only the compact space codec) wrapped in the `Arc`.
+pub fn open_u128(bytes: OwnedBytes) -> io::Result<Arc<dyn ColumnExt<u128>>> {
+    Ok(Arc::new(CompactSpaceDecompressor::open(bytes)?))
+}
+
 /// Returns the correct codec reader wrapped in the `Arc` for the data.
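
As a rough usage sketch of how the two entry points added here pair up: the `roundtrip` helper below is hypothetical, assumes a non-empty input slice, and relies only on the crate layout of this series, with `serialize_u128` training the compact space codec and `open_u128` returning the column reader.

    use std::io;

    use fastfield_codecs::{open_u128, serialize_u128, VecColumn};
    use ownedbytes::OwnedBytes;

    fn roundtrip(vals: &[u128]) -> io::Result<()> {
        // Serialization trains the compact-space codec on the column and
        // writes the compressed representation in one pass.
        let mut buf = Vec::new();
        serialize_u128(VecColumn::from(vals), &mut buf)?;

        // Opening hands back a reader implementing the column traits.
        let column = open_u128(OwnedBytes::new(buf))?;
        assert_eq!(column.num_vals(), vals.len() as u64);
        assert_eq!(column.get_val(0), vals[0]); // panics if `vals` is empty
        Ok(())
    }
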
 pub fn open<T: MonotonicallyMappableToU64>(
     mut bytes: OwnedBytes,
@@ -401,13 +409,10 @@ mod bench {
         (major_item as u128, minor_item as u128, permutation)
     }
 
     fn get_u128_column(data: &[u128]) -> Arc<dyn ColumnExt<u128>> {
-        let compressor = CompactSpaceCompressor::train_from(VecColumn::from(&data));
-        let data = compressor.compress(data.iter().cloned()).unwrap();
-        let data = OwnedBytes::new(data);
-
-        let column: Arc<dyn ColumnExt<u128>> =
-            Arc::new(CompactSpaceDecompressor::open(data).unwrap());
-        column
+        let mut out = vec![];
+        serialize_u128(VecColumn::from(&data), &mut out).unwrap();
+        let out = OwnedBytes::new(out);
+        open_u128(out).unwrap()
     }
 
     #[bench]
diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs
index 3ec4cf15d..28de82f50 100644
--- a/fastfield_codecs/src/main.rs
+++ b/fastfield_codecs/src/main.rs
@@ -7,8 +7,7 @@ use std::net::{IpAddr, Ipv6Addr};
 use std::str::FromStr;
 
 use fastfield_codecs::{
-    Column, CompactSpaceCompressor, CompactSpaceDecompressor, FastFieldCodecType, FastFieldStats,
-    VecColumn,
+    open_u128, serialize_u128, Column, FastFieldCodecType, FastFieldStats, VecColumn,
 };
 use itertools::Itertools;
 use measure_time::print_time;
@@ -92,11 +91,8 @@ fn bench_ip() {
     // Chunks
     {
         let mut data = vec![];
-        for dataset in dataset.chunks(50_000) {
-            let compressor = CompactSpaceCompressor::train_from(VecColumn::from(dataset));
-            compressor
-                .compress_into(dataset.iter().cloned(), &mut data)
-                .unwrap();
+        for dataset in dataset.chunks(500_000) {
+            serialize_u128(VecColumn::from(dataset), &mut data).unwrap();
         }
         let compression = data.len() as f64 / (dataset.len() * 16) as f64;
-        println!("Compression 50_000 chunks {:.4}", compression);
+        println!("Compression 500_000 chunks {:.4}", compression);
@@ -106,8 +102,8 @@ fn bench_ip() {
     }
 
-    let compressor = CompactSpaceCompressor::train_from(VecColumn::from(&dataset));
-    let data = compressor.compress(dataset.iter().cloned()).unwrap();
+    let mut data = vec![];
+    serialize_u128(VecColumn::from(&dataset), &mut data).unwrap();
 
     let compression = data.len() as f64 / (dataset.len() * 16) as f64;
     println!("Compression {:.2}", compression);
@@ -116,11 +112,11 @@ fn bench_ip() {
         (data.len() * 8) as f32 / dataset.len() as f32
     );
 
-    let decompressor = CompactSpaceDecompressor::open(OwnedBytes::new(data)).unwrap();
+    let decompressor = open_u128(OwnedBytes::new(data)).unwrap();
 
     // Sample some ranges
     for value in dataset.iter().take(1110).skip(1100).cloned() {
         print_time!("get range");
-        let doc_values = decompressor.get_range(value..=value);
+        let doc_values = decompressor.get_between_vals(value..=value);
         println!("{:?}", doc_values.len());
     }
 }
diff --git a/fastfield_codecs/src/serialize.rs b/fastfield_codecs/src/serialize.rs
index abb82eeb1..92f55f5d0 100644
--- a/fastfield_codecs/src/serialize.rs
+++ b/fastfield_codecs/src/serialize.rs
@@ -28,6 +28,7 @@ use ownedbytes::OwnedBytes;
 
 use crate::bitpacked::BitpackedCodec;
 use crate::blockwise_linear::BlockwiseLinearCodec;
+use crate::compact_space::CompactSpaceCompressor;
 use crate::linear::LinearCodec;
 use crate::{
     monotonic_map_column, Column, FastFieldCodec, FastFieldCodecType, MonotonicallyMappableToU64,
@@ -141,6 +142,19 @@ pub fn estimate<T: MonotonicallyMappableToU64>(
     }
 }
 
+pub fn serialize_u128(
+    typed_column: impl Column<u128>,
+    output: &mut impl io::Write,
+) -> io::Result<()> {
+    // TODO write header, to later support more codecs
+    let compressor = CompactSpaceCompressor::train_from(&typed_column);
+    compressor.compress_into(typed_column.iter(), output)?;
+
+    Ok(())
+}
+
 pub fn serialize<T: MonotonicallyMappableToU64>(
     typed_column: impl Column<T>,
     output: &mut impl io::Write,

From 12856d80fa88034c2d3e04cf073cb3d6d6f109da Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Fri, 16 Sep 2022 15:42:59 +0800
Subject: [PATCH 23/26] change bench, update numbers

---
 fastfield_codecs/src/compact_space/mod.rs |  6 +++---
 fastfield_codecs/src/lib.rs               | 18 +++++++++++++++---
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/fastfield_codecs/src/compact_space/mod.rs b/fastfield_codecs/src/compact_space/mod.rs
index aea1933f6..cb5b83fb3 100644
--- a/fastfield_codecs/src/compact_space/mod.rs
+++ b/fastfield_codecs/src/compact_space/mod.rs
@@ -349,10 +349,10 @@ impl CompactSpaceDecompressor {
         self.params.compact_space.compact_to_u128(compact)
     }
 
-    /// Comparing on compact space: 1.08 GElements/s, which equals a throughput of 17,3 Gb/s
-    /// (based on u128 = 16byte)
+    /// Comparing on compact space: Random dataset 0.24 (50% random hit) - 1.05 GElements/s
+    /// Comparing on compact space: Real dataset 1.08 GElements/s
     ///
-    /// Comparing on original space: .06 GElements/s (not completely optimized)
+    /// Comparing on original space: Real dataset .06 GElements/s (not completely optimized)
     pub fn get_between_vals(&self, range: RangeInclusive<u128>) -> Vec<u64> {
         if range.start() > range.end() {
             return Vec::new();
         }
diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs
index dc1e601fb..85fbb1558 100644
--- a/fastfield_codecs/src/lib.rs
+++ b/fastfield_codecs/src/lib.rs
@@ -394,15 +394,27 @@ mod bench {
         });
     }
 
+    fn get_exp_data() -> Vec<u64> {
+        let mut data = vec![];
+        for i in 0..100 {
+            let num = i * i;
+            data.extend(iter::repeat(i as u64).take(num));
+        }
+        data.shuffle(&mut StdRng::from_seed([1u8; 32]));
+
+        // length = 328350
+        data
+    }
+
     fn get_u128_column_permutation() -> Arc<dyn ColumnExt<u128>> {
         let permutation = generate_permutation();
         let permutation = permutation.iter().map(|el| *el as u128).collect::<Vec<_>>();
         get_u128_column(&permutation)
     }
     fn get_data_50percent_item() -> (u128, u128, Vec<u128>) {
-        let mut permutation = generate_permutation();
-        let major_item = permutation[0];
-        let minor_item = permutation[1];
+        let mut permutation = get_exp_data();
+        let major_item = 20;
+        let minor_item = 10;
         permutation.extend(iter::repeat(major_item).take(permutation.len()));
         permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
         let permutation = permutation.iter().map(|el| *el as u128).collect::<Vec<_>>();
         (major_item as u128, minor_item as u128, permutation)
     }

From b8d8fdeb6ec65f77cf24dc31ab1322c3684de387 Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Fri, 16 Sep 2022 16:29:11 +0800
Subject: [PATCH 24/26] move benches, improve bench data

---
 fastfield_codecs/benches/bench.rs | 213 ++++++++++++++++++++++++++++++
 fastfield_codecs/src/lib.rs       | 208 +----------------------------
 fastfield_codecs/src/main.rs      |   4 +-
 3 files changed, 215 insertions(+), 210 deletions(-)

diff --git a/fastfield_codecs/benches/bench.rs b/fastfield_codecs/benches/bench.rs
index 5a4227dca..cabf2e824 100644
--- a/fastfield_codecs/benches/bench.rs
+++ b/fastfield_codecs/benches/bench.rs
@@ -4,9 +4,222 @@ extern crate test;
 
 #[cfg(test)]
 mod tests {
+    use std::iter;
     use std::sync::Arc;
 
     use fastfield_codecs::*;
+    use rand::prelude::*;
+
+    use super::*;
+
+    // Warning: this generates the same permutation at each call
+    fn generate_permutation() -> Vec<u64> {
+        let mut permutation: Vec<u64> = (0u64..100_000u64).collect();
+        permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
+        permutation
+    }
+
+    fn generate_random() -> Vec<u64> {
+        let mut permutation: Vec<u64> = (0u64..100_000u64)
+            .map(|el| el + random::<u16>() as u64)
+            .collect();
+        permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
+        permutation
+    }
+
+    // Warning: this generates the same permutation at each call
+    fn generate_permutation_gcd() -> Vec<u64> {
+        let mut permutation: Vec<u64> = (1u64..100_000u64).map(|el| el * 1000).collect();
+        permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
+        permutation
+    }
+
+    pub fn serialize_and_load<T: MonotonicallyMappableToU64 + Ord + Default>(
+        column: &[T],
+    ) -> Arc<dyn Column<T>> {
+        let mut buffer = Vec::new();
+        serialize(VecColumn::from(&column), &mut buffer, &ALL_CODEC_TYPES).unwrap();
+        open(OwnedBytes::new(buffer)).unwrap()
+    }
+
+    #[bench]
+    fn bench_intfastfield_jumpy_veclookup(b: &mut Bencher) {
+        let permutation = generate_permutation();
+        let n = permutation.len();
+        b.iter(|| {
+            let mut a = 0u64;
+            for _ in 0..n {
+                a = permutation[a as usize];
+            }
+            a
+        });
+    }
+
+    #[bench]
+    fn bench_intfastfield_jumpy_fflookup(b: &mut Bencher) {
+        let permutation = generate_permutation();
+        let n = permutation.len();
+        let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation);
+        b.iter(|| {
+            let mut a = 0u64;
+            for _ in 0..n {
+                a = column.get_val(a as u64);
+            }
+            a
+        });
+    }
+
+    fn get_exp_data() -> Vec<u64> {
+        let mut data = vec![];
+        for i in 0..100 {
+            let num = i * i;
+            data.extend(iter::repeat(i as u64).take(num));
+        }
+        data.shuffle(&mut StdRng::from_seed([1u8; 32]));
+
+        // length = 328350
+        data
+    }
+
+    fn get_data_50percent_item() -> (u128, u128, Vec<u128>) {
+        let mut permutation = get_exp_data();
+        let major_item = 20;
+        let minor_item = 10;
+        permutation.extend(iter::repeat(major_item).take(permutation.len()));
+        permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
+        let permutation = permutation.iter().map(|el| *el as u128).collect::<Vec<_>>();
+        (major_item as u128, minor_item as u128, permutation)
+    }
+    fn get_u128_column_random() -> Arc<dyn ColumnExt<u128>> {
+        let permutation = generate_random();
+        let permutation = permutation.iter().map(|el| *el as u128).collect::<Vec<_>>();
+        get_u128_column_from_data(&permutation)
+    }
+
+    fn get_u128_column_from_data(data: &[u128]) -> Arc<dyn ColumnExt<u128>> {
+        let mut out = vec![];
+        serialize_u128(VecColumn::from(&data), &mut out).unwrap();
+        let out = OwnedBytes::new(out);
+        open_u128(out).unwrap()
+    }
+
+    #[bench]
+    fn bench_intfastfield_getrange_u128_50percent_hit(b: &mut Bencher) {
+        let (major_item, _minor_item, data) = get_data_50percent_item();
+        let column = get_u128_column_from_data(&data);
+
+        b.iter(|| column.get_between_vals(major_item..=major_item));
+    }
+
+    #[bench]
+    fn bench_intfastfield_getrange_u128_single_hit(b: &mut Bencher) {
+        let (_major_item, minor_item, data) = get_data_50percent_item();
+        let column = get_u128_column_from_data(&data);
+
+        b.iter(|| column.get_between_vals(minor_item..=minor_item));
+    }
+
+    #[bench]
+    fn bench_intfastfield_getrange_u128_hit_all(b: &mut Bencher) {
+        let (_major_item, _minor_item, data) = get_data_50percent_item();
+        let column = get_u128_column_from_data(&data);
+
+        b.iter(|| column.get_between_vals(0..=u128::MAX));
+    }
+
+    #[bench]
+    fn bench_intfastfield_scan_all_fflookup_u128(b: &mut Bencher) {
+        let column = get_u128_column_random();
+
+        b.iter(|| {
+            let mut a = 0u128;
+            for i in 0u64..column.num_vals() as u64 {
+                a += column.get_val(i);
+            }
+            a
+        });
+    }
+
+    #[bench]
+    fn bench_intfastfield_jumpy_stride5_u128(b: &mut Bencher) {
+        let column = get_u128_column_random();
+
+        b.iter(|| {
+            let n = column.num_vals();
+            let mut a = 0u128;
+            for i in (0..n / 5).map(|val| val * 5) {
+                a += column.get_val(i as u64);
+            }
+            a
+        });
+    }
+
+    #[bench]
+    fn bench_intfastfield_stride7_vec(b: &mut Bencher) {
+        let permutation = generate_permutation();
+        let n = permutation.len();
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in (0..n / 7).map(|val| val * 7) {
+                a += permutation[i as usize];
+            }
+            a
+        });
+    }
+
+    #[bench]
+    fn bench_intfastfield_stride7_fflookup(b: &mut Bencher) {
+        let permutation = generate_permutation();
+        let n = permutation.len();
+        let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation);
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in (0..n / 7).map(|val| val * 7) {
+                a += column.get_val(i as u64);
+            }
+            a
+        });
+    }
+
+    #[bench]
+    fn bench_intfastfield_scan_all_fflookup(b: &mut Bencher) {
+        let permutation = generate_permutation();
+        let n = permutation.len();
+        let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation);
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in 0u64..n as u64 {
+                a += column.get_val(i);
+            }
+            a
+        });
+    }
+
+    #[bench]
+    fn bench_intfastfield_scan_all_fflookup_gcd(b: &mut Bencher) {
+        let permutation = generate_permutation_gcd();
+        let n = permutation.len();
+        let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation);
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in 0..n as u64 {
+                a += column.get_val(i);
+            }
+            a
+        });
+    }
+
+    #[bench]
+    fn bench_intfastfield_scan_all_vec(b: &mut Bencher) {
+        let permutation = generate_permutation();
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in 0..permutation.len() {
+                a += permutation[i as usize] as u64;
+            }
+            a
+        });
+    }
 
     fn get_data() -> Vec<u64> {
         let mut rng = StdRng::seed_from_u64(2u64);
diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs
index 85fbb1558..5971ce422 100644
--- a/fastfield_codecs/src/lib.rs
+++ b/fastfield_codecs/src/lib.rs
@@ -11,7 +11,6 @@ use std::io;
 use std::io::Write;
 use std::sync::Arc;
 
-use column::ColumnExt;
 use common::BinarySerializable;
 use compact_space::CompactSpaceDecompressor;
 use ownedbytes::OwnedBytes;
@@ -30,7 +29,7 @@ mod serialize;
 
 use self::bitpacked::BitpackedCodec;
 use self::blockwise_linear::BlockwiseLinearCodec;
-pub use self::column::{monotonic_map_column, Column, ColumnExt, VecColumn};
+pub use self::column::{monotonic_map_column, Column, VecColumn};
 pub use self::compact_space::ip_to_u128;
 use self::linear::LinearCodec;
 pub use self::monotonic_mapping::MonotonicallyMappableToU64;
@@ -340,208 +339,3 @@ mod tests {
         assert_eq!(count_codec, 3);
     }
 }
-
-#[cfg(all(test, feature = "unstable"))]
-mod bench {
-    use std::iter;
-    use std::sync::Arc;
-
-    use column::ColumnExt;
-    use rand::prelude::*;
-    use test::{self, Bencher};
-
-    use super::*;
-    use crate::Column;
-
-    // Warning: this generates the same permutation at each call
-    fn generate_permutation() -> Vec<u64> {
-        let mut permutation: Vec<u64> = (0u64..100_000u64).collect();
-        permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
-        permutation
-    }
-
-    // Warning: this generates the same permutation at each call
-    fn generate_permutation_gcd() -> Vec<u64> {
-        let mut permutation: Vec<u64> = (1u64..100_000u64).map(|el| el * 1000).collect();
-        permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
-        permutation
-    }
-
-    #[bench]
-    fn bench_intfastfield_jumpy_veclookup(b: &mut Bencher) {
-        let permutation = generate_permutation();
-        let n = permutation.len();
-        b.iter(|| {
-            let mut a = 0u64;
-            for _ in 0..n {
-                a = permutation[a as usize];
-            }
-            a
-        });
-    }
-
-    #[bench]
-    fn bench_intfastfield_jumpy_fflookup(b: &mut Bencher) {
-        let permutation = generate_permutation();
-        let n = permutation.len();
-        let column: Arc<dyn Column<u64>> = crate::serialize_and_load(&permutation);
-        b.iter(|| {
-            let mut a = 0u64;
-            for _ in 0..n {
-                a = column.get_val(a as u64);
-            }
-            a
-        });
-    }
-
-    fn get_exp_data() -> Vec<u64> {
-        let mut data = vec![];
-        for i in 0..100 {
-            let num = i * i;
-            data.extend(iter::repeat(i as u64).take(num));
-        }
-        data.shuffle(&mut StdRng::from_seed([1u8; 32]));
-
-        // length = 328350
-        data
-    }
-
-    fn get_u128_column_permutation() -> Arc<dyn ColumnExt<u128>> {
-        let permutation = generate_permutation();
-        let permutation = permutation.iter().map(|el| *el as u128).collect::<Vec<_>>();
-        get_u128_column(&permutation)
-    }
-    fn get_data_50percent_item() -> (u128, u128, Vec<u128>) {
-        let mut permutation = get_exp_data();
-        let major_item = 20;
-        let minor_item = 10;
-        permutation.extend(iter::repeat(major_item).take(permutation.len()));
-        permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
-        let permutation = permutation.iter().map(|el| *el as u128).collect::<Vec<_>>();
-        (major_item as u128, minor_item as u128, permutation)
-    }
-    fn get_u128_column(data: &[u128]) -> Arc<dyn ColumnExt<u128>> {
-        let mut out = vec![];
-        serialize_u128(VecColumn::from(&data), &mut out).unwrap();
-        let out = OwnedBytes::new(out);
-        open_u128(out).unwrap()
-    }
-
-    #[bench]
-    fn bench_intfastfield_getrange_u128_50percent_hit(b: &mut Bencher) {
-        let (major_item, _minor_item, data) = get_data_50percent_item();
-        let column = get_u128_column(&data);
-
-        b.iter(|| column.get_between_vals(major_item..=major_item));
-    }
-
-    #[bench]
-    fn bench_intfastfield_getrange_u128_single_hit(b: &mut Bencher) {
-        let (_major_item, minor_item, data) = get_data_50percent_item();
-        let column = get_u128_column(&data);
-
-        b.iter(|| column.get_between_vals(minor_item..=minor_item));
-    }
-
-    #[bench]
-    fn bench_intfastfield_getrange_u128_hit_all(b: &mut Bencher) {
-        let (_major_item, _minor_item, data) = get_data_50percent_item();
-        let column = get_u128_column(&data);
-
-        b.iter(|| column.get_between_vals(0..=u128::MAX));
-    }
-
-    #[bench]
-    fn bench_intfastfield_jumpy_fflookup_u128(b: &mut Bencher) {
-        let column = get_u128_column_permutation();
-
-        b.iter(|| {
-            let mut a = 0u128;
-            for _ in 0..column.num_vals() {
-                a = column.get_val(a as u64);
-            }
-            a
-        });
-    }
-
-    #[bench]
-    fn bench_intfastfield_jumpy_stride5_u128(b: &mut Bencher) {
-        let column = get_u128_column_permutation();
-
-        b.iter(|| {
-            let n = column.num_vals();
-            let mut a = 0u128;
-            for i in (0..n / 5).map(|val| val * 5) {
-                a += column.get_val(i as u64);
-            }
-            a
-        });
-    }
-
-    #[bench]
-    fn bench_intfastfield_stride7_vec(b: &mut Bencher) {
-        let permutation = generate_permutation();
-        let n = permutation.len();
-        b.iter(|| {
-            let mut a = 0u64;
-            for i in (0..n / 7).map(|val| val * 7) {
-                a += permutation[i as usize];
-            }
-            a
-        });
-    }
-
-    #[bench]
-    fn bench_intfastfield_stride7_fflookup(b: &mut Bencher) {
-        let permutation = generate_permutation();
-        let n = permutation.len();
-        let column: Arc<dyn Column<u64>> = crate::serialize_and_load(&permutation);
-        b.iter(|| {
-            let mut a = 0u64;
-            for i in (0..n / 7).map(|val| val * 7) {
-                a += column.get_val(i as u64);
-            }
-            a
-        });
-    }
-
-    #[bench]
-    fn bench_intfastfield_scan_all_fflookup(b: &mut Bencher) {
-        let permutation = generate_permutation();
-        let n = permutation.len();
-        let column: Arc<dyn Column<u64>> = crate::serialize_and_load(&permutation);
-        b.iter(|| {
-            let mut a = 0u64;
-            for i in 0u64..n as u64 {
-                a += column.get_val(i);
-            }
-            a
-        });
-    }
-
-    #[bench]
-    fn bench_intfastfield_scan_all_fflookup_gcd(b: &mut Bencher) {
-        let permutation = generate_permutation_gcd();
-        let n = permutation.len();
-        let column: Arc<dyn Column<u64>> = crate::serialize_and_load(&permutation);
-        b.iter(|| {
-            let mut a = 0u64;
-            for i in 0..n as u64 {
-                a += column.get_val(i);
-            }
-            a
-        });
-    }
-
-    #[bench]
-    fn bench_intfastfield_scan_all_vec(b: &mut Bencher) {
-        let permutation = generate_permutation();
-        b.iter(|| {
-            let mut a = 0u64;
-            for i in 0..permutation.len() {
-                a += permutation[i as usize] as u64;
-            }
-            a
-        });
-    }
-}
diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs
index 28de82f50..d3d9c06f8 100644
--- a/fastfield_codecs/src/main.rs
+++ b/fastfield_codecs/src/main.rs
@@ -6,9 +6,7 @@ use std::io::BufRead;
 use std::net::{IpAddr, Ipv6Addr};
 use std::str::FromStr;
 
-use fastfield_codecs::{
-    open_u128, serialize_u128, Column, FastFieldCodecType, FastFieldStats, VecColumn,
-};
+use fastfield_codecs::{open_u128, serialize_u128, Column, FastFieldCodecType, VecColumn};
 use itertools::Itertools;
 use measure_time::print_time;
 use ownedbytes::OwnedBytes;

From a16b46646077b5027f2039af6cc1e2046aacb1db Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Fri, 16 Sep 2022 18:15:18 +0800
Subject: [PATCH 25/26] merge ColumnExt with Column trait

---
 fastfield_codecs/benches/bench.rs         |  4 +--
 fastfield_codecs/src/column.rs            | 30 +++++++++++++++--------
 fastfield_codecs/src/compact_space/mod.rs |  4 ---
 fastfield_codecs/src/lib.rs               |  4 +--
 4 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/fastfield_codecs/benches/bench.rs b/fastfield_codecs/benches/bench.rs
index cabf2e824..87e9c8baa 100644
--- a/fastfield_codecs/benches/bench.rs
+++ b/fastfield_codecs/benches/bench.rs
@@ -90,13 +90,13 @@ mod tests {
         let permutation = permutation.iter().map(|el| *el as u128).collect::<Vec<_>>();
         (major_item as u128, minor_item as u128, permutation)
     }
-    fn get_u128_column_random() -> Arc<dyn ColumnExt<u128>> {
+    fn get_u128_column_random() -> Arc<dyn Column<u128>> {
         let permutation = generate_random();
         let permutation = permutation.iter().map(|el| *el as u128).collect::<Vec<_>>();
         get_u128_column_from_data(&permutation)
     }
 
-    fn get_u128_column_from_data(data: &[u128]) -> Arc<dyn ColumnExt<u128>> {
+    fn get_u128_column_from_data(data: &[u128]) -> Arc<dyn Column<u128>> {
         let mut out = vec![];
         serialize_u128(VecColumn::from(&data), &mut out).unwrap();
         let out = OwnedBytes::new(out);
         open_u128(out).unwrap()
     }
diff --git a/fastfield_codecs/src/column.rs b/fastfield_codecs/src/column.rs
index 424b29f61..eb73be542 100644
--- a/fastfield_codecs/src/column.rs
+++ b/fastfield_codecs/src/column.rs
@@ -4,7 +4,7 @@ use std::sync::Mutex;
 
 use tantivy_bitpacker::minmax;
 
-pub trait Column<T = u64>: Send + Sync {
+pub trait Column<T: PartialOrd = u64>: Send + Sync {
     /// Return the value associated to the given idx.
     ///
     /// This accessor should return as fast as possible.
@@ -29,6 +29,19 @@ pub trait Column<T = u64>: Send + Sync {
         }
     }
 
+    /// Return the positions of values which are in the provided range.
+    #[inline]
+    fn get_between_vals(&self, range: RangeInclusive<T>) -> Vec<u64> {
+        let mut vals = Vec::new();
+        for idx in 0..self.num_vals() {
+            let val = self.get_val(idx);
+            if range.contains(&val) {
+                vals.push(idx);
+            }
+        }
+        vals
+    }
+
     /// Returns the minimum value for this fast field.
     ///
     /// This min_value may not be exact.
@@ -53,12 +66,6 @@ pub trait Column<T = u64>: Send + Sync {
     }
 }
 
-/// Extend Column Api
-pub trait ColumnExt<T = u64>: Column<T> {
-    /// Return the positions of values which are in the provided range.
-    fn get_between_vals(&self, range: RangeInclusive<T>) -> Vec<u64>;
-}
-
 pub struct VecColumn<'a, T = u64> {
     values: &'a [T],
     min_value: T,
@@ -138,7 +145,7 @@ struct MonotonicMappingColumn<C, T, Input> {
 }
 
 /// Creates a view of a column transformed by a monotonic mapping.
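
The default `get_between_vals` merged into `Column` above is a plain linear scan. The same idea as a standalone sketch over a slice (a free function for illustration, not the trait method itself; codecs like the compact space one override the default with something cheaper than decoding every value):

    use std::ops::RangeInclusive;

    // Return the positions of all values that fall inside `range`,
    // mirroring the O(n) behaviour of the default trait method.
    fn get_between_vals(vals: &[u128], range: RangeInclusive<u128>) -> Vec<u64> {
        let mut positions = Vec::new();
        for (pos, val) in vals.iter().enumerate() {
            if range.contains(val) {
                positions.push(pos as u64);
            }
        }
        positions
    }

    fn main() {
        let vals = [100u128, 50_000, 101];
        assert_eq!(get_between_vals(&vals, 100..=101), vec![0, 2]);
    }
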
-pub fn monotonic_map_column<C, T, Input, Output>(
+pub fn monotonic_map_column<C, T, Input: PartialOrd, Output: PartialOrd>(
     from_column: C,
     monotonic_mapping: T,
 ) -> impl Column<Output>
@@ -155,7 +162,8 @@ where
     }
 }
 
-impl<C, T, Input, Output> Column<Output> for MonotonicMappingColumn<C, T, Input>
+impl<C, T, Input: PartialOrd, Output: PartialOrd> Column<Output>
+    for MonotonicMappingColumn<C, T, Input>
 where
     C: Column<Input>,
     T: Fn(Input) -> Output + Send + Sync,
@@ -224,7 +232,9 @@ where T: Iterator + Clone + ExactSizeIterator
 }
 
 impl<T> Column<T::Item> for IterColumn<T>
-where T: Iterator + Clone + ExactSizeIterator + Send + Sync
+where
+    T: Iterator + Clone + ExactSizeIterator + Send + Sync,
+    T::Item: PartialOrd,
 {
     fn get_val(&self, idx: u64) -> T::Item {
         self.0.clone().nth(idx as usize).unwrap()
     }
diff --git a/fastfield_codecs/src/compact_space/mod.rs b/fastfield_codecs/src/compact_space/mod.rs
index cb5b83fb3..aca7625ad 100644
--- a/fastfield_codecs/src/compact_space/mod.rs
+++ b/fastfield_codecs/src/compact_space/mod.rs
@@ -22,7 +22,6 @@ use common::{BinarySerializable, CountingWriter, VInt, VIntU128};
 use ownedbytes::OwnedBytes;
 use tantivy_bitpacker::{self, BitPacker, BitUnpacker};
 
-use crate::column::ColumnExt;
 use crate::compact_space::build_compact_space::get_compact_space;
 use crate::Column;
 
@@ -314,9 +313,6 @@ impl Column<u128> for CompactSpaceDecompressor {
     fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = u128> + 'a> {
         Box::new(self.iter())
     }
-}
-
-impl ColumnExt<u128> for CompactSpaceDecompressor {
     fn get_between_vals(&self, range: RangeInclusive<u128>) -> Vec<u64> {
         self.get_between_vals(range)
     }
 }
diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs
index 5971ce422..3168543ea 100644
--- a/fastfield_codecs/src/lib.rs
+++ b/fastfield_codecs/src/lib.rs
@@ -29,7 +29,7 @@ mod serialize;
 
 use self::bitpacked::BitpackedCodec;
 use self::blockwise_linear::BlockwiseLinearCodec;
-pub use self::column::{monotonic_map_column, Column, ColumnExt, VecColumn};
+pub use self::column::{monotonic_map_column, Column, VecColumn};
 pub use self::compact_space::ip_to_u128;
 use self::linear::LinearCodec;
 pub use self::monotonic_mapping::MonotonicallyMappableToU64;
@@ -74,7 +74,7 @@ impl FastFieldCodecType {
     }
 }
 
 /// Returns the u128 codec reader (currently only the compact space codec) wrapped in the `Arc`.
-pub fn open_u128(bytes: OwnedBytes) -> io::Result<Arc<dyn ColumnExt<u128>>> {
+pub fn open_u128(bytes: OwnedBytes) -> io::Result<Arc<dyn Column<u128>>> {
     Ok(Arc::new(CompactSpaceDecompressor::open(bytes)?))
 }

From 02599ebeb7f3b16f45aa6576fafbb1dbcb30caca Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Fri, 16 Sep 2022 18:16:16 +0800
Subject: [PATCH 26/26] remove ip_to_u128

---
 fastfield_codecs/src/compact_space/mod.rs | 9 ---------
 fastfield_codecs/src/lib.rs               | 1 -
 2 files changed, 10 deletions(-)

diff --git a/fastfield_codecs/src/compact_space/mod.rs b/fastfield_codecs/src/compact_space/mod.rs
index aca7625ad..d137b30f4 100644
--- a/fastfield_codecs/src/compact_space/mod.rs
+++ b/fastfield_codecs/src/compact_space/mod.rs
@@ -14,7 +14,6 @@ use std::{
     cmp::Ordering,
     collections::BTreeSet,
     io::{self, Write},
-    net::{IpAddr, Ipv6Addr},
     ops::RangeInclusive,
 };
 
@@ -28,14 +27,6 @@ use ownedbytes::OwnedBytes;
 use tantivy_bitpacker::{self, BitPacker, BitUnpacker};
 
 use crate::compact_space::build_compact_space::get_compact_space;
 use crate::Column;
 
 mod blank_range;
 mod build_compact_space;
 
-pub fn ip_to_u128(ip_addr: IpAddr) -> u128 {
-    let ip_addr_v6: Ipv6Addr = match ip_addr {
-        IpAddr::V4(v4) => v4.to_ipv6_mapped(),
-        IpAddr::V6(v6) => v6,
-    };
-    u128::from_be_bytes(ip_addr_v6.octets())
-}
-
 /// The cost per blank is quite hard actually, since blanks are delta encoded, the actual cost of
 /// blanks depends on the number of blanks.
 ///
diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs
index 3168543ea..88bc2953d 100644
--- a/fastfield_codecs/src/lib.rs
+++ b/fastfield_codecs/src/lib.rs
@@ -30,7 +30,6 @@ mod serialize;
 
 use self::bitpacked::BitpackedCodec;
 use self::blockwise_linear::BlockwiseLinearCodec;
 pub use self::column::{monotonic_map_column, Column, VecColumn};
-pub use self::compact_space::ip_to_u128;
 use self::linear::LinearCodec;
 pub use self::monotonic_mapping::MonotonicallyMappableToU64;
 pub use self::serialize::{
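
The mapping itself is unchanged by the removal; callers that still need it can keep an equivalent of the deleted helper. This standalone version reuses the removed lines verbatim, plus a small illustrative check in `main`:

    use std::net::{IpAddr, Ipv6Addr};

    // Normalize IPv4 to its IPv6-mapped form, then read the 16
    // big-endian bytes as a u128, exactly as the deleted helper did.
    fn ip_to_u128(ip_addr: IpAddr) -> u128 {
        let ip_addr_v6: Ipv6Addr = match ip_addr {
            IpAddr::V4(v4) => v4.to_ipv6_mapped(),
            IpAddr::V6(v6) => v6,
        };
        u128::from_be_bytes(ip_addr_v6.octets())
    }

    fn main() {
        let ip: IpAddr = "127.0.0.1".parse().unwrap();
        // ::ffff:127.0.0.1 interpreted as an integer.
        assert_eq!(ip_to_u128(ip), 0xffff_7f00_0001u128);
    }
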