diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs
index 7a12af8bc..ae9cb34b7 100644
--- a/fastfield_codecs/src/lib.rs
+++ b/fastfield_codecs/src/lib.rs
@@ -37,12 +37,15 @@ mod line;
 mod linear;
 mod monotonic_mapping;
 mod monotonic_mapping_u128;
+mod null_index;
 mod null_index_footer;
 
 mod column;
 mod gcd;
 mod serialize;
 
+pub use null_index::*;
+
 use self::bitpacked::BitpackedCodec;
 use self::blockwise_linear::BlockwiseLinearCodec;
 pub use self::column::{monotonic_map_column, Column, IterColumn, VecColumn};
diff --git a/fastfield_codecs/src/null_index/dense.rs b/fastfield_codecs/src/null_index/dense.rs
new file mode 100644
index 000000000..294027d2d
--- /dev/null
+++ b/fastfield_codecs/src/null_index/dense.rs
@@ -0,0 +1,370 @@
+use std::io::{self, Write};
+
+use common::BinarySerializable;
+use itertools::Itertools;
+use ownedbytes::OwnedBytes;
+
+use super::{get_bit_at, set_bit_at};
+
+/// The `DenseCodec` stores its encoded blocks in `data`.
+/// Each block consists of `[u8; 12]`. The first 8 bytes are a bitvec for 64 elements.
+/// The last 4 bytes are the offset, the number of set bits so far.
+///
+/// When translating the original index to a dense index, the correct block can be computed
+/// directly `orig_idx/64`. Inside the block the position is `orig_idx%64`.
+///
+/// When translating a dense index to the original index, we can use the offset to find the correct
+/// block. Direct computation is not possible, but we can employ a linear or binary search.
+pub struct DenseCodec {
+    // data consists of blocks of 64 bits.
+    //
+    // The format is &[(u64, u32)]
+    // u64 is the bitvec
+    // u32 is the offset of the block, the number of set bits so far.
+    //
+    // At the end one block is appended, to store the number of values in the index in offset.
+    data: OwnedBytes,
+}
+const ELEMENTS_PER_BLOCK: u32 = 64;
+const BLOCK_BITVEC_SIZE: usize = 8;
+const BLOCK_OFFSET_SIZE: usize = 4;
+const SERIALIZED_BLOCK_SIZE: usize = BLOCK_BITVEC_SIZE + BLOCK_OFFSET_SIZE;
+
+fn count_ones(block: u64, pos_in_block: u32) -> u32 {
+    if pos_in_block == 63 {
+        block.count_ones()
+    } else {
+        let mask = (1u64 << (pos_in_block + 1)) - 1;
+        let masked_block = block & mask;
+        masked_block.count_ones()
+    }
+}
+
+impl DenseCodec {
+    /// Open the DenseCodec from OwnedBytes
+    pub fn open(data: OwnedBytes) -> Self {
+        Self { data }
+    }
+    #[inline]
+    /// Check if value at position is not null.
+    pub fn exists(&self, idx: u32) -> bool {
+        let block_pos = idx / ELEMENTS_PER_BLOCK;
+        let bitvec: u64 = self.block(block_pos as usize);
+
+        let pos_in_block = idx % ELEMENTS_PER_BLOCK;
+
+        get_bit_at(bitvec, pos_in_block)
+    }
+    #[inline]
+    /// Returns the bitvec of the block at `block_pos`.
+    pub(crate) fn block(&self, block_pos: usize) -> u64 {
+        let data = &mut &self.data[block_pos * SERIALIZED_BLOCK_SIZE..];
+
+        let block: u64 =
+            BinarySerializable::deserialize(data).expect("could not read block in null index");
+        block
+    }
+
+    #[inline]
+    /// Returns (bitvec, offset)
+    ///
+    /// offset is the start offset of actual docids in the block.
+    pub(crate) fn block_and_offset(&self, block_pos: u32) -> (u64, u32) {
+        let data = &mut &self.data[block_pos as usize * SERIALIZED_BLOCK_SIZE..];
+
+        let block: u64 =
+            BinarySerializable::deserialize(data).expect("could not read block in null index");
+        let offset: u32 =
+            BinarySerializable::deserialize(data).expect("could not read block in null index");
+        (block, offset)
+    }
+
+    /// Return the number of non-null values in an index
+    pub fn num_non_null_vals(&self) -> u32 {
+        let last_block = (self.data.len() / SERIALIZED_BLOCK_SIZE) - 1;
+        self.block_and_offset(last_block as u32).1
+    }
+
+    #[inline]
+    /// Translate from the original index to the codec index.
+    pub fn translate_to_codec_idx(&self, idx: u32) -> Option<u32> {
+        let block_pos = idx / ELEMENTS_PER_BLOCK;
+        let (block, offset) = self.block_and_offset(block_pos);
+        let pos_in_block = idx % ELEMENTS_PER_BLOCK;
+        if get_bit_at(block, pos_in_block) {
+            let ones_in_block = count_ones(block, pos_in_block);
+            // -1 is ok, since idx does exist, so there's at least one set bit counted.
+            Some(offset + ones_in_block - 1)
+        } else {
+            None
+        }
+    }
+
+    /// Translate positions from the codec index to the original index.
+    ///
+    /// `iter` must yield codec indexes in non-decreasing order: the block lookup resumes from
+    /// the block of the previous index and never moves backwards.
+    ///
+    /// # Panics
+    ///
+    /// May panic if any `idx` is greater than the column length.
+    pub fn translate_codec_idx_to_original_idx<'a>(
+        &'a self,
+        iter: impl Iterator<Item = u32> + 'a,
+    ) -> impl Iterator<Item = u32> + 'a {
+        let mut block_pos = 0u32;
+        iter.map(move |dense_idx| {
+            // update block_pos to limit search scope
+            block_pos = find_block(dense_idx, block_pos, &self.data);
+            let (bitvec, offset) = self.block_and_offset(block_pos);
+
+            // The next offset is higher than dense_idx and therefore:
+            // dense_idx <= offset + num_set_bits in block
+            let mut num_set_bits = 0;
+            for idx_in_block in 0..ELEMENTS_PER_BLOCK {
+                if get_bit_at(bitvec, idx_in_block) {
+                    num_set_bits += 1;
+                }
+                if num_set_bits == (dense_idx - offset + 1) {
+                    let orig_idx = block_pos * ELEMENTS_PER_BLOCK + idx_in_block;
+                    return orig_idx;
+                }
+            }
+            panic!("Internal Error: Offset calculation in dense idx seems to be wrong.");
+        })
+    }
+}
+
+#[inline]
+/// Finds the block position containing the dense_idx.
+///
+/// # Correctness
+/// dense_idx needs to be smaller than the number of values in the index
+///
+/// The last offset number is equal to the number of values in the index.
+fn find_block(dense_idx: u32, mut block_pos: u32, data: &[u8]) -> u32 {
+    loop {
+        let data = &mut &data[BLOCK_BITVEC_SIZE + block_pos as usize * SERIALIZED_BLOCK_SIZE..];
+        let offset: u32 = BinarySerializable::deserialize(data)
+            .expect("could not read offset from block in null index");
+        if offset > dense_idx {
+            // This block starts past dense_idx, so the value lives in the previous block.
+            return block_pos - 1;
+        }
+        block_pos += 1;
+    }
+}
+
+/// Serializes a dense null index from `iter`, which yields `true` for set values, else `false`.
+pub fn serialize_dense_codec(
+    iter: impl Iterator<Item = bool>,
+    mut out: impl Write,
+) -> io::Result<()> {
+    let mut offset: u32 = 0;
+
+    for chunk in &iter.chunks(ELEMENTS_PER_BLOCK as usize) {
+        let mut block: u64 = 0;
+        for (pos, is_bit_set) in chunk.enumerate() {
+            if is_bit_set {
+                set_bit_at(&mut block, pos as u64);
+            }
+        }
+
+        block.serialize(&mut out)?;
+        offset.serialize(&mut out)?;
+
+        offset += block.count_ones();
+    }
+    // Add sentinel block for the offset
+    let block: u64 = 0;
+    block.serialize(&mut out)?;
+    offset.serialize(&mut out)?;
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use proptest::prelude::{any, prop, *};
+    use proptest::strategy::Strategy;
+    use proptest::{prop_oneof, proptest};
+
+    use super::*;
+
+    fn random_bitvec() -> BoxedStrategy<Vec<bool>> {
+        prop_oneof![
+            1 => prop::collection::vec(proptest::bool::weighted(1.0), 0..100),
+            1 => prop::collection::vec(proptest::bool::weighted(1.0), 0..64),
+            1 => prop::collection::vec(proptest::bool::weighted(0.0), 0..100),
+            1 => prop::collection::vec(proptest::bool::weighted(0.0), 0..64),
+            8 => vec![any::<bool>()],
+            2 => prop::collection::vec(any::<bool>(), 0..50),
+        ]
+        .boxed()
+    }
+
+    proptest! {
+        #![proptest_config(ProptestConfig::with_cases(500))]
+        #[test]
+        fn test_with_random_bitvecs(bitvec1 in random_bitvec(), bitvec2 in random_bitvec(), bitvec3 in random_bitvec()) {
+            let mut bitvec = Vec::new();
+            bitvec.extend_from_slice(&bitvec1);
+            bitvec.extend_from_slice(&bitvec2);
+            bitvec.extend_from_slice(&bitvec3);
+            test_null_index(bitvec);
+        }
+    }
+
+    #[test]
+    fn dense_codec_test_one_block_false() {
+        let mut iter = vec![false; 64];
+        iter.push(true);
+        test_null_index(iter);
+    }
+
+    fn test_null_index(data: Vec<bool>) {
+        let mut out = vec![];
+
+        serialize_dense_codec(data.iter().cloned(), &mut out).unwrap();
+        let null_index = DenseCodec::open(OwnedBytes::new(out));
+
+        let orig_idx_with_value: Vec<u32> = data
+            .iter()
+            .enumerate()
+            .filter(|(_pos, val)| **val)
+            .map(|(pos, _val)| pos as u32)
+            .collect();
+
+        assert_eq!(
+            null_index
+                .translate_codec_idx_to_original_idx(0..orig_idx_with_value.len() as u32)
+                .collect_vec(),
+            orig_idx_with_value
+        );
+
+        for (dense_idx, orig_idx) in orig_idx_with_value.iter().enumerate() {
+            assert_eq!(
+                null_index.translate_to_codec_idx(*orig_idx),
+                Some(dense_idx as u32)
+            );
+        }
+
+        for (pos, value) in data.iter().enumerate() {
+            assert_eq!(null_index.exists(pos as u32), *value);
+        }
+    }
+
+    #[test]
+    fn dense_codec_test_translation() {
+        let mut out = vec![];
+
+        let iter = ([true, false, true, false]).iter().cloned();
+        serialize_dense_codec(iter, &mut out).unwrap();
+        let null_index = DenseCodec::open(OwnedBytes::new(out));
+
+        assert_eq!(
+            null_index
+                .translate_codec_idx_to_original_idx(0..2)
+                .collect_vec(),
+            vec![0, 2]
+        );
+    }
+
+    #[test]
+    fn dense_codec_translate() {
+        let mut out = vec![];
+
+        let iter = ([true, false, true, false]).iter().cloned();
+        serialize_dense_codec(iter, &mut out).unwrap();
+        let null_index = DenseCodec::open(OwnedBytes::new(out));
+        assert_eq!(null_index.translate_to_codec_idx(0), Some(0));
+        assert_eq!(null_index.translate_to_codec_idx(2), Some(1));
+    }
+
+    #[test]
+    fn dense_codec_test_small() {
+        let mut out = vec![];
+
+        let iter = ([true, false, true, false]).iter().cloned();
+        serialize_dense_codec(iter, &mut out).unwrap();
+        let null_index = DenseCodec::open(OwnedBytes::new(out));
+        assert!(null_index.exists(0));
+        assert!(!null_index.exists(1));
+        assert!(null_index.exists(2));
+        assert!(!null_index.exists(3));
+    }
+
+    #[test]
+    fn dense_codec_test_large() {
+        let mut docs = vec![];
+        docs.extend((0..1000).map(|_idx| false));
+        docs.extend((0..=1000).map(|_idx| true));
+
+        let iter = docs.iter().cloned();
+        let mut out = vec![];
+        serialize_dense_codec(iter, &mut out).unwrap();
+        let null_index = DenseCodec::open(OwnedBytes::new(out));
+        assert!(!null_index.exists(0));
+        assert!(!null_index.exists(100));
+        assert!(!null_index.exists(999));
+        assert!(null_index.exists(1000));
+        assert!(null_index.exists(1999));
+        assert!(null_index.exists(2000));
+        assert!(!null_index.exists(2001));
+    }
+
+    #[test]
+    fn test_count_ones() {
+        let mut block = 0;
+        set_bit_at(&mut block, 0);
+        set_bit_at(&mut block, 2);
+
+        assert_eq!(count_ones(block, 0), 1);
+        assert_eq!(count_ones(block, 1), 1);
+        assert_eq!(count_ones(block, 2), 2);
+    }
+}
+
+#[cfg(all(test, feature = "unstable"))]
+mod bench {
+
+    use rand::rngs::StdRng;
+    use rand::{Rng, SeedableRng};
+    use test::Bencher;
+
+    use super::*;
+
+    fn gen_bools() -> DenseCodec {
+        let mut out = Vec::new();
+        let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
+        // 80% of values are set
+        let bools: Vec<_> = (0..100_000).map(|_| rng.gen_bool(8f64 / 10f64)).collect();
+        serialize_dense_codec(bools.into_iter(), &mut out).unwrap();
+
+        let codec = DenseCodec::open(OwnedBytes::new(out));
+        codec
+    }
+
+    #[bench]
+    fn bench_dense_codec_translate_orig_to_dense(bench: &mut Bencher) {
+        let codec = gen_bools();
+        bench.iter(|| {
+            let mut dense_idx: Option<u32> = None;
+            for idx in 0..100_000 {
+                dense_idx = codec.translate_to_codec_idx(idx);
+            }
+            dense_idx
+        });
+    }
+
+    #[bench]
+    fn bench_dense_codec_translate_dense_to_orig(bench: &mut Bencher) {
+        let codec = gen_bools();
+        let num_vals = codec.num_non_null_vals();
+        bench.iter(|| {
+            codec
+                .translate_codec_idx_to_original_idx(0..num_vals)
+                .last()
+        });
+    }
+}
diff --git a/fastfield_codecs/src/null_index/mod.rs b/fastfield_codecs/src/null_index/mod.rs
new file mode 100644
index 000000000..9a3e22bf0
--- /dev/null
+++ b/fastfield_codecs/src/null_index/mod.rs
@@ -0,0 +1,11 @@
+pub use dense::{serialize_dense_codec, DenseCodec};
+
+mod dense;
+
+fn get_bit_at(input: u64, n: u32) -> bool {
+    input & (1 << n) != 0
+}
+
+fn set_bit_at(input: &mut u64, n: u64) {
+    *input |= 1 << n;
+}