Review notes (text before the first `diff --git` header is not part of any hunk):
- Line breaks and indentation were rebuilt, and stripped generics were reconstructed as
  `Arc<dyn Column<u64>>`, `Vec<u64>` and `Vec<u32>` -- confirm against the working tree.
- The type argument of the context line `bench_get::(b, &data);` in the second lib.rs hunk
  could not be recovered; restore it from the target file before applying.
- sparse_codec_wrapper.rs is amended (rank off-by-one, bitmap sizes, value-checking tests),
  so the blob hash on its `index` line is stale.

diff --git a/fastfield_codecs/Cargo.toml b/fastfield_codecs/Cargo.toml
index a53a8ceeb..a254706db 100644
--- a/fastfield_codecs/Cargo.toml
+++ b/fastfield_codecs/Cargo.toml
@@ -18,6 +18,7 @@ fastdivide = "0.4"
 log = "0.4"
 itertools = { version = "0.10.3" }
 measure_time = { version="0.8.2", optional=true}
+roaring = "0.10.1"
 
 [dev-dependencies]
 more-asserts = "0.3.0"
diff --git a/fastfield_codecs/benches/bench.rs b/fastfield_codecs/benches/bench.rs
index 526036d4a..ec653064d 100644
--- a/fastfield_codecs/benches/bench.rs
+++ b/fastfield_codecs/benches/bench.rs
@@ -183,6 +183,36 @@ mod tests {
         });
     }
 
+    #[bench]
+    fn bench_intfastfield_stride7_fflookup_sparse_roaring(b: &mut Bencher) {
+        let permutation = generate_permutation();
+        let n = permutation.len();
+        let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation);
+        let column = SparseCodecRoaringBitmap::with_full(column);
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in (0..n / 7).map(|val| val * 7) {
+                a += column.get_val(i as u64);
+            }
+            a
+        });
+    }
+
+    #[bench]
+    fn bench_intfastfield_stride7_fflookup_dense_bitmap_with_offset(b: &mut Bencher) {
+        let permutation = generate_permutation();
+        let n = permutation.len();
+        let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation);
+        let column = DenseCodec::with_full(column);
+        b.iter(|| {
+            let mut a = 0u64;
+            for i in (0..n / 7).map(|val| val * 7) {
+                a += column.get_val(i as u64);
+            }
+            a
+        });
+    }
+
     #[bench]
     fn bench_intfastfield_scan_all_fflookup(b: &mut Bencher) {
         let permutation = generate_permutation();
diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs
index 4205a323a..3f97abe9a 100644
--- a/fastfield_codecs/src/lib.rs
+++ b/fastfield_codecs/src/lib.rs
@@ -22,11 +22,14 @@ mod compact_space;
 mod line;
 mod linear;
 mod monotonic_mapping;
+mod sparse_codec_wrapper;
 
 mod column;
 mod gcd;
 mod serialize;
 
+pub use sparse_codec_wrapper::{DenseCodec, SparseCodecRoaringBitmap};
+
 use self::bitpacked::BitpackedCodec;
 use self::blockwise_linear::BlockwiseLinearCodec;
 pub use self::column::{monotonic_map_column, Column, VecColumn};
@@ -440,6 +443,7 @@ mod bench {
         let data: Vec<_> = get_data();
         bench_get::(b, &data);
     }
+
     #[bench]
     fn bench_fastfield_bitpack_get_dynamic(b: &mut Bencher) {
         let data: Vec<_> = get_data();
diff --git a/fastfield_codecs/src/sparse_codec_wrapper.rs b/fastfield_codecs/src/sparse_codec_wrapper.rs
new file mode 100644
index 000000000..a69a10457
--- /dev/null
+++ b/fastfield_codecs/src/sparse_codec_wrapper.rs
@@ -0,0 +1,173 @@
+//! Prototype wrappers that add an optional-value (null) layer on top of a dense
+//! [`Column`]: a presence bitmap maps an index to the position of its value in the
+//! wrapped column.
+
+use std::sync::Arc;
+
+use roaring::RoaringBitmap;
+
+use crate::Column;
+
+/// Optional-value wrapper whose presence bitmap is a [`RoaringBitmap`].
+pub struct SparseCodecRoaringBitmap {
+    /// Indices that carry a value. Despite the field name, this is the set of non-null
+    /// indices.
+    null: RoaringBitmap,
+    /// Dense column holding one value per index contained in `null`.
+    column: Arc<dyn Column<u64>>,
+}
+
+impl SparseCodecRoaringBitmap {
+    /// Wraps `column`, marking every index in `0..column.num_vals()` as present.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `column.num_vals()` exceeds `u32::MAX`, as the bitmap is indexed by `u32`.
+    pub fn with_full(column: Arc<dyn Column<u64>>) -> Self {
+        let num_vals = column.num_vals();
+        assert!(
+            num_vals <= u64::from(u32::MAX),
+            "roaring bitmap indices are limited to u32"
+        );
+        let mut rb = RoaringBitmap::new();
+        // The range end is exclusive: this inserts exactly the indices `0..num_vals`.
+        rb.insert_range(0..num_vals as u32);
+        Self { null: rb, column }
+    }
+}
+
+impl Column for SparseCodecRoaringBitmap {
+    /// Returns the value stored for `idx`, or `0` if `idx` carries no value.
+    fn get_val(&self, idx: u64) -> u64 {
+        // The bitmap is indexed by `u32`, so a larger index is never present.
+        if idx > u64::from(u32::MAX) || !self.null.contains(idx as u32) {
+            return 0; // TODO null
+        }
+        // `rank` counts the present indices `<= idx`, `idx` included, hence the `- 1` to
+        // get the zero-based position of the value in the dense column.
+        self.column.get_val(self.null.rank(idx as u32) - 1)
+    }
+
+    fn min_value(&self) -> u64 {
+        todo!()
+    }
+
+    fn max_value(&self) -> u64 {
+        todo!()
+    }
+
+    fn num_vals(&self) -> u64 {
+        todo!()
+    }
+}
+
+/// Optional-value wrapper whose presence bitmap is a plain bit vector with per-block ranks.
+pub struct DenseCodec {
+    /// The presence bitmap, in blocks of 64 bits each.
+    blocks: Vec<u64>,
+    /// For each block, the number of bits set in all preceding blocks.
+    offsets: Vec<u32>,
+    /// Dense column holding one value per set bit.
+    column: Arc<dyn Column<u64>>,
+}
+
+impl DenseCodec {
+    /// Wraps `column`, marking every index in `0..column.num_vals()` as present.
+    pub fn with_full(column: Arc<dyn Column<u64>>) -> Self {
+        let num_vals = column.num_vals();
+        let num_blocks = (num_vals / 64) as usize + 1;
+        let mut blocks = Vec::with_capacity(num_blocks);
+        let mut offsets = Vec::with_capacity(num_blocks);
+        let mut offset = 0u32;
+        for block_num in 0..num_blocks as u64 {
+            // Only bits below `num_vals` are set, so the last block is partial (or empty
+            // when `num_vals` is a multiple of 64).
+            let remaining = num_vals - block_num * 64;
+            let block = if remaining >= 64 {
+                u64::MAX
+            } else {
+                gen_mask(remaining)
+            };
+            blocks.push(block);
+            offsets.push(offset);
+            offset += block.count_ones();
+        }
+
+        Self {
+            blocks,
+            offsets,
+            column,
+        }
+    }
+}
+
+/// Returns a mask with the `msb` lowest bits set. `msb` must be below 64.
+fn gen_mask(msb: u64) -> u64 {
+    (1u64 << msb) - 1
+}
+
+/// Returns whether bit `n` of `input` is set. `n` must be below 64.
+fn get_bit_at(input: u64, n: u64) -> bool {
+    input & (1u64 << n) != 0
+}
+
+impl Column for DenseCodec {
+    /// Returns the value stored for `idx`, or `0` if `idx` carries no value.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `idx` lies beyond the last bitmap block.
+    fn get_val(&self, idx: u64) -> u64 {
+        let block_pos = (idx / 64) as usize;
+        let pos_in_block = idx % 64;
+        let bitvec = self.blocks[block_pos];
+        if !get_bit_at(bitvec, pos_in_block) {
+            return 0; // TODO null
+        }
+        // Rank of `idx`: bits set in earlier blocks plus bits set below it in its block.
+        let offset_in_block = (bitvec & gen_mask(pos_in_block)).count_ones();
+        let dense_idx = u64::from(self.offsets[block_pos]) + u64::from(offset_in_block);
+        self.column.get_val(dense_idx)
+    }
+
+    fn min_value(&self) -> u64 {
+        todo!()
+    }
+
+    fn max_value(&self) -> u64 {
+        todo!()
+    }
+
+    fn num_vals(&self) -> u64 {
+        todo!()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use itertools::Itertools;
+
+    use super::*;
+    use crate::serialize_and_load;
+
+    #[test]
+    fn dense_test() {
+        let data = (0..100u64).collect_vec();
+        let dense = DenseCodec::with_full(serialize_and_load(&data));
+        for (idx, &val) in data.iter().enumerate() {
+            assert_eq!(dense.get_val(idx as u64), val);
+        }
+        // Index 100 is in the last block but past the end of the column.
+        assert_eq!(dense.get_val(100), 0);
+    }
+
+    #[test]
+    fn sparse_roaring_test() {
+        let data = (0..100u64).collect_vec();
+        let sparse = SparseCodecRoaringBitmap::with_full(serialize_and_load(&data));
+        for (idx, &val) in data.iter().enumerate() {
+            assert_eq!(sparse.get_val(idx as u64), val);
+        }
+        assert_eq!(sparse.get_val(100), 0);
+    }
+}