mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-01 23:12:54 +00:00
Compare commits
2 Commits
commit-cha
...
sparse_cod
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4c7437f2e0 | ||
|
|
cc82f94c72 |
@@ -18,6 +18,7 @@ fastdivide = "0.4"
|
|||||||
log = "0.4"
|
log = "0.4"
|
||||||
itertools = { version = "0.10.3" }
|
itertools = { version = "0.10.3" }
|
||||||
measure_time = { version="0.8.2", optional=true}
|
measure_time = { version="0.8.2", optional=true}
|
||||||
|
roaring = "0.10.1"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
more-asserts = "0.3.0"
|
more-asserts = "0.3.0"
|
||||||
|
|||||||
@@ -44,6 +44,15 @@ mod tests {
|
|||||||
open(OwnedBytes::new(buffer)).unwrap()
|
open(OwnedBytes::new(buffer)).unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn serialize_and_load_dense<T: MonotonicallyMappableToU64 + Ord + Default>(
|
||||||
|
column: &[T],
|
||||||
|
fill_ratio: u32,
|
||||||
|
) -> Arc<dyn Column<T>> {
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
serialize(VecColumn::from(&column), &mut buffer, &ALL_CODEC_TYPES).unwrap();
|
||||||
|
open_dense(OwnedBytes::new(buffer), fill_ratio).unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
#[bench]
|
#[bench]
|
||||||
fn bench_intfastfield_jumpy_veclookup(b: &mut Bencher) {
|
fn bench_intfastfield_jumpy_veclookup(b: &mut Bencher) {
|
||||||
let permutation = generate_permutation();
|
let permutation = generate_permutation();
|
||||||
@@ -183,6 +192,67 @@ mod tests {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_intfastfield_stride7_fflookup_sparse_roaring(b: &mut Bencher) {
|
||||||
|
let permutation = generate_permutation();
|
||||||
|
let n = permutation.len();
|
||||||
|
let column: Arc<dyn Column<u64>> = serialize_and_load(&permutation);
|
||||||
|
let column = SparseCodecRoaringBitmap::with_full(column);
|
||||||
|
b.iter(|| {
|
||||||
|
let mut a = 0u64;
|
||||||
|
for i in (0..n / 7).map(|val| val * 7) {
|
||||||
|
a += column.get_val(i as u64);
|
||||||
|
}
|
||||||
|
a
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_intfastfield_stride7_fflookup_dense_bitmap_with_offset(b: &mut Bencher) {
|
||||||
|
let permutation = generate_permutation();
|
||||||
|
let n = permutation.len();
|
||||||
|
let column: Arc<dyn Column<u64>> = serialize_and_load_dense(&permutation, 1000);
|
||||||
|
b.iter(|| {
|
||||||
|
let mut a = 0u64;
|
||||||
|
for i in (0..n / 7).map(|val| val * 7) {
|
||||||
|
a += column.get_val(i as u64);
|
||||||
|
}
|
||||||
|
a
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_intfastfield_stride7_fflookup_dense_bitmap_with_offset_70percent_dense(
|
||||||
|
b: &mut Bencher,
|
||||||
|
) {
|
||||||
|
let permutation = generate_permutation();
|
||||||
|
let n = permutation.len();
|
||||||
|
let column: Arc<dyn Column<u64>> = serialize_and_load_dense(&permutation, 700);
|
||||||
|
b.iter(|| {
|
||||||
|
let mut a = 0u64;
|
||||||
|
for i in (0..n / 7).map(|val| val * 7) {
|
||||||
|
a += column.get_val(i as u64);
|
||||||
|
}
|
||||||
|
a
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn bench_intfastfield_stride7_fflookup_dense_bitmap_with_offset_20percent_dense(
|
||||||
|
b: &mut Bencher,
|
||||||
|
) {
|
||||||
|
let permutation = generate_permutation();
|
||||||
|
let n = permutation.len();
|
||||||
|
let column: Arc<dyn Column<u64>> = serialize_and_load_dense(&permutation, 200);
|
||||||
|
b.iter(|| {
|
||||||
|
let mut a = 0u64;
|
||||||
|
for i in (0..n / 7).map(|val| val * 7) {
|
||||||
|
a += column.get_val(i as u64);
|
||||||
|
}
|
||||||
|
a
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
#[bench]
|
#[bench]
|
||||||
fn bench_intfastfield_scan_all_fflookup(b: &mut Bencher) {
|
fn bench_intfastfield_scan_all_fflookup(b: &mut Bencher) {
|
||||||
let permutation = generate_permutation();
|
let permutation = generate_permutation();
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ mod compact_space;
|
|||||||
mod line;
|
mod line;
|
||||||
mod linear;
|
mod linear;
|
||||||
mod monotonic_mapping;
|
mod monotonic_mapping;
|
||||||
|
mod sparse_codec_wrapper;
|
||||||
|
|
||||||
mod column;
|
mod column;
|
||||||
mod gcd;
|
mod gcd;
|
||||||
@@ -35,6 +36,8 @@ pub use self::monotonic_mapping::MonotonicallyMappableToU64;
|
|||||||
pub use self::serialize::{
|
pub use self::serialize::{
|
||||||
estimate, serialize, serialize_and_load, serialize_u128, NormalizedHeader,
|
estimate, serialize, serialize_and_load, serialize_u128, NormalizedHeader,
|
||||||
};
|
};
|
||||||
|
pub use sparse_codec_wrapper::DenseCodec;
|
||||||
|
pub use sparse_codec_wrapper::SparseCodecRoaringBitmap;
|
||||||
|
|
||||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
|
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
|
||||||
#[repr(u8)]
|
#[repr(u8)]
|
||||||
@@ -76,6 +79,44 @@ impl FastFieldCodecType {
|
|||||||
pub fn open_u128(bytes: OwnedBytes) -> io::Result<Arc<dyn Column<u128>>> {
|
pub fn open_u128(bytes: OwnedBytes) -> io::Result<Arc<dyn Column<u128>>> {
|
||||||
Ok(Arc::new(CompactSpaceDecompressor::open(bytes)?))
|
Ok(Arc::new(CompactSpaceDecompressor::open(bytes)?))
|
||||||
}
|
}
|
||||||
|
//DenseCodec
|
||||||
|
//
|
||||||
|
/// Returns the correct codec reader wrapped in the `Arc` for the data.
|
||||||
|
pub fn open_dense<T: MonotonicallyMappableToU64>(
|
||||||
|
mut bytes: OwnedBytes,
|
||||||
|
fill_ratio: u32,
|
||||||
|
) -> io::Result<Arc<dyn Column<T>>> {
|
||||||
|
let header = Header::deserialize(&mut bytes)?;
|
||||||
|
match header.codec_type {
|
||||||
|
FastFieldCodecType::Bitpacked => {
|
||||||
|
open_specific_codec_dense::<BitpackedCodec, _>(bytes, &header, fill_ratio)
|
||||||
|
}
|
||||||
|
FastFieldCodecType::Linear => {
|
||||||
|
open_specific_codec_dense::<LinearCodec, _>(bytes, &header, fill_ratio)
|
||||||
|
}
|
||||||
|
FastFieldCodecType::BlockwiseLinear => {
|
||||||
|
open_specific_codec_dense::<BlockwiseLinearCodec, _>(bytes, &header, fill_ratio)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn open_specific_codec_dense<C: FastFieldCodec, Item: MonotonicallyMappableToU64>(
|
||||||
|
bytes: OwnedBytes,
|
||||||
|
header: &Header,
|
||||||
|
fill_ratio: u32,
|
||||||
|
) -> io::Result<Arc<dyn Column<Item>>> {
|
||||||
|
let normalized_header = header.normalized();
|
||||||
|
let reader = C::open_from_bytes(bytes, normalized_header)?;
|
||||||
|
let reader = DenseCodec::with_fill_ratio(reader, fill_ratio);
|
||||||
|
let min_value = header.min_value;
|
||||||
|
if let Some(gcd) = header.gcd {
|
||||||
|
let monotonic_mapping = move |val: u64| Item::from_u64(min_value + val * gcd.get());
|
||||||
|
Ok(Arc::new(monotonic_map_column(reader, monotonic_mapping)))
|
||||||
|
} else {
|
||||||
|
let monotonic_mapping = move |val: u64| Item::from_u64(min_value + val);
|
||||||
|
Ok(Arc::new(monotonic_map_column(reader, monotonic_mapping)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Returns the correct codec reader wrapped in the `Arc` for the data.
|
/// Returns the correct codec reader wrapped in the `Arc` for the data.
|
||||||
pub fn open<T: MonotonicallyMappableToU64>(
|
pub fn open<T: MonotonicallyMappableToU64>(
|
||||||
@@ -440,6 +481,7 @@ mod bench {
|
|||||||
let data: Vec<_> = get_data();
|
let data: Vec<_> = get_data();
|
||||||
bench_get::<BitpackedCodec>(b, &data);
|
bench_get::<BitpackedCodec>(b, &data);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[bench]
|
#[bench]
|
||||||
fn bench_fastfield_bitpack_get_dynamic(b: &mut Bencher) {
|
fn bench_fastfield_bitpack_get_dynamic(b: &mut Bencher) {
|
||||||
let data: Vec<_> = get_data();
|
let data: Vec<_> = get_data();
|
||||||
|
|||||||
157
fastfield_codecs/src/sparse_codec_wrapper.rs
Normal file
157
fastfield_codecs/src/sparse_codec_wrapper.rs
Normal file
@@ -0,0 +1,157 @@
|
|||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use rand::{thread_rng, Rng};
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
|
use crate::Column;
|
||||||
|
|
||||||
|
pub struct SparseCodecRoaringBitmap {
|
||||||
|
null: RoaringBitmap,
|
||||||
|
column: Arc<dyn Column<u64>>, // column: C,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SparseCodecRoaringBitmap {
|
||||||
|
pub fn with_full(column: Arc<dyn Column<u64>>) -> Self {
|
||||||
|
let mut rb = RoaringBitmap::new();
|
||||||
|
rb.insert_range(0..column.num_vals() as u32 + 1);
|
||||||
|
Self { null: rb, column }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Column for SparseCodecRoaringBitmap {
    /// Returns the value stored for `idx`, or `0` when the bitmap has no bit
    /// set at `idx`.
    ///
    /// `idx` is narrowed to `u32` because the roaring bitmap is keyed by `u32`.
    fn get_val(&self, idx: u64) -> u64 {
        let doc = idx as u32;
        if !self.null.contains(doc) {
            return 0; // TODO null
        }
        // `rank` counts the set bits that are <= `doc`, including `doc`
        // itself, so the zero-based position in the wrapped column is
        // `rank - 1`. The `contains` check above guarantees `rank >= 1`.
        let position_of_val = self.null.rank(doc) - 1;
        self.column.get_val(position_of_val)
    }

    /// Not implemented yet.
    fn min_value(&self) -> u64 {
        todo!()
    }

    /// Not implemented yet.
    fn max_value(&self) -> u64 {
        todo!()
    }

    /// Not implemented yet.
    fn num_vals(&self) -> u64 {
        todo!()
    }
}
|
||||||
|
|
||||||
|
/// Wraps a column with a bitmap stored as 64-bit blocks plus, for each block,
/// the number of set bits in all blocks before it.
pub struct DenseCodec<C> {
    /// The bitmap, one `u64` per block of 64 positions.
    blocks: Vec<u64>,
    /// For each block, the count of set bits in all preceding blocks.
    offsets: Vec<u32>,
    /// The wrapped column that holds the stored values.
    column: C,
}
|
||||||
|
|
||||||
|
impl<C: Column> DenseCodec<C> {
|
||||||
|
// fill ratio valid range 0..1000 1000 == all elements, 1 == every 1000th element
|
||||||
|
pub fn with_fill_ratio(column: C, fill_ratio: u32) -> Self {
|
||||||
|
let mut rng = thread_rng();
|
||||||
|
let num_blocks = (column.num_vals() as usize / 64) + 1;
|
||||||
|
let mut blocks = Vec::with_capacity(num_blocks);
|
||||||
|
let mut offsets = Vec::with_capacity(num_blocks);
|
||||||
|
// fill all blocks
|
||||||
|
let mut offset = 0;
|
||||||
|
for _block_num in 0..num_blocks {
|
||||||
|
let mut block = 0;
|
||||||
|
for n in 0..64 {
|
||||||
|
if rng.gen_range(0..=1000) <= fill_ratio {
|
||||||
|
set_bit_at(&mut block, n);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
blocks.push(block);
|
||||||
|
offsets.push(offset);
|
||||||
|
offset += block.count_ones();
|
||||||
|
}
|
||||||
|
|
||||||
|
Self {
|
||||||
|
blocks,
|
||||||
|
offsets,
|
||||||
|
column,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn with_full(column: C) -> Self {
|
||||||
|
let num_blocks = (column.num_vals() as usize / 64) + 1;
|
||||||
|
let mut blocks = Vec::with_capacity(num_blocks);
|
||||||
|
let mut offsets = Vec::with_capacity(num_blocks);
|
||||||
|
// fill all blocks
|
||||||
|
let mut offset = 0;
|
||||||
|
for _block_num in 0..num_blocks {
|
||||||
|
let block = u64::MAX;
|
||||||
|
blocks.push(block);
|
||||||
|
offsets.push(offset);
|
||||||
|
offset += block.count_ones();
|
||||||
|
}
|
||||||
|
|
||||||
|
Self {
|
||||||
|
blocks,
|
||||||
|
offsets,
|
||||||
|
column,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns a mask with the lowest `msb` bits set and all other bits cleared.
///
/// `msb` is expected to be below 64.
fn gen_mask(msb: u64) -> u64 {
    !(u64::MAX << msb)
}
|
||||||
|
|
||||||
|
/// Returns `true` when bit `n` (counting from the least significant bit) of
/// `input` is set.
fn get_bit_at(input: u64, n: u64) -> bool {
    (input >> n) & 1 == 1
}
|
||||||
|
|
||||||
|
/// Sets bit `n` (counting from the least significant bit) of `input`, leaving
/// every other bit untouched.
fn set_bit_at(input: &mut u64, n: u64) {
    let bit = 1u64 << n;
    *input |= bit;
}
|
||||||
|
|
||||||
|
impl<C: Column> Column for DenseCodec<C> {
|
||||||
|
fn get_val(&self, idx: u64) -> u64 {
|
||||||
|
let block_pos = idx / 64;
|
||||||
|
let pos_in_block = idx % 64;
|
||||||
|
let offset = self.offsets[block_pos as usize];
|
||||||
|
let bitvec = self.blocks[block_pos as usize];
|
||||||
|
let offset_in_block = (bitvec & gen_mask(pos_in_block)).count_ones();
|
||||||
|
let dense_idx = offset as u64 + offset_in_block as u64;
|
||||||
|
if get_bit_at(bitvec, pos_in_block) {
|
||||||
|
self.column.get_val(dense_idx)
|
||||||
|
} else {
|
||||||
|
0 // TODO null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn min_value(&self) -> u64 {
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn max_value(&self) -> u64 {
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn num_vals(&self) -> u64 {
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    // NOTE(review): the smoke test below is commented out in this revision;
    // the reason is not recorded here.
    //
    // use itertools::Itertools;
    //
    // use super::*;
    // use crate::serialize_and_load;
    //
    // #[test]
    // fn dense_test() {
    //     let data = (0..100u64).collect_vec();
    //     {
    //         let column = serialize_and_load(&data);
    //         let dense = DenseCodec::with_full(column);
    //         for i in 0..100 {
    //             dense.get_val(i);
    //         }
    //     }
    // }
}
|
||||||
Reference in New Issue
Block a user