mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
Improvement on the scalar / random bitpacker code. (#1781)
* Improvement on the scalar / random bitpacker code. Added proptesting Added simple benchmark Added assert and comments on the very non trivial hidden contract Remove the need for an extra padding. The last point introduces a small performance regression (~10%). * Fixing unit tests
This commit is contained in:
@@ -15,3 +15,7 @@ homepage = "https://github.com/quickwit-oss/tantivy"
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.8"
|
||||
proptest = "1"
|
||||
|
||||
@@ -4,9 +4,42 @@ extern crate test;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use rand::seq::IteratorRandom;
|
||||
use rand::thread_rng;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
use tantivy_bitpacker::BlockedBitpacker;
|
||||
use test::Bencher;
|
||||
|
||||
|
||||
#[inline(never)]
|
||||
fn create_bitpacked_data(bit_width: u8, num_els: u32) -> Vec<u8> {
|
||||
let mut bitpacker = BitPacker::new();
|
||||
let mut buffer = Vec::new();
|
||||
for _ in 0..num_els {
|
||||
// the values do not matter.
|
||||
bitpacker.write(0u64, bit_width, &mut buffer).unwrap();
|
||||
bitpacker.flush(&mut buffer).unwrap();
|
||||
}
|
||||
buffer
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_bitpacking_read(b: &mut Bencher) {
|
||||
let bit_width = 3;
|
||||
let num_els = 1_000_000u32;
|
||||
let bit_unpacker = BitUnpacker::new(bit_width);
|
||||
let data = create_bitpacked_data(bit_width, num_els);
|
||||
let idxs: Vec<u32> = (0..num_els).choose_multiple(&mut thread_rng(), 100_000);
|
||||
b.iter(|| {
|
||||
let mut out = 0u64;
|
||||
for &idx in &idxs {
|
||||
out = out.wrapping_add(bit_unpacker.get(idx, &data[..]));
|
||||
}
|
||||
out
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_blockedbitp_read(b: &mut Bencher) {
|
||||
let mut blocked_bitpacker = BlockedBitpacker::new();
|
||||
@@ -14,9 +47,9 @@ mod tests {
|
||||
blocked_bitpacker.add(val * val);
|
||||
}
|
||||
b.iter(|| {
|
||||
let mut out = 0;
|
||||
let mut out = 0u64;
|
||||
for val in 0..=21500 {
|
||||
out = blocked_bitpacker.get(val);
|
||||
out = out.wrapping_add(blocked_bitpacker.get(val));
|
||||
}
|
||||
out
|
||||
});
|
||||
|
||||
@@ -56,27 +56,31 @@ impl BitPacker {
|
||||
|
||||
pub fn close<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
self.flush(output)?;
|
||||
// Padding the write file to simplify reads.
|
||||
output.write_all(&[0u8; 7])?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
#[derive(Clone, Debug, Default, Copy)]
|
||||
pub struct BitUnpacker {
|
||||
num_bits: u64,
|
||||
num_bits: u32,
|
||||
mask: u64,
|
||||
}
|
||||
|
||||
impl BitUnpacker {
|
||||
/// Creates a bit unpacker, that assumes the same bitwidth for all values.
|
||||
///
|
||||
/// The bitunpacker works by doing an unaligned read of 8 bytes.
|
||||
/// For this reason, values of `num_bits` between
|
||||
/// [57..63] are forbidden.
|
||||
pub fn new(num_bits: u8) -> BitUnpacker {
|
||||
assert!(num_bits <= 7 * 8 || num_bits == 64);
|
||||
let mask: u64 = if num_bits == 64 {
|
||||
!0u64
|
||||
} else {
|
||||
(1u64 << num_bits) - 1u64
|
||||
};
|
||||
BitUnpacker {
|
||||
num_bits: u64::from(num_bits),
|
||||
num_bits: u32::from(num_bits),
|
||||
mask,
|
||||
}
|
||||
}
|
||||
@@ -87,28 +91,40 @@ impl BitUnpacker {
|
||||
|
||||
#[inline]
|
||||
pub fn get(&self, idx: u32, data: &[u8]) -> u64 {
|
||||
if self.num_bits == 0 {
|
||||
return 0u64;
|
||||
}
|
||||
let addr_in_bits = idx * self.num_bits as u32;
|
||||
let addr_in_bits = idx * self.num_bits;
|
||||
let addr = (addr_in_bits >> 3) as usize;
|
||||
if addr + 8 > data.len() {
|
||||
if self.num_bits == 0 {
|
||||
return 0;
|
||||
}
|
||||
let bit_shift = addr_in_bits & 7;
|
||||
return self.get_slow_path(addr, bit_shift, data);
|
||||
}
|
||||
let bit_shift = addr_in_bits & 7;
|
||||
debug_assert!(
|
||||
addr + 8 <= data.len(),
|
||||
"The fast field field should have been padded with 7 bytes."
|
||||
);
|
||||
let bytes: [u8; 8] = (&data[addr..addr + 8]).try_into().unwrap();
|
||||
let val_unshifted_unmasked: u64 = u64::from_le_bytes(bytes);
|
||||
let val_shifted = val_unshifted_unmasked >> bit_shift;
|
||||
val_shifted & self.mask
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
fn get_slow_path(&self, addr: usize, bit_shift: u32, data: &[u8]) -> u64 {
|
||||
let mut bytes: [u8; 8] = [0u8; 8];
|
||||
let available_bytes = data.len() - addr;
|
||||
// This function is meant to only be called if we did not have 8 bytes to load.
|
||||
debug_assert!(available_bytes < 8);
|
||||
bytes[..available_bytes].copy_from_slice(&data[addr..]);
|
||||
let val_unshifted_unmasked: u64 = u64::from_le_bytes(bytes);
|
||||
let val_shifted = val_unshifted_unmasked >> bit_shift;
|
||||
val_shifted & self.mask
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::{BitPacker, BitUnpacker};
|
||||
|
||||
fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec<u64>, Vec<u8>) {
|
||||
fn create_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec<u64>, Vec<u8>) {
|
||||
let mut data = Vec::new();
|
||||
let mut bitpacker = BitPacker::new();
|
||||
let max_val: u64 = (1u64 << num_bits as u64) - 1u64;
|
||||
@@ -125,7 +141,7 @@ mod test {
|
||||
}
|
||||
|
||||
fn test_bitpacker_util(len: usize, num_bits: u8) {
|
||||
let (bitunpacker, vals, data) = create_fastfield_bitpacker(len, num_bits);
|
||||
let (bitunpacker, vals, data) = create_bitpacker(len, num_bits);
|
||||
for (i, val) in vals.iter().enumerate() {
|
||||
assert_eq!(bitunpacker.get(i as u32, &data), *val);
|
||||
}
|
||||
@@ -139,4 +155,56 @@ mod test {
|
||||
test_bitpacker_util(6, 14);
|
||||
test_bitpacker_util(1000, 14);
|
||||
}
|
||||
|
||||
use proptest::prelude::*;
|
||||
|
||||
fn num_bits_strategy() -> impl Strategy<Value = u8> {
|
||||
prop_oneof!(
|
||||
Just(0),
|
||||
Just(1),
|
||||
2u8..56u8,
|
||||
Just(56),
|
||||
Just(64),
|
||||
)
|
||||
}
|
||||
|
||||
fn vals_strategy() -> impl Strategy<Value = (u8, Vec<u64>)> {
|
||||
(num_bits_strategy(), 0usize..100usize).prop_flat_map(|(num_bits, len)| {
|
||||
let max_val = if num_bits == 64 {
|
||||
u64::MAX
|
||||
} else {
|
||||
(1u64 << num_bits as u32) - 1
|
||||
};
|
||||
let vals = proptest::collection::vec(0..=max_val, len);
|
||||
vals.prop_map(move |vals| (num_bits, vals))
|
||||
})
|
||||
}
|
||||
|
||||
fn test_bitpacker_aux(num_bits: u8, vals: &[u64]) {
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
let mut bitpacker = BitPacker::new();
|
||||
for &val in vals {
|
||||
bitpacker.write(val, num_bits, &mut buffer).unwrap();
|
||||
}
|
||||
bitpacker.flush(&mut buffer).unwrap();
|
||||
assert_eq!(buffer.len(), (vals.len() * num_bits as usize + 7) / 8);
|
||||
let bitunpacker = BitUnpacker::new(num_bits);
|
||||
let max_val =
|
||||
if num_bits == 64 {
|
||||
u64::MAX
|
||||
} else {
|
||||
(1u64 << num_bits) - 1
|
||||
};
|
||||
for (i, val) in vals.iter().copied().enumerate() {
|
||||
assert!(val <= max_val);
|
||||
assert_eq!(bitunpacker.get(i as u32, &buffer), val);
|
||||
}
|
||||
}
|
||||
|
||||
proptest::proptest! {
|
||||
#[test]
|
||||
fn test_bitpacker_proptest((num_bits, vals) in vals_strategy()) {
|
||||
test_bitpacker_aux(num_bits, &vals);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user