Compare commits

...

1 Commits

Author SHA1 Message Date
Paul Masurel
bebfca21a8 headroom 2022-12-08 18:45:38 +09:00
2 changed files with 66 additions and 65 deletions

View File

@@ -23,75 +23,69 @@ pub struct DenseCodec {
// u32 is the offset of the block, the number of set bits so far. // u32 is the offset of the block, the number of set bits so far.
// //
// At the end one block is appended, to store the number of values in the index in offset. // At the end one block is appended, to store the number of values in the index in offset.
data: OwnedBytes, data: Vec<IndexBlock>,
} }
const ELEMENTS_PER_BLOCK: u32 = 64; const ELEMENTS_PER_BLOCK: u32 = 32;
const BLOCK_BITVEC_SIZE: usize = 8;
const BLOCK_OFFSET_SIZE: usize = 4;
const SERIALIZED_BLOCK_SIZE: usize = BLOCK_BITVEC_SIZE + BLOCK_OFFSET_SIZE;
fn count_ones(block: u64, pos_in_block: u32) -> u32 { #[inline]
if pos_in_block == 63 { fn count_ones(block: u32, pos_in_block: u32) -> u32 {
block.count_ones() unsafe { core::arch::x86_64::_bzhi_u32(block, pos_in_block + 1) }.count_ones()
} else { // if pos_in_block == 31 {
let mask = (1u64 << (pos_in_block + 1)) - 1; // block.count_ones()
let masked_block = block & mask; // } else {
masked_block.count_ones() // let mask = (1u32 << (pos_in_block + 1)) - 1;
} // let masked_block = block & mask;
// masked_block.count_ones()
// }
}
#[derive(Copy, Clone, Debug)]
pub struct IndexBlock {
bitvec: u32,
offset: u32,
} }
impl DenseCodec { impl DenseCodec {
/// Open the DenseCodec from OwnedBytes /// Open the DenseCodec from OwnedBytes
pub fn open(data: OwnedBytes) -> Self { pub fn open(data: Vec<IndexBlock>) -> Self {
Self { data } Self { data }
} }
#[inline] #[inline]
/// Check if value at position is not null. /// Check if value at position is not null.
pub fn exists(&self, idx: u32) -> bool { pub fn exists(&self, idx: u32) -> bool {
let block_pos = idx / ELEMENTS_PER_BLOCK; let block_pos = idx / ELEMENTS_PER_BLOCK;
let bitvec: u64 = self.block(block_pos as usize); let bitvec: u32 = self.block(block_pos);
let pos_in_block = idx % ELEMENTS_PER_BLOCK; let pos_in_block = idx % ELEMENTS_PER_BLOCK;
get_bit_at(bitvec, pos_in_block) get_bit_at(bitvec, pos_in_block)
} }
#[inline]
pub(crate) fn block(&self, block_pos: usize) -> u64 {
let data = &mut &self.data[block_pos as usize * SERIALIZED_BLOCK_SIZE..];
let block: u64 = #[inline]
BinarySerializable::deserialize(data).expect("could not read block in null index"); pub(crate) fn block(&self, block_pos: u32) -> u32 {
block self.block_and_offset(block_pos).bitvec
} }
#[inline] #[inline]
/// Returns (bitvec, offset) /// Returns (bitvec, offset)
/// ///
/// offset is the start offset of actual docids in the block. /// offset is the start offset of actual docids in the block.
pub(crate) fn block_and_offset(&self, block_pos: u32) -> (u64, u32) { pub(crate) fn block_and_offset(&self, block_pos: u32) -> IndexBlock {
let data = &mut &self.data[block_pos as usize * SERIALIZED_BLOCK_SIZE..]; self.data[block_pos as usize]
let block: u64 =
BinarySerializable::deserialize(data).expect("could not read block in null index");
let offset: u32 =
BinarySerializable::deserialize(data).expect("could not read block in null index");
(block, offset)
} }
/// Return the number of non-null values in an index /// Return the number of non-null values in an index
pub fn num_non_null_vals(&self) -> u32 { pub fn num_non_null_vals(&self) -> u32 {
let last_block = (self.data.len() / SERIALIZED_BLOCK_SIZE) - 1; let last_block = self.data.len() - 1;
self.block_and_offset(last_block as u32).1 self.block_and_offset(last_block as u32).offset
} }
#[inline] #[inline]
/// Translate from the original index to the codec index. /// Translate from the original index to the codec index.
pub fn translate_to_codec_idx(&self, idx: u32) -> Option<u32> { pub fn translate_to_codec_idx(&self, idx: u32) -> Option<u32> {
let block_pos = idx / ELEMENTS_PER_BLOCK; let block_pos = idx / ELEMENTS_PER_BLOCK;
let (block, offset) = self.block_and_offset(block_pos); let IndexBlock { bitvec: block, offset } = self.block_and_offset(block_pos);
let pos_in_block = idx % ELEMENTS_PER_BLOCK; let pos_in_block = idx % ELEMENTS_PER_BLOCK;
let ones_in_block = count_ones(block, pos_in_block);
if get_bit_at(block, pos_in_block) { if get_bit_at(block, pos_in_block) {
let ones_in_block = count_ones(block, pos_in_block);
Some(offset + ones_in_block - 1) // -1 is ok, since idx does exist, so there's at least Some(offset + ones_in_block - 1) // -1 is ok, since idx does exist, so there's at least
// one // one
} else { } else {
@@ -112,8 +106,7 @@ impl DenseCodec {
iter.map(move |dense_idx| { iter.map(move |dense_idx| {
// update block_pos to limit search scope // update block_pos to limit search scope
block_pos = find_block(dense_idx, block_pos, &self.data); block_pos = find_block(dense_idx, block_pos, &self.data);
let (bitvec, offset) = self.block_and_offset(block_pos); let IndexBlock { bitvec, offset} = self.block_and_offset(block_pos);
// The next offset is higher than dense_idx and therefore: // The next offset is higher than dense_idx and therefore:
// dense_idx <= offset + num_set_bits in block // dense_idx <= offset + num_set_bits in block
let mut num_set_bits = 0; let mut num_set_bits = 0;
@@ -138,43 +131,36 @@ impl DenseCodec {
/// dense_idx needs to be smaller than the number of values in the index /// dense_idx needs to be smaller than the number of values in the index
/// ///
/// The last offset number is equal to the number of values in the index. /// The last offset number is equal to the number of values in the index.
fn find_block(dense_idx: u32, mut block_pos: u32, data: &[u8]) -> u32 { fn find_block(dense_idx: u32, mut block_pos: u32, data: &[IndexBlock]) -> u32 {
loop { for i in block_pos.. {
let data = &mut &data[BLOCK_BITVEC_SIZE + block_pos as usize * SERIALIZED_BLOCK_SIZE..]; let index_block = &data[i as usize];
let offset: u32 = BinarySerializable::deserialize(data) if index_block.offset > dense_idx {
.expect("could not read offset from block in null index");
if offset > dense_idx {
// offset // offset
return block_pos - 1; return i - 1;
} }
block_pos += 1;
} }
unreachable!()
} }
/// Iterator over all values, true if set, otherwise false /// Iterator over all values, true if set, otherwise false
pub fn serialize_dense_codec( pub fn serialize_dense_codec(
iter: impl Iterator<Item = bool>, iter: impl Iterator<Item = bool>,
mut out: impl Write, out: &mut Vec<IndexBlock>,
) -> io::Result<()> { ) -> io::Result<()> {
let mut offset: u32 = 0; let mut offset: u32 = 0;
for chunk in &iter.chunks(ELEMENTS_PER_BLOCK as usize) { for chunk in &iter.chunks(ELEMENTS_PER_BLOCK as usize) {
let mut block: u64 = 0; let mut bitvec: u32 = 0;
for (pos, is_bit_set) in chunk.enumerate() { for (pos, is_bit_set) in chunk.enumerate() {
if is_bit_set { if is_bit_set {
set_bit_at(&mut block, pos as u64); set_bit_at(&mut bitvec, pos as u32);
} }
} }
out.push(IndexBlock { bitvec, offset});
block.serialize(&mut out)?; offset += bitvec.count_ones() as u32;
offset.serialize(&mut out)?;
offset += block.count_ones() as u32;
} }
// Add sentinal block for the offset // Add sentinal block for the offset
let block: u64 = 0; out.push(IndexBlock { bitvec: 0, offset });
block.serialize(&mut out)?;
offset.serialize(&mut out)?;
Ok(()) Ok(())
} }
@@ -199,6 +185,15 @@ mod tests {
.boxed() .boxed()
} }
#[test]
fn test_with_random_bitvecs_simple() {
let mut bitvec = Vec::new();
bitvec.extend_from_slice(&[]);
bitvec.extend_from_slice(&[]);
bitvec.extend_from_slice(&[true]);
test_null_index(bitvec);
}
proptest! { proptest! {
#![proptest_config(ProptestConfig::with_cases(500))] #![proptest_config(ProptestConfig::with_cases(500))]
#[test] #[test]
@@ -222,7 +217,8 @@ mod tests {
let mut out = vec![]; let mut out = vec![];
serialize_dense_codec(data.iter().cloned(), &mut out).unwrap(); serialize_dense_codec(data.iter().cloned(), &mut out).unwrap();
let null_index = DenseCodec::open(OwnedBytes::new(out)); dbg!(&out);
let null_index = DenseCodec::open(out);
let orig_idx_with_value: Vec<u32> = data let orig_idx_with_value: Vec<u32> = data
.iter() .iter()
@@ -256,7 +252,7 @@ mod tests {
let iter = ([true, false, true, false]).iter().cloned(); let iter = ([true, false, true, false]).iter().cloned();
serialize_dense_codec(iter, &mut out).unwrap(); serialize_dense_codec(iter, &mut out).unwrap();
let null_index = DenseCodec::open(OwnedBytes::new(out)); let null_index = DenseCodec::open(out);
assert_eq!( assert_eq!(
null_index null_index
@@ -272,7 +268,7 @@ mod tests {
let iter = ([true, false, true, false]).iter().cloned(); let iter = ([true, false, true, false]).iter().cloned();
serialize_dense_codec(iter, &mut out).unwrap(); serialize_dense_codec(iter, &mut out).unwrap();
let null_index = DenseCodec::open(OwnedBytes::new(out)); let null_index = DenseCodec::open(out);
assert_eq!(null_index.translate_to_codec_idx(0), Some(0)); assert_eq!(null_index.translate_to_codec_idx(0), Some(0));
assert_eq!(null_index.translate_to_codec_idx(2), Some(1)); assert_eq!(null_index.translate_to_codec_idx(2), Some(1));
} }
@@ -283,7 +279,7 @@ mod tests {
let iter = ([true, false, true, false]).iter().cloned(); let iter = ([true, false, true, false]).iter().cloned();
serialize_dense_codec(iter, &mut out).unwrap(); serialize_dense_codec(iter, &mut out).unwrap();
let null_index = DenseCodec::open(OwnedBytes::new(out)); let null_index = DenseCodec::open(out);
assert!(null_index.exists(0)); assert!(null_index.exists(0));
assert!(!null_index.exists(1)); assert!(!null_index.exists(1));
assert!(null_index.exists(2)); assert!(null_index.exists(2));
@@ -299,7 +295,7 @@ mod tests {
let iter = docs.iter().cloned(); let iter = docs.iter().cloned();
let mut out = vec![]; let mut out = vec![];
serialize_dense_codec(iter, &mut out).unwrap(); serialize_dense_codec(iter, &mut out).unwrap();
let null_index = DenseCodec::open(OwnedBytes::new(out)); let null_index = DenseCodec::open(out);
assert!(!null_index.exists(0)); assert!(!null_index.exists(0));
assert!(!null_index.exists(100)); assert!(!null_index.exists(100));
assert!(!null_index.exists(999)); assert!(!null_index.exists(999));
@@ -324,6 +320,7 @@ mod tests {
#[cfg(all(test, feature = "unstable"))] #[cfg(all(test, feature = "unstable"))]
mod bench { mod bench {
use rand::rngs::StdRng; use rand::rngs::StdRng;
use rand::{Rng, SeedableRng}; use rand::{Rng, SeedableRng};
use test::Bencher; use test::Bencher;
@@ -337,7 +334,7 @@ mod bench {
let bools: Vec<_> = (0..100_000).map(|_| rng.gen_bool(8f64 / 10f64)).collect(); let bools: Vec<_> = (0..100_000).map(|_| rng.gen_bool(8f64 / 10f64)).collect();
serialize_dense_codec(bools.into_iter(), &mut out).unwrap(); serialize_dense_codec(bools.into_iter(), &mut out).unwrap();
let codec = DenseCodec::open(OwnedBytes::new(out)); let codec = DenseCodec::open(out);
codec codec
} }
@@ -347,7 +344,7 @@ mod bench {
bench.iter(|| { bench.iter(|| {
let mut dense_idx: Option<u32> = None; let mut dense_idx: Option<u32> = None;
for idx in 0..100_000 { for idx in 0..100_000 {
dense_idx = codec.translate_to_codec_idx(idx); dense_idx = dense_idx.or(codec.translate_to_codec_idx(idx));
} }
dense_idx dense_idx
}); });

View File

@@ -2,10 +2,14 @@ pub use dense::{serialize_dense_codec, DenseCodec};
mod dense; mod dense;
fn get_bit_at(input: u64, n: u32) -> bool {
#[inline(always)]
fn get_bit_at(input: u32, n: u32) -> bool {
input & (1 << n) != 0 input & (1 << n) != 0
} }
fn set_bit_at(input: &mut u64, n: u64) { #[inline(always)]
fn set_bit_at(input: &mut u32, n: u32) {
*input |= 1 << n; *input |= 1 << n;
} }