move code to bitset, add inline, add benchmark

This commit is contained in:
Pascal Seitz
2021-09-18 17:35:22 +08:00
parent c22177a005
commit 93cbd52bf0
4 changed files with 118 additions and 69 deletions

View File

@@ -16,6 +16,7 @@ pub struct TinySetIterator(TinySet);
impl Iterator for TinySetIterator {
type Item = u32;
/// Advances the iterator by delegating to `pop_lowest` on the wrapped `TinySet`.
///
/// Returns whatever `pop_lowest` returns: `Some(element)` or `None`.
#[inline]
fn next(&mut self) -> Option<Self::Item> {
self.0.pop_lowest()
}
@@ -34,6 +35,7 @@ impl TinySet {
writer.write_all(self.0.to_le_bytes().as_ref())
}
#[inline]
pub fn deserialize(data: [u8; 8]) -> io::Result<Self> {
let val: u64 = u64::from_le_bytes(data);
Ok(TinySet(val))
@@ -48,21 +50,25 @@ impl TinySet {
self.0 = 0u64;
}
#[inline]
/// Returns the complement of the set in `[0, 64[`.
fn complement(self) -> TinySet {
TinySet(!self.0)
}
/// Returns true iff the `TinySet` contains the element `el`.
///
/// Membership is tested by intersecting `self` with the singleton set
/// built from `el` and checking that the result is not empty.
#[inline]
pub fn contains(self, el: u32) -> bool {
    let probe = TinySet::singleton(el);
    let overlap = self.intersect(probe);
    !overlap.is_empty()
}
#[inline]
/// Returns the number of elements in the TinySet.
pub fn len(self) -> u32 {
self.0.count_ones()
}
#[inline]
/// Returns the intersection of `self` and `other`
pub fn intersect(self, other: TinySet) -> TinySet {
TinySet(self.0 & other.0)
@@ -146,8 +152,7 @@ impl BitSet {
/// Write a `BitSet`
///
pub fn serialize(&self, writer: &mut dyn Write) -> io::Result<()> {
//writer.write_all(self.len.to_le_bytes().as_ref())?;
//writer.write_all(self.max_value.to_le_bytes().as_ref())?;
writer.write_all(self.max_value.to_le_bytes().as_ref())?;
for tinyset in self.tinysets.iter() {
tinyset.serialize(writer)?;
@@ -158,12 +163,9 @@ impl BitSet {
/// Deserialize a `BitSet`. BitSet is considered immutable after deserialization.
///
pub fn deserialize(data: &[u8]) -> io::Result<Self> {
//let len: u64 = u64::from_le_bytes(data[..8].try_into().unwrap());
//data = &data[8..];
//let max_value: u32 = u32::from_le_bytes(data[..4].try_into().unwrap());
//data = &data[4..];
pub fn deserialize(mut data: &[u8]) -> io::Result<Self> {
let max_value: u32 = u32::from_le_bytes(data[..4].try_into().unwrap());
data = &data[4..];
let mut tinysets = vec![];
for chunk in data.chunks_exact(8) {
@@ -173,21 +175,35 @@ impl BitSet {
Ok(BitSet {
tinysets: tinysets.into_boxed_slice(),
len: 0,
max_value: 0,
max_value,
})
}
/// Iterate over the positions of the set elements
/// Iterate the tinyset on the fly from serialized data.
///
#[inline]
pub fn iter_positions_from_bytes<'a>(data: &'a [u8]) -> impl Iterator<Item = u32> + 'a {
data.chunks_exact(8)
pub fn iter_from_bytes<'a>(data: &'a [u8]) -> impl Iterator<Item = TinySet> + 'a {
data[4..].chunks_exact(8).map(move |chunk| {
let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap();
tinyset
})
}
/// Iterate over the positions of the unset elements.
///
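/// `max_val` is read from the first 4 bytes of `data`. It is needed because the
/// complement of the last 64-bit block may contain positions `>= max_val`,
/// which must not be yielded.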
#[inline]
pub fn iter_unset_from_bytes<'a>(data: &'a [u8]) -> impl Iterator<Item = u32> + 'a {
let max_val: u32 = u32::from_le_bytes(data[..4].try_into().unwrap());
Self::iter_from_bytes(data)
.map(|tinyset| tinyset.complement())
.enumerate()
.filter(|(_, tinyset)| !tinyset.is_empty())
.flat_map(|(chunk_num, chunk)| {
let tinyset = TinySet::deserialize(chunk.try_into().unwrap()).unwrap();
.flat_map(move |(chunk_num, tinyset)| {
let chunk_base_val = chunk_num as u32 * 64;
tinyset
.into_iter()
.map(move |val| val + chunk_num as u32 * 64)
.map(move |val| val + chunk_base_val)
.take_while(move |doc| *doc < max_val)
})
}
@@ -227,6 +243,15 @@ impl BitSet {
};
}
/// Returns true iff the element `el` is in the serialized `BitSet` held in `data`.
///
/// `data` is read as a 4-byte header followed by the bit payload: bit `el`
/// is stored in payload byte `el / 8`, at bit position `el % 8`.
///
/// # Panics
///
/// Panics if the byte that would hold `el` lies outside of `data`.
#[inline]
pub fn contains_from_bytes(el: u32, data: &[u8]) -> bool {
    // Skip the 4-byte header before addressing the payload.
    let byte_index = (4 + (el >> 3)) as usize;
    let mask = 1u8 << (el % 8);
    data[byte_index] & mask != 0
}
/// Returns true iff the element is in the `BitSet`.
pub fn contains(&self, el: u32) -> bool {
self.tinyset(el / 64u32).contains(el % 64)

View File

@@ -289,7 +289,7 @@ impl SegmentReader {
/// Returns an iterator that will iterate over the alive document ids
pub fn doc_ids_alive(&self) -> Box<dyn Iterator<Item = DocId> + '_> {
if let Some(delete_bitset) = &self.delete_bitset_opt {
Box::new(delete_bitset.iter_positions())
Box::new(delete_bitset.iter_unset())
} else {
Box::new(0u32..self.max_doc)
}

View File

@@ -3,8 +3,6 @@ use crate::directory::OwnedBytes;
use crate::space_usage::ByteCount;
use crate::DocId;
use common::BitSet;
use common::TinySet;
use std::convert::TryInto;
use std::io;
use std::io::Write;
@@ -13,29 +11,8 @@ use std::io::Write;
/// where `delete_bitset` is the set of deleted `DocId`.
/// Warning: this function does not call terminate. The caller is in charge of
/// closing the writer properly.
pub fn write_delete_bitset(
delete_bitset: &BitSet,
max_doc: u32,
writer: &mut dyn Write,
) -> io::Result<()> {
pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut dyn Write) -> io::Result<()> {
delete_bitset.serialize(writer)?;
//let mut byte = 0u8;
//let mut shift = 0u8;
//for doc in 0..max_doc {
//if delete_bitset.contains(doc) {
//byte |= 1 << shift;
//}
//if shift == 7 {
//writer.write_all(&[byte])?;
//shift = 0;
//byte = 0;
//} else {
//shift += 1;
//}
//}
//if max_doc % 8 > 0 {
//writer.write_all(&[byte])?;
//}
Ok(())
}
@@ -59,7 +36,7 @@ impl DeleteBitSet {
let directory = RamDirectory::create();
let path = Path::new("dummydeletebitset");
let mut wrt = directory.open_write(path).unwrap();
write_delete_bitset(&bitset, max_doc, &mut wrt).unwrap();
write_delete_bitset(&bitset, &mut wrt).unwrap();
wrt.terminate().unwrap();
let file = directory.open_read(path).unwrap();
Self::open(file).unwrap()
@@ -68,12 +45,8 @@ impl DeleteBitSet {
/// Opens a delete bitset given its file.
pub fn open(file: FileSlice) -> crate::Result<DeleteBitSet> {
let bytes = file.read_bytes()?;
let num_deleted = bytes
.chunks_exact(8)
.map(|chunk| {
let tinyset = TinySet::deserialize(chunk.try_into().unwrap()).unwrap();
tinyset.len() as usize
})
let num_deleted = BitSet::iter_from_bytes(bytes.as_slice())
.map(|tinyset| tinyset.len() as usize)
.sum();
Ok(DeleteBitSet {
@@ -91,17 +64,15 @@ impl DeleteBitSet {
/// Returns true iff the document has been marked as deleted.
#[inline]
pub fn is_deleted(&self, doc: DocId) -> bool {
let byte_offset = doc / 8u32;
let b: u8 = self.data.as_slice()[byte_offset as usize];
let shift = (doc & 7u32) as u8;
b & (1u8 << shift) != 0
let data = self.data.as_slice();
BitSet::contains_from_bytes(doc, data)
}
/// Iterate over the positions of the unset elements
#[inline]
pub fn iter_positions(&self) -> impl Iterator<Item = u32> + '_ {
pub fn iter_unset(&self) -> impl Iterator<Item = u32> + '_ {
let data = self.data.as_slice();
BitSet::iter_positions_from_bytes(data)
BitSet::iter_unset_from_bytes(data)
}
/// The number of deleted docs
@@ -152,29 +123,82 @@ mod tests {
fn test_delete_bitset_iter_minimal() {
let delete_bitset = DeleteBitSet::for_test(&[7], 8);
let data: Vec<_> = delete_bitset.iter_positions().collect();
assert_eq!(data, vec![7]);
let data: Vec<_> = delete_bitset.iter_unset().collect();
assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]);
}
#[test]
fn test_delete_bitset_iter_small() {
let delete_bitset = DeleteBitSet::for_test(&[0, 2, 3, 6], 7);
let data: Vec<_> = delete_bitset.iter_positions().collect();
assert_eq!(data, vec![0, 2, 3, 6]);
let data: Vec<_> = delete_bitset.iter_unset().collect();
assert_eq!(data, vec![1, 4, 5]);
}
#[test]
fn test_delete_bitset_iter() {
let delete_bitset = DeleteBitSet::for_test(&[1, 2, 3, 5, 10, 64, 65, 66, 100], 110);
let delete_bitset = DeleteBitSet::for_test(&[0, 1, 1000], 1001);
let data: Vec<_> = delete_bitset.iter_positions().collect();
assert_eq!(data, vec![1, 2, 3, 5, 10, 64, 65, 66, 100]);
}
#[test]
fn test_delete_bitset_iter_empty_blocks() {
let delete_bitset = DeleteBitSet::for_test(&[1, 2, 3, 5, 10, 64, 65, 66, 100, 1000], 1010);
let data: Vec<_> = delete_bitset.iter_positions().collect();
assert_eq!(data, vec![1, 2, 3, 5, 10, 64, 65, 66, 100, 1000]);
let data: Vec<_> = delete_bitset.iter_unset().collect();
assert_eq!(data, (2..=999).collect::<Vec<_>>());
}
}
/// Benchmarks for enumerating the alive documents of a `DeleteBitSet`, either
/// by iterating the serialized bitset (`iter_unset`) or by probing every doc
/// id individually (`is_alive`).
#[cfg(all(test, feature = "unstable"))]
mod bench {
    use super::DeleteBitSet;
    use common::BitSet;
    use rand::prelude::IteratorRandom;
    use rand::prelude::SliceRandom;
    use rand::thread_rng;
    use test::Bencher;

    /// Number of documents in every benchmarked bitset.
    const NUM_DOCS: u32 = 1_000_000;

    /// Returns a sorted, uniformly random selection of 7/8 of the doc ids in
    /// `0..NUM_DOCS`. Used as the deleted set, this leaves 1/8 of the docs
    /// alive, as the `*_1_8_alive` benchmark names state.
    ///
    /// The sample is drawn in a single pass instead of repeatedly calling
    /// `Vec::remove` at a random index, which is quadratic in the number of
    /// documents.
    fn get_many_deleted() -> Vec<u32> {
        let num_deleted = (NUM_DOCS as usize) * 7 / 8;
        let mut deleted = (0..NUM_DOCS).choose_multiple(&mut thread_rng(), num_deleted);
        // `choose_multiple` gives no ordering guarantee; return ascending doc ids.
        deleted.sort_unstable();
        deleted
    }

    /// Collects alive docs through `iter_unset` when almost every doc is alive.
    #[bench]
    fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) {
        let delete_bitset = DeleteBitSet::for_test(&[0, 1, 1000, 10000], NUM_DOCS);
        bench.iter(|| delete_bitset.iter_unset().collect::<Vec<_>>());
    }

    /// Collects alive docs by probing `is_alive` for every doc id when almost
    /// every doc is alive.
    #[bench]
    fn bench_deletebitset_access(bench: &mut Bencher) {
        let delete_bitset = DeleteBitSet::for_test(&[0, 1, 1000, 10000], NUM_DOCS);
        bench.iter(|| {
            (0..NUM_DOCS)
                .filter(|doc| delete_bitset.is_alive(*doc))
                .collect::<Vec<_>>()
        });
    }

    /// Collects alive docs through `iter_unset` when 1/8 of the docs are alive.
    #[bench]
    fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
        let delete_bitset = DeleteBitSet::for_test(&get_many_deleted(), NUM_DOCS);
        bench.iter(|| delete_bitset.iter_unset().collect::<Vec<_>>());
    }

    /// Collects alive docs by probing `is_alive` for every doc id when 1/8 of
    /// the docs are alive.
    #[bench]
    fn bench_deletebitset_access_1_8_alive(bench: &mut Bencher) {
        let delete_bitset = DeleteBitSet::for_test(&get_many_deleted(), NUM_DOCS);
        bench.iter(|| {
            (0..NUM_DOCS)
                .filter(|doc| delete_bitset.is_alive(*doc))
                .collect::<Vec<_>>()
        });
    }
}

View File

@@ -180,7 +180,7 @@ pub(crate) fn advance_deletes(
// There are new deletes. We need to write a new delete file.
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
let mut delete_file = segment.open_write(SegmentComponent::Delete)?;
write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?;
write_delete_bitset(&delete_bitset, &mut delete_file)?;
delete_file.terminate()?;
}