From 93cbd52bf06002d626b11a2ea3ad1d6e55723298 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Sat, 18 Sep 2021 17:35:22 +0800 Subject: [PATCH] move code to biset, add inline, add benchmark --- common/src/bitset.rs | 57 +++++++++++----- src/core/segment_reader.rs | 2 +- src/fastfield/delete.rs | 126 +++++++++++++++++++++--------------- src/indexer/index_writer.rs | 2 +- 4 files changed, 118 insertions(+), 69 deletions(-) diff --git a/common/src/bitset.rs b/common/src/bitset.rs index f8bbb0fd5..df4dda632 100644 --- a/common/src/bitset.rs +++ b/common/src/bitset.rs @@ -16,6 +16,7 @@ pub struct TinySetIterator(TinySet); impl Iterator for TinySetIterator { type Item = u32; + #[inline] fn next(&mut self) -> Option { self.0.pop_lowest() } @@ -34,6 +35,7 @@ impl TinySet { writer.write_all(self.0.to_le_bytes().as_ref()) } + #[inline] pub fn deserialize(data: [u8; 8]) -> io::Result { let val: u64 = u64::from_le_bytes(data); Ok(TinySet(val)) @@ -48,21 +50,25 @@ impl TinySet { self.0 = 0u64; } + #[inline] /// Returns the complement of the set in `[0, 64[`. fn complement(self) -> TinySet { TinySet(!self.0) } + #[inline] /// Returns true iff the `TinySet` contains the element `el`. pub fn contains(self, el: u32) -> bool { !self.intersect(TinySet::singleton(el)).is_empty() } + #[inline] /// Returns the number of elements in the TinySet. pub fn len(self) -> u32 { self.0.count_ones() } + #[inline] /// Returns the intersection of `self` and `other` pub fn intersect(self, other: TinySet) -> TinySet { TinySet(self.0 & other.0) @@ -146,8 +152,7 @@ impl BitSet { /// Write a `BitSet` /// pub fn serialize(&self, writer: &mut dyn Write) -> io::Result<()> { - //writer.write_all(self.len.to_le_bytes().as_ref())?; - //writer.write_all(self.max_value.to_le_bytes().as_ref())?; + writer.write_all(self.max_value.to_le_bytes().as_ref())?; for tinyset in self.tinysets.iter() { tinyset.serialize(writer)?; @@ -158,12 +163,9 @@ impl BitSet { /// Deserialize a `BitSet`. 
BitSet is considered immutable after deserialization. /// - pub fn deserialize(data: &[u8]) -> io::Result { - //let len: u64 = u64::from_le_bytes(data[..8].try_into().unwrap()); - //data = &data[8..]; - - //let max_value: u32 = u32::from_le_bytes(data[..4].try_into().unwrap()); - //data = &data[4..]; + pub fn deserialize(mut data: &[u8]) -> io::Result { + let max_value: u32 = u32::from_le_bytes(data[..4].try_into().unwrap()); + data = &data[4..]; let mut tinysets = vec![]; for chunk in data.chunks_exact(8) { @@ -173,21 +175,35 @@ impl BitSet { Ok(BitSet { tinysets: tinysets.into_boxed_slice(), len: 0, - max_value: 0, + max_value, }) } - /// Iterate over the positions of the set elements + /// Iterate the tinyset on the fly from serialized data. + /// #[inline] - pub fn iter_positions_from_bytes<'a>(data: &'a [u8]) -> impl Iterator + 'a { - data.chunks_exact(8) + pub fn iter_from_bytes<'a>(data: &'a [u8]) -> impl Iterator + 'a { + data[4..].chunks_exact(8).map(move |chunk| { + let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap(); + tinyset + }) + } + + /// Iterate over the positions of the unset elements. + /// + /// max_val is read from the 4-byte header, since the last 64-bit block may extend past it + #[inline] + pub fn iter_unset_from_bytes<'a>(data: &'a [u8]) -> impl Iterator + 'a { + let max_val: u32 = u32::from_le_bytes(data[..4].try_into().unwrap()); + Self::iter_from_bytes(data) + .map(|tinyset| tinyset.complement()) .enumerate() - .filter(|(_, tinyset)| !tinyset.is_empty()) - .flat_map(|(chunk_num, chunk)| { - let tinyset = TinySet::deserialize(chunk.try_into().unwrap()).unwrap(); + .flat_map(move |(chunk_num, tinyset)| { + let chunk_base_val = chunk_num as u32 * 64; tinyset .into_iter() - .map(move |val| val + chunk_num as u32 * 64) + .map(move |val| val + chunk_base_val) + .take_while(move |doc| *doc < max_val) }) } @@ -227,6 +243,15 @@ impl BitSet { }; } + /// Returns true iff the element is in the `BitSet`. 
+ #[inline] + pub fn contains_from_bytes(el: u32, data: &[u8]) -> bool { + let byte_offset = 4 + el / 8u32; + let b: u8 = data[byte_offset as usize]; + let shift = (el & 7u32) as u8; + b & (1u8 << shift) != 0 + } + /// Returns true iff the elements is in the `BitSet`. pub fn contains(&self, el: u32) -> bool { self.tinyset(el / 64u32).contains(el % 64) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index c80471663..5504f8c60 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -289,7 +289,7 @@ impl SegmentReader { /// Returns an iterator that will iterate over the alive document ids pub fn doc_ids_alive(&self) -> Box + '_> { if let Some(delete_bitset) = &self.delete_bitset_opt { - Box::new(delete_bitset.iter_positions()) + Box::new(delete_bitset.iter_unset()) } else { Box::new(0u32..self.max_doc) } diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index bc1b286a1..22af3a68b 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -3,8 +3,6 @@ use crate::directory::OwnedBytes; use crate::space_usage::ByteCount; use crate::DocId; use common::BitSet; -use common::TinySet; -use std::convert::TryInto; use std::io; use std::io::Write; @@ -13,29 +11,8 @@ use std::io::Write; /// where `delete_bitset` is the set of deleted `DocId`. /// Warning: this function does not call terminate. The caller is in charge of /// closing the writer properly. 
-pub fn write_delete_bitset( - delete_bitset: &BitSet, - max_doc: u32, - writer: &mut dyn Write, -) -> io::Result<()> { +pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut dyn Write) -> io::Result<()> { delete_bitset.serialize(writer)?; - //let mut byte = 0u8; - //let mut shift = 0u8; - //for doc in 0..max_doc { - //if delete_bitset.contains(doc) { - //byte |= 1 << shift; - //} - //if shift == 7 { - //writer.write_all(&[byte])?; - //shift = 0; - //byte = 0; - //} else { - //shift += 1; - //} - //} - //if max_doc % 8 > 0 { - //writer.write_all(&[byte])?; - //} Ok(()) } @@ -59,7 +36,7 @@ impl DeleteBitSet { let directory = RamDirectory::create(); let path = Path::new("dummydeletebitset"); let mut wrt = directory.open_write(path).unwrap(); - write_delete_bitset(&bitset, max_doc, &mut wrt).unwrap(); + write_delete_bitset(&bitset, &mut wrt).unwrap(); wrt.terminate().unwrap(); let file = directory.open_read(path).unwrap(); Self::open(file).unwrap() @@ -68,12 +45,8 @@ impl DeleteBitSet { /// Opens a delete bitset given its file. pub fn open(file: FileSlice) -> crate::Result { let bytes = file.read_bytes()?; - let num_deleted = bytes - .chunks_exact(8) - .map(|chunk| { - let tinyset = TinySet::deserialize(chunk.try_into().unwrap()).unwrap(); - tinyset.len() as usize - }) + let num_deleted = BitSet::iter_from_bytes(bytes.as_slice()) + .map(|tinyset| tinyset.len() as usize) .sum(); Ok(DeleteBitSet { @@ -91,17 +64,15 @@ impl DeleteBitSet { /// Returns true iff the document has been marked as deleted. 
#[inline] pub fn is_deleted(&self, doc: DocId) -> bool { - let byte_offset = doc / 8u32; - let b: u8 = self.data.as_slice()[byte_offset as usize]; - let shift = (doc & 7u32) as u8; - b & (1u8 << shift) != 0 + let data = self.data.as_slice(); + BitSet::contains_from_bytes(doc, data) } /// Iterate over the positions of the set elements #[inline] - pub fn iter_positions(&self) -> impl Iterator + '_ { + pub fn iter_unset(&self) -> impl Iterator + '_ { let data = self.data.as_slice(); - BitSet::iter_positions_from_bytes(data) + BitSet::iter_unset_from_bytes(data) } /// The number of deleted docs @@ -152,29 +123,82 @@ mod tests { fn test_delete_bitset_iter_minimal() { let delete_bitset = DeleteBitSet::for_test(&[7], 8); - let data: Vec<_> = delete_bitset.iter_positions().collect(); - assert_eq!(data, vec![7]); + let data: Vec<_> = delete_bitset.iter_unset().collect(); + assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]); } #[test] fn test_delete_bitset_iter_small() { let delete_bitset = DeleteBitSet::for_test(&[0, 2, 3, 6], 7); - let data: Vec<_> = delete_bitset.iter_positions().collect(); - assert_eq!(data, vec![0, 2, 3, 6]); + let data: Vec<_> = delete_bitset.iter_unset().collect(); + assert_eq!(data, vec![1, 4, 5]); } #[test] fn test_delete_bitset_iter() { - let delete_bitset = DeleteBitSet::for_test(&[1, 2, 3, 5, 10, 64, 65, 66, 100], 110); + let delete_bitset = DeleteBitSet::for_test(&[0, 1, 1000], 1001); - let data: Vec<_> = delete_bitset.iter_positions().collect(); - assert_eq!(data, vec![1, 2, 3, 5, 10, 64, 65, 66, 100]); - } - #[test] - fn test_delete_bitset_iter_empty_blocks() { - let delete_bitset = DeleteBitSet::for_test(&[1, 2, 3, 5, 10, 64, 65, 66, 100, 1000], 1010); - - let data: Vec<_> = delete_bitset.iter_positions().collect(); - assert_eq!(data, vec![1, 2, 3, 5, 10, 64, 65, 66, 100, 1000]); + let data: Vec<_> = delete_bitset.iter_unset().collect(); + assert_eq!(data, (2..=999).collect::>()); + } +} + +#[cfg(all(test, feature = "unstable"))] +mod bench { + + 
use super::DeleteBitSet; + use common::BitSet; + use rand::prelude::IteratorRandom; + use rand::prelude::SliceRandom; + use rand::thread_rng; + use test::Bencher; + + fn get_many_deleted() -> Vec { + let mut data = (0..1_000_000_u32).collect::>(); + for _ in 0..(1_000_000) * 7 / 8 { + remove_rand(&mut data); + } + data + } + + fn remove_rand(raw: &mut Vec) { + let i = (0..raw.len()).choose(&mut thread_rng()).unwrap(); + raw.remove(i); + } + + #[bench] + fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) { + let delete_bitset = DeleteBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000); + + bench.iter(|| delete_bitset.iter_unset().collect::>()); + } + + #[bench] + fn bench_deletebitset_access(bench: &mut Bencher) { + let delete_bitset = DeleteBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000); + + bench.iter(|| { + (0..1_000_000_u32) + .filter(|doc| delete_bitset.is_alive(*doc)) + .collect::>() + }); + } + + #[bench] + fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) { + let delete_bitset = DeleteBitSet::for_test(&get_many_deleted(), 1_000_000); + + bench.iter(|| delete_bitset.iter_unset().collect::>()); + } + + #[bench] + fn bench_deletebitset_access_1_8_alive(bench: &mut Bencher) { + let delete_bitset = DeleteBitSet::for_test(&get_many_deleted(), 1_000_000); + + bench.iter(|| { + (0..1_000_000_u32) + .filter(|doc| delete_bitset.is_alive(*doc)) + .collect::>() + }); } } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 30dd7f4f1..c42b87080 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -180,7 +180,7 @@ pub(crate) fn advance_deletes( // There are new deletes. We need to write a new delete file. 
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp); let mut delete_file = segment.open_write(SegmentComponent::Delete)?; - write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?; + write_delete_bitset(&delete_bitset, &mut delete_file)?; delete_file.terminate()?; }