use std::io; use std::io::Write; use common::{intersect_bitsets, BitSet, ReadOnlyBitSet}; use ownedbytes::OwnedBytes; use crate::space_usage::ByteCount; use crate::DocId; /// Write a alive `BitSet` /// /// where `alive_bitset` is the set of alive `DocId`. /// Warning: this function does not call terminate. The caller is in charge of /// closing the writer properly. pub fn write_alive_bitset(alive_bitset: &BitSet, writer: &mut T) -> io::Result<()> { alive_bitset.serialize(writer)?; Ok(()) } /// Set of alive `DocId`s. #[derive(Clone)] pub struct AliveBitSet { num_alive_docs: usize, bitset: ReadOnlyBitSet, } /// Intersects two AliveBitSets in a new one. /// The two bitsets need to have the same max_value. pub fn intersect_alive_bitsets(left: AliveBitSet, right: AliveBitSet) -> AliveBitSet { assert_eq!(left.bitset().max_value(), right.bitset().max_value()); let bitset = intersect_bitsets(left.bitset(), right.bitset()); let num_alive_docs = bitset.len(); AliveBitSet { num_alive_docs, bitset, } } impl AliveBitSet { #[cfg(test)] pub(crate) fn for_test_from_deleted_docs(deleted_docs: &[DocId], max_doc: u32) -> AliveBitSet { assert!(deleted_docs.iter().all(|&doc| doc < max_doc)); let mut bitset = BitSet::with_max_value_and_full(max_doc); for &doc in deleted_docs { bitset.remove(doc); } let mut alive_bitset_buffer = Vec::new(); write_alive_bitset(&bitset, &mut alive_bitset_buffer).unwrap(); let alive_bitset_bytes = OwnedBytes::new(alive_bitset_buffer); Self::open(alive_bitset_bytes) } pub(crate) fn from_bitset(bitset: &BitSet) -> AliveBitSet { let readonly_bitset = ReadOnlyBitSet::from(bitset); AliveBitSet::from(readonly_bitset) } /// Opens a delete bitset given its file. pub fn open(bytes: OwnedBytes) -> AliveBitSet { let bitset = ReadOnlyBitSet::open(bytes); AliveBitSet::from(bitset) } /// Returns true iff the document is still "alive". In other words, if it has not been deleted. #[inline] pub fn is_alive(&self, doc: DocId) -> bool { self.bitset.contains(doc) } /// Returns true iff the document has been marked as deleted. #[inline] pub fn is_deleted(&self, doc: DocId) -> bool { !self.is_alive(doc) } /// Iterate over the alive doc_ids. #[inline] pub fn iter_alive(&self) -> impl Iterator + '_ { self.bitset.iter() } /// Get underlying bitset #[inline] pub fn bitset(&self) -> &ReadOnlyBitSet { &self.bitset } /// The number of deleted docs pub fn num_alive_docs(&self) -> usize { self.num_alive_docs } /// Summarize total space usage of this bitset. pub fn space_usage(&self) -> ByteCount { self.bitset().num_bytes() } } impl From for AliveBitSet { fn from(bitset: ReadOnlyBitSet) -> AliveBitSet { let num_alive_docs = bitset.len(); AliveBitSet { num_alive_docs, bitset, } } } #[cfg(test)] mod tests { use super::AliveBitSet; #[test] fn test_alive_bitset_empty() { let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[], 10); for doc in 0..10 { assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc)); assert!(!alive_bitset.is_deleted(doc)); } assert_eq!(alive_bitset.num_alive_docs(), 10); } #[test] fn test_alive_bitset() { let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[1, 9], 10); assert!(alive_bitset.is_alive(0)); assert!(alive_bitset.is_deleted(1)); assert!(alive_bitset.is_alive(2)); assert!(alive_bitset.is_alive(3)); assert!(alive_bitset.is_alive(4)); assert!(alive_bitset.is_alive(5)); assert!(alive_bitset.is_alive(6)); assert!(alive_bitset.is_alive(6)); assert!(alive_bitset.is_alive(7)); assert!(alive_bitset.is_alive(8)); assert!(alive_bitset.is_deleted(9)); for doc in 0..10 { assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc)); } assert_eq!(alive_bitset.num_alive_docs(), 8); } #[test] fn test_alive_bitset_iter_minimal() { let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[7], 8); let data: Vec<_> = alive_bitset.iter_alive().collect(); assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]); } #[test] fn test_alive_bitset_iter_small() { let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 2, 3, 6], 7); let data: Vec<_> = alive_bitset.iter_alive().collect(); assert_eq!(data, vec![1, 4, 5]); } #[test] fn test_alive_bitset_iter() { let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000], 1001); let data: Vec<_> = alive_bitset.iter_alive().collect(); assert_eq!(data, (2..=999).collect::>()); } } #[cfg(all(test, feature = "unstable"))] mod bench { use rand::prelude::IteratorRandom; use rand::thread_rng; use test::Bencher; use super::AliveBitSet; fn get_alive() -> Vec { let mut data = (0..1_000_000_u32).collect::>(); for _ in 0..(1_000_000) * 1 / 8 { remove_rand(&mut data); } data } fn remove_rand(raw: &mut Vec) { let i = (0..raw.len()).choose(&mut thread_rng()).unwrap(); raw.remove(i); } #[bench] fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) { let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000); bench.iter(|| alive_bitset.iter_alive().collect::>()); } #[bench] fn bench_deletebitset_access(bench: &mut Bencher) { let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000); bench.iter(|| { (0..1_000_000_u32) .filter(|doc| alive_bitset.is_alive(*doc)) .collect::>() }); } #[bench] fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) { let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000); bench.iter(|| alive_bitset.iter_alive().collect::>()); } #[bench] fn bench_deletebitset_access_1_8_alive(bench: &mut Bencher) { let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000); bench.iter(|| { (0..1_000_000_u32) .filter(|doc| alive_bitset.is_alive(*doc)) .collect::>() }); } }