diff --git a/common/Cargo.toml b/common/Cargo.toml index 94b40a459..1a6703c1e 100644 --- a/common/Cargo.toml +++ b/common/Cargo.toml @@ -10,6 +10,7 @@ description = "common traits and utility functions used by multiple tantivy subc [dependencies] byteorder = "1.4.3" +ownedbytes = { version="0.1", path="../ownedbytes" } [dev-dependencies] proptest = "1.0.0" diff --git a/common/src/bitset.rs b/common/src/bitset.rs index 84c936a61..a05f786ef 100644 --- a/common/src/bitset.rs +++ b/common/src/bitset.rs @@ -1,3 +1,4 @@ +use ownedbytes::OwnedBytes; use std::convert::TryInto; use std::io::Write; use std::u64; @@ -64,34 +65,28 @@ impl TinySet { } #[inline] - /// Returns true iff the `TinySet` bit is set at position `pos`. - pub fn contains(self, pos: u32) -> bool { - !self.intersect(TinySet::singleton(pos)).is_empty() + /// Returns true iff the `TinySet` contains the element `el`. + pub fn contains(self, el: u32) -> bool { + !self.intersect(TinySet::singleton(el)).is_empty() } #[inline] - /// Returns the number of set bits in the TinySet. - pub fn num_set(self) -> u32 { + /// Returns the number of elements in the TinySet. + pub fn len(self) -> u32 { self.0.count_ones() } - #[inline] - /// Returns the number of unset bits in the TinySet. - pub fn num_unset(self) -> u32 { - self.0.count_zeros() - } - #[inline] /// Returns the intersection of `self` and `other` pub fn intersect(self, other: TinySet) -> TinySet { TinySet(self.0 & other.0) } - /// Creates a new `TinySet` with only one bit set at `pos`. + /// Creates a new `TinySet` containing only one element /// within `[0; 64[` #[inline] - pub fn singleton(pos: u32) -> TinySet { - TinySet(1u64 << u64::from(pos)) + pub fn singleton(el: u32) -> TinySet { + TinySet(1u64 << u64::from(el)) } /// Insert a new element within [0..64) @@ -108,7 +103,7 @@ impl TinySet { /// Insert a new element within [0..64) /// - /// returns true if the bit changed + /// returns true if the set changed #[inline] pub fn insert_mut(&mut self, el: u32) -> bool { let old = *self; @@ -116,9 +111,9 @@ impl TinySet { old != *self } - /// Remove a new element within [0..64) + /// Remove a element within [0..64) /// - /// returns true if the bit changed + /// returns true if the set changed #[inline] pub fn remove_mut(&mut self, el: u32) -> bool { let old = *self; @@ -203,7 +198,7 @@ impl BitSet { let mut tinysets = vec![]; for chunk in data.chunks_exact(8) { let tinyset = TinySet::deserialize(chunk.try_into().unwrap())?; - len += tinyset.num_set() as u64; + len += tinyset.len() as u64; tinysets.push(tinyset); } Ok(BitSet { @@ -213,43 +208,6 @@ impl BitSet { }) } - /// Count the number of unset bits from serialized data. - /// - #[inline] - pub fn count_unset_from_bytes<'a>(data: &'a [u8]) -> usize { - BitSet::iter_tinysets_from_bytes(data) - .map(|tinyset| tinyset.num_unset() as usize) - .sum() - } - - /// Iterate the tinyset on the fly from serialized data. - /// - #[inline] - fn iter_tinysets_from_bytes<'a>(data: &'a [u8]) -> impl Iterator + 'a { - assert!((data.len() - 4) % 8 == 0); - data[4..].chunks_exact(8).map(move |chunk| { - let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap(); - tinyset - }) - } - - /// Iterate over the positions of the unset elements. - /// - /// max_val needs to be provided, since the last 64 bits may - #[inline] - pub fn iter_unset_from_bytes<'a>(data: &'a [u8]) -> impl Iterator + 'a { - let max_val: u32 = u32::from_le_bytes(data[..4].try_into().unwrap()); - Self::iter_tinysets_from_bytes(data) - .enumerate() - .flat_map(move |(chunk_num, tinyset)| { - let chunk_base_val = chunk_num as u32 * 64; - tinyset - .into_iter() - .map(move |val| val + chunk_base_val) - .take_while(move |doc| *doc < max_val) - }) - } - /// Create a new `BitSet` that may contain elements /// within `[0, max_val)`. pub fn with_max_value(max_value: u32) -> BitSet { @@ -262,9 +220,9 @@ impl BitSet { } } - /// Create a new `BitSet` that may contain elements + /// Create a new `BitSet` that may contain elements. Initially all values will be set. /// within `[0, max_val)`. - pub fn with_max_value_and_filled(max_value: u32) -> BitSet { + pub fn with_max_value_and_full(max_value: u32) -> BitSet { let num_buckets = num_buckets(max_value); let tinybisets = vec![TinySet::full(); num_buckets as usize].into_boxed_slice(); BitSet { @@ -282,11 +240,12 @@ impl BitSet { } /// Returns the number of elements in the `BitSet`. - pub fn num_set_bits(&self) -> usize { + pub fn len(&self) -> usize { self.len as usize } /// Inserts an element in the `BitSet` + #[inline] pub fn insert(&mut self, el: u32) { // we do not check saturated els. let higher = el / 64u32; @@ -299,6 +258,7 @@ impl BitSet { } /// Inserts an element in the `BitSet` + #[inline] pub fn remove(&mut self, el: u32) { // we do not check saturated els. let higher = el / 64u32; @@ -312,14 +272,6 @@ impl BitSet { /// Returns true iff the elements is in the `BitSet`. #[inline] - pub fn contains_from_bytes(el: u32, data: &[u8]) -> bool { - let byte_offset = 4 + el / 8u32; - let b: u8 = data[byte_offset as usize]; - let shift = (el % 8) as u8; - b & (1u8 << shift) != 0 - } - - /// Returns true iff the elements is in the `BitSet`. pub fn contains(&self, el: u32) -> bool { self.tinyset(el / 64u32).contains(el % 64) } @@ -349,17 +301,133 @@ impl BitSet { } } +/// Lazy Read a serialized BitSet. +#[derive(Clone)] +pub struct ReadSerializedBitSet { + data: OwnedBytes, + max_value: u32, +} + +impl ReadSerializedBitSet { + pub fn new(data: OwnedBytes) -> Self { + let (max_value_data, data) = data.split(4); + let max_value: u32 = u32::from_le_bytes(max_value_data.as_ref().try_into().unwrap()); + ReadSerializedBitSet { data, max_value } + } + + /// Count the number of unset bits from serialized data. + /// + #[inline] + pub fn count_unset(&self) -> usize { + let lower = self.max_value % 64u32; + + let num_set: usize = self + .iter_tinysets() + .map(|(tinyset, is_last)| { + if is_last { + tinyset.intersect(TinySet::range_lower(lower)).len() as usize + } else { + tinyset.len() as usize + } + }) + .sum(); + self.max_value as usize - num_set + } + + /// Iterate the tinyset on the fly from serialized data. + /// + /// Iterator returns (TinySet, is_last) element, so the consumer can ignore up to max_doc in the + /// last block. + /// + #[inline] + fn iter_tinysets<'a>(&'a self) -> impl Iterator + 'a { + assert!((self.data.len()) % 8 == 0); + self.data + .chunks_exact(8) + .enumerate() + .map(move |(chunk_num, chunk)| { + let is_last = (chunk_num + 1) * 8 == self.data.len(); + + let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap(); + (tinyset, is_last) + }) + } + + /// Iterate over the positions of the unset elements. + /// + #[inline] + pub fn iter_unset<'a>(&'a self) -> impl Iterator + 'a { + self.iter_tinysets() + .enumerate() + .flat_map(move |(chunk_num, (tinyset, _))| { + let chunk_base_val = chunk_num as u32 * 64; + tinyset + .into_iter() + .map(move |val| val + chunk_base_val) + .take_while(move |doc| *doc < self.max_value) + }) + } + + /// Returns true iff the elements is in the `BitSet`. + #[inline] + pub fn contains(&self, el: u32) -> bool { + let byte_offset = el / 8u32; + let b: u8 = self.data[byte_offset as usize]; + let shift = (el % 8) as u8; + b & (1u8 << shift) != 0 + } + + /// Returns the max_value. + #[inline] + pub fn max_value(&self) -> u32 { + self.max_value + } +} + #[cfg(test)] mod tests { use super::BitSet; + use super::ReadSerializedBitSet; use super::TinySet; + use ownedbytes::OwnedBytes; use rand::distributions::Bernoulli; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use std::collections::HashSet; use std::convert::TryInto; + #[test] + fn test_read_serialized_bitset_full() { + let mut bitset = BitSet::with_max_value_and_full(5); + bitset.remove(3); + let mut out = vec![]; + bitset.serialize(&mut out).unwrap(); + + let bitset = ReadSerializedBitSet::new(OwnedBytes::new(out)); + assert_eq!(bitset.count_unset(), 1); + } + + #[test] + fn test_read_serialized_bitset_empty() { + let mut bitset = BitSet::with_max_value(5); + bitset.insert(3); + let mut out = vec![]; + bitset.serialize(&mut out).unwrap(); + + let bitset = ReadSerializedBitSet::new(OwnedBytes::new(out)); + assert_eq!(bitset.count_unset(), 4); + + { + let bitset = BitSet::with_max_value(5); + let mut out = vec![]; + bitset.serialize(&mut out).unwrap(); + + let bitset = ReadSerializedBitSet::new(OwnedBytes::new(out)); + assert_eq!(bitset.count_unset(), 5); + } + } + #[test] fn test_tiny_set_remove() { { @@ -452,7 +520,7 @@ mod tests { assert_eq!(hashset.contains(&el), bitset.contains(el)); } assert_eq!(bitset.max_value(), max_value); - assert_eq!(bitset.num_set_bits(), els.len()); + assert_eq!(bitset.len(), els.len()); }; test_against_hashset(&[], 0); @@ -506,25 +574,25 @@ mod tests { #[test] fn test_bitset_len() { let mut bitset = BitSet::with_max_value(1_000); - assert_eq!(bitset.num_set_bits(), 0); + assert_eq!(bitset.len(), 0); bitset.insert(3u32); - assert_eq!(bitset.num_set_bits(), 1); + assert_eq!(bitset.len(), 1); bitset.insert(103u32); - assert_eq!(bitset.num_set_bits(), 2); + assert_eq!(bitset.len(), 2); bitset.insert(3u32); - assert_eq!(bitset.num_set_bits(), 2); + assert_eq!(bitset.len(), 2); bitset.insert(103u32); - assert_eq!(bitset.num_set_bits(), 2); + assert_eq!(bitset.len(), 2); bitset.insert(104u32); - assert_eq!(bitset.num_set_bits(), 3); + assert_eq!(bitset.len(), 3); bitset.remove(105u32); - assert_eq!(bitset.num_set_bits(), 3); + assert_eq!(bitset.len(), 3); bitset.remove(104u32); - assert_eq!(bitset.num_set_bits(), 2); + assert_eq!(bitset.len(), 2); bitset.remove(3u32); - assert_eq!(bitset.num_set_bits(), 1); + assert_eq!(bitset.len(), 1); bitset.remove(103u32); - assert_eq!(bitset.num_set_bits(), 0); + assert_eq!(bitset.len(), 0); } pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec { diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 8ec303b6e..b81155646 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -73,7 +73,7 @@ impl SegmentReader { /// deleted in the segment. pub fn num_deleted_docs(&self) -> DocId { self.alive_bitset() - .map(|delete_set| delete_set.num_deleted() as DocId) + .map(|alive_set| alive_set.num_deleted() as DocId) .unwrap_or(0u32) } @@ -289,7 +289,7 @@ impl SegmentReader { /// Returns an iterator that will iterate over the alive document ids pub fn doc_ids_alive(&self) -> Box + '_> { if let Some(alive_bitset) = &self.alive_bitset_opt { - Box::new(alive_bitset.iter_unset()) + Box::new(alive_bitset.iter_alive()) } else { Box::new(0u32..self.max_doc) } diff --git a/src/docset.rs b/src/docset.rs index 0df231e23..e5430b207 100644 --- a/src/docset.rs +++ b/src/docset.rs @@ -89,7 +89,7 @@ pub trait DocSet: Send { let mut count = 0u32; let mut doc = self.doc(); while doc != TERMINATED { - if !alive_bitset.is_deleted(doc) { + if alive_bitset.is_alive(doc) { count += 1u32; } doc = self.advance(); diff --git a/src/fastfield/alive_bitset.rs b/src/fastfield/alive_bitset.rs index 3f8136c23..108eb24eb 100644 --- a/src/fastfield/alive_bitset.rs +++ b/src/fastfield/alive_bitset.rs @@ -3,6 +3,7 @@ use crate::directory::OwnedBytes; use crate::space_usage::ByteCount; use crate::DocId; use common::BitSet; +use common::ReadSerializedBitSet; use std::io; use std::io::Write; @@ -21,16 +22,17 @@ pub fn write_alive_bitset(alive_bitset: &BitSet, writer: &mut T) -> io pub struct AliveBitSet { data: OwnedBytes, num_deleted: usize, + bitset: ReadSerializedBitSet, } impl AliveBitSet { #[cfg(test)] - pub(crate) fn for_test(not_alive_docs: &[DocId], max_doc: u32) -> AliveBitSet { + pub(crate) fn for_test(deleted_docs: &[DocId], max_doc: u32) -> AliveBitSet { use crate::directory::{Directory, RamDirectory, TerminatingWrite}; use std::path::Path; - assert!(not_alive_docs.iter().all(|&doc| doc < max_doc)); - let mut bitset = BitSet::with_max_value_and_filled(max_doc); - for &doc in not_alive_docs { + assert!(deleted_docs.iter().all(|&doc| doc < max_doc)); + let mut bitset = BitSet::with_max_value_and_full(max_doc); + for &doc in deleted_docs { bitset.remove(doc); } let directory = RamDirectory::create(); @@ -45,32 +47,38 @@ impl AliveBitSet { /// Opens a delete bitset given its file. pub fn open(file: FileSlice) -> crate::Result { let bytes = file.read_bytes()?; - let num_deleted = BitSet::count_unset_from_bytes(bytes.as_slice()); + let bitset = ReadSerializedBitSet::new(bytes.clone()); + let num_deleted = bitset.count_unset(); Ok(AliveBitSet { data: bytes, num_deleted, + bitset, }) } /// Returns true iff the document is still "alive". In other words, if it has not been deleted. #[inline] pub fn is_alive(&self, doc: DocId) -> bool { - !self.is_deleted(doc) + self.bitset.contains(doc) } /// Returns true iff the document has been marked as deleted. #[inline] pub fn is_deleted(&self, doc: DocId) -> bool { - let data = self.data.as_slice(); - !BitSet::contains_from_bytes(doc, data) + !self.is_alive(doc) } - /// Iterate over the positions of the set elements + /// Iterate over the alive docids. #[inline] - pub fn iter_unset(&self) -> impl Iterator + '_ { - let data = self.data.as_slice(); - BitSet::iter_unset_from_bytes(data) + pub fn iter_alive(&self) -> impl Iterator + '_ { + self.bitset.iter_unset() + } + + /// Get underlying bitset + #[inline] + pub fn bitset(&self) -> &ReadSerializedBitSet { + &self.bitset } /// The number of deleted docs @@ -121,7 +129,7 @@ mod tests { fn test_alive_bitset_iter_minimal() { let alive_bitset = AliveBitSet::for_test(&[7], 8); - let data: Vec<_> = alive_bitset.iter_unset().collect(); + let data: Vec<_> = alive_bitset.iter_alive().collect(); assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]); } @@ -129,14 +137,14 @@ mod tests { fn test_alive_bitset_iter_small() { let alive_bitset = AliveBitSet::for_test(&[0, 2, 3, 6], 7); - let data: Vec<_> = alive_bitset.iter_unset().collect(); + let data: Vec<_> = alive_bitset.iter_alive().collect(); assert_eq!(data, vec![1, 4, 5]); } #[test] fn test_alive_bitset_iter() { let alive_bitset = AliveBitSet::for_test(&[0, 1, 1000], 1001); - let data: Vec<_> = alive_bitset.iter_unset().collect(); + let data: Vec<_> = alive_bitset.iter_alive().collect(); assert_eq!(data, (2..=999).collect::>()); } } @@ -166,7 +174,7 @@ mod bench { fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) { let alive_bitset = AliveBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000); - bench.iter(|| alive_bitset.iter_unset().collect::>()); + bench.iter(|| alive_bitset.iter_alive().collect::>()); } #[bench] @@ -184,7 +192,7 @@ mod bench { fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) { let alive_bitset = AliveBitSet::for_test(&get_alive(), 1_000_000); - bench.iter(|| alive_bitset.iter_unset().collect::>()); + bench.iter(|| alive_bitset.iter_alive().collect::>()); } #[bench] diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index c37d41b0a..6250db86a 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -151,7 +151,7 @@ pub(crate) fn advance_deletes( let max_doc = segment_reader.max_doc(); let mut alive_bitset: BitSet = match segment_entry.alive_bitset() { Some(previous_alive_bitset) => (*previous_alive_bitset).clone(), - None => BitSet::with_max_value_and_filled(max_doc), + None => BitSet::with_max_value_and_full(max_doc), }; let num_deleted_docs_before = segment.meta().num_deleted_docs(); @@ -175,7 +175,7 @@ pub(crate) fn advance_deletes( } } - let num_alive_docs: u32 = alive_bitset.num_set_bits() as u32; + let num_alive_docs: u32 = alive_bitset.len() as u32; let num_deleted_docs = max_doc - num_alive_docs; if num_deleted_docs > num_deleted_docs_before { // There are new deletes. We need to write a new delete file. @@ -259,7 +259,7 @@ fn apply_deletes( let doc_to_opstamps = DocToOpstampMapping::WithMap(doc_opstamps); let max_doc = segment.meta().max_doc(); - let mut deleted_bitset = BitSet::with_max_value_and_filled(max_doc); + let mut deleted_bitset = BitSet::with_max_value_and_full(max_doc); let may_have_deletes = compute_deleted_bitset( &mut deleted_bitset, &segment_reader, diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 84151c8b1..8932c6790 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -99,22 +99,21 @@ fn compute_min_max_val( segment_reader: &SegmentReader, ) -> Option<(u64, u64)> { if segment_reader.max_doc() == 0 { - None - } else { - if segment_reader.alive_bitset().is_some() { - // some deleted documents, - // we need to recompute the max / min - minmax( - segment_reader - .doc_ids_alive() - .map(|doc_id| u64_reader.get(doc_id)), - ) - } else { - // no deleted documents, - // we can use the previous min_val, max_val. - Some((u64_reader.min_value(), u64_reader.max_value())) - } + return None; } + + if segment_reader.alive_bitset().is_none() { + // no deleted documents, + // we can use the previous min_val, max_val. + return Some((u64_reader.min_value(), u64_reader.max_value())); + } + // some deleted documents, + // we need to recompute the max / min + minmax( + segment_reader + .doc_ids_alive() + .map(|doc_id| u64_reader.get(doc_id)), + ) } struct TermOrdinalMapping { diff --git a/src/query/bitset/mod.rs b/src/query/bitset/mod.rs index ebd6e7b36..030fdeae7 100644 --- a/src/query/bitset/mod.rs +++ b/src/query/bitset/mod.rs @@ -90,7 +90,7 @@ impl DocSet for BitSetDocSet { /// but we don't have access to any better /// value. fn size_hint(&self) -> u32 { - self.docs.num_set_bits() as u32 + self.docs.len() as u32 } } @@ -124,7 +124,7 @@ mod tests { for i in 0..100_000 { assert_eq!(btreeset.contains(&i), bitset.contains(i)); } - assert_eq!(btreeset.len(), bitset.num_set_bits()); + assert_eq!(btreeset.len(), bitset.len()); let mut bitset_docset = BitSetDocSet::from(bitset); let mut remaining = true; for el in btreeset.into_iter() { diff --git a/src/query/union.rs b/src/query/union.rs index da6da15c0..cf7b4d956 100644 --- a/src/query/union.rs +++ b/src/query/union.rs @@ -219,18 +219,14 @@ where } let mut count = self.bitsets[self.cursor..HORIZON_NUM_TINYBITSETS] .iter() - .map(|bitset| bitset.num_set()) + .map(|bitset| bitset.len()) .sum::() + 1; for bitset in self.bitsets.iter_mut() { bitset.clear(); } while self.refill() { - count += self - .bitsets - .iter() - .map(|bitset| bitset.num_set()) - .sum::(); + count += self.bitsets.iter().map(|bitset| bitset.len()).sum::(); for bitset in self.bitsets.iter_mut() { bitset.clear(); }