diff --git a/common/src/bitset.rs b/common/src/bitset.rs index 6ff97e4a3..2de5ab5e8 100644 --- a/common/src/bitset.rs +++ b/common/src/bitset.rs @@ -309,7 +309,7 @@ impl BitSet { } } -/// Lazy Read a serialized BitSet. +/// Serialized BitSet. #[derive(Clone)] pub struct ReadSerializedBitSet { data: OwnedBytes, @@ -323,15 +323,12 @@ impl ReadSerializedBitSet { ReadSerializedBitSet { data, max_value } } - /// Count the number of unset bits from serialized data. - /// + /// Number of elements in the bitset. #[inline] - pub fn count_unset(&self) -> usize { - let num_set: usize = self - .iter_tinysets() + pub fn len(&self) -> usize { + self.iter_tinysets() .map(|tinyset| tinyset.len() as usize) - .sum(); - self.max_value as usize - num_set + .sum() } /// Iterate the tinyset on the fly from serialized data. @@ -369,7 +366,11 @@ impl ReadSerializedBitSet { b & (1u8 << shift) != 0 } - /// Returns the max_value. + /// Maximum value the bitset may contain. + /// (Note this is not the maximum value contained in the set.) + /// + /// A bitset has an intrinsic capacity. + /// It only stores elements within [0..max_value). 
#[inline] pub fn max_value(&self) -> u32 { self.max_value @@ -397,7 +398,7 @@ mod tests { bitset.serialize(&mut out).unwrap(); let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out)); - assert_eq!(bitset.count_unset(), 1); + assert_eq!(bitset.len(), 4); } #[test] @@ -408,15 +409,14 @@ mod tests { bitset.serialize(&mut out).unwrap(); let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out)); - assert_eq!(bitset.count_unset(), 4); + assert_eq!(bitset.len(), 1); { let bitset = BitSet::with_max_value(5); let mut out = vec![]; bitset.serialize(&mut out).unwrap(); - let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out)); - assert_eq!(bitset.count_unset(), 5); + assert_eq!(bitset.len(), 0); } } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index b81155646..0180868ea 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -72,14 +72,12 @@ impl SegmentReader { /// Return the number of documents that have been /// deleted in the segment. pub fn num_deleted_docs(&self) -> DocId { - self.alive_bitset() - .map(|alive_set| alive_set.num_deleted() as DocId) - .unwrap_or(0u32) + self.max_doc - self.num_docs } /// Returns true iff some of the documents of the segment have been deleted. pub fn has_deletes(&self) -> bool { - self.alive_bitset().is_some() + self.num_deleted_docs() > 0 } /// Accessor to a segment's fast field reader given a field. 
@@ -171,8 +169,8 @@ impl SegmentReader { let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?; let alive_bitset_opt = if segment.meta().has_deletes() { - let delete_data = segment.open_read(SegmentComponent::Delete)?; - let alive_bitset = AliveBitSet::open(delete_data)?; + let alive_bitset_bytes = segment.open_read(SegmentComponent::Delete)?.read_bytes()?; + let alive_bitset = AliveBitSet::open(alive_bitset_bytes); Some(alive_bitset) } else { None diff --git a/src/fastfield/alive_bitset.rs b/src/fastfield/alive_bitset.rs index 0ab4513a2..8cde895f7 100644 --- a/src/fastfield/alive_bitset.rs +++ b/src/fastfield/alive_bitset.rs @@ -1,9 +1,8 @@ -use crate::directory::FileSlice; -use crate::directory::OwnedBytes; use crate::space_usage::ByteCount; use crate::DocId; use common::BitSet; use common::ReadSerializedBitSet; +use ownedbytes::OwnedBytes; use std::io; use std::io::Write; @@ -20,41 +19,34 @@ pub fn write_alive_bitset(alive_bitset: &BitSet, writer: &mut T) -> io /// Set of alive `DocId`s. 
#[derive(Clone)] pub struct AliveBitSet { - data: OwnedBytes, - num_deleted: usize, + num_alive_docs: usize, bitset: ReadSerializedBitSet, + num_bytes: ByteCount, } impl AliveBitSet { #[cfg(test)] - pub(crate) fn for_test(deleted_docs: &[DocId], max_doc: u32) -> AliveBitSet { - use crate::directory::{Directory, RamDirectory, TerminatingWrite}; - use std::path::Path; + pub(crate) fn for_test_from_deleted_docs(deleted_docs: &[DocId], max_doc: u32) -> AliveBitSet { assert!(deleted_docs.iter().all(|&doc| doc < max_doc)); let mut bitset = BitSet::with_max_value_and_full(max_doc); for &doc in deleted_docs { bitset.remove(doc); } - let directory = RamDirectory::create(); - let path = Path::new("dummydeletebitset"); - let mut wrt = directory.open_write(path).unwrap(); - write_alive_bitset(&bitset, &mut wrt).unwrap(); - wrt.terminate().unwrap(); - let file = directory.open_read(path).unwrap(); - Self::open(file).unwrap() + let mut alive_bitset_buffer = Vec::new(); + write_alive_bitset(&bitset, &mut alive_bitset_buffer).unwrap(); + let alive_bitset_bytes = OwnedBytes::new(alive_bitset_buffer); + Self::open(alive_bitset_bytes) } - /// Opens a delete bitset given its file. - pub fn open(file: FileSlice) -> crate::Result { - let bytes = file.read_bytes()?; - let bitset = ReadSerializedBitSet::open(bytes.clone()); - let num_deleted = bitset.count_unset(); - - Ok(AliveBitSet { - data: bytes, - num_deleted, + /// Opens an alive bitset from its serialized bytes. + pub fn open(bytes: OwnedBytes) -> AliveBitSet { + let num_bytes = bytes.len(); + let bitset = ReadSerializedBitSet::open(bytes); + AliveBitSet { + num_alive_docs: bitset.len(), bitset, - }) + num_bytes, + } } /// Returns true iff the document is still "alive". In other words, if it has not been deleted. @@ -82,12 +74,13 @@ } - /// The number of deleted docs - pub fn num_deleted(&self) -> usize { - self.num_deleted + /// The number of alive (non-deleted) docs. + pub fn num_alive_docs(&self) -> usize { + self.num_alive_docs } + /// Summarize total space usage of this bitset.
pub fn space_usage(&self) -> ByteCount { - self.data.len() + self.num_bytes } } @@ -98,16 +91,17 @@ mod tests { #[test] fn test_alive_bitset_empty() { - let alive_bitset = AliveBitSet::for_test(&[], 10); + let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[], 10); for doc in 0..10 { assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc)); + assert!(!alive_bitset.is_deleted(doc)); } - assert_eq!(alive_bitset.num_deleted(), 0); + assert_eq!(alive_bitset.num_alive_docs(), 10); } #[test] fn test_alive_bitset() { - let alive_bitset = AliveBitSet::for_test(&[1, 9], 10); + let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[1, 9], 10); assert!(alive_bitset.is_alive(0)); assert!(alive_bitset.is_deleted(1)); assert!(alive_bitset.is_alive(2)); @@ -122,12 +116,12 @@ mod tests { for doc in 0..10 { assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc)); } - assert_eq!(alive_bitset.num_deleted(), 2); + assert_eq!(alive_bitset.num_alive_docs(), 8); } #[test] fn test_alive_bitset_iter_minimal() { - let alive_bitset = AliveBitSet::for_test(&[7], 8); + let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[7], 8); let data: Vec<_> = alive_bitset.iter_alive().collect(); assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]); @@ -135,14 +129,14 @@ mod tests { #[test] fn test_alive_bitset_iter_small() { - let alive_bitset = AliveBitSet::for_test(&[0, 2, 3, 6], 7); + let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 2, 3, 6], 7); let data: Vec<_> = alive_bitset.iter_alive().collect(); assert_eq!(data, vec![1, 4, 5]); } #[test] fn test_alive_bitset_iter() { - let alive_bitset = AliveBitSet::for_test(&[0, 1, 1000], 1001); + let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000], 1001); let data: Vec<_> = alive_bitset.iter_alive().collect(); assert_eq!(data, (2..=999).collect::>()); @@ -172,14 +166,14 @@ mod bench { #[bench] fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) { - let alive_bitset = 
AliveBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000); + let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000); bench.iter(|| alive_bitset.iter_alive().collect::>()); } #[bench] fn bench_deletebitset_access(bench: &mut Bencher) { - let alive_bitset = AliveBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000); + let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000); bench.iter(|| { (0..1_000_000_u32) @@ -190,14 +184,14 @@ mod bench { #[bench] fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) { - let alive_bitset = AliveBitSet::for_test(&get_alive(), 1_000_000); + let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000); bench.iter(|| alive_bitset.iter_alive().collect::>()); } #[bench] fn bench_deletebitset_access_1_8_alive(bench: &mut Bencher) { - let alive_bitset = AliveBitSet::for_test(&get_alive(), 1_000_000); + let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000); bench.iter(|| { (0..1_000_000_u32) diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 515433a9d..cef8380e6 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -854,7 +854,7 @@ mod tests { let reader = index.reader().unwrap(); let searcher = reader.searcher(); assert_eq!(searcher.segment_readers().len(), 1); - assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 0); + assert_eq!(searcher.segment_reader(0u32).num_docs(), 2); index_writer.delete_term(Term::from_field_text(text_field, "hello1")); assert!(index_writer.commit().is_ok()); @@ -862,7 +862,7 @@ mod tests { assert!(reader.reload().is_ok()); let searcher = reader.searcher(); assert_eq!(searcher.segment_readers().len(), 1); - assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 1); + assert_eq!(searcher.segment_reader(0u32).num_docs(), 1); let previous_delete_opstamp = index.load_metas().unwrap().segments[0].delete_opstamp(); @@ 
-874,7 +874,7 @@ mod tests { assert!(reader.reload().is_ok()); let searcher = reader.searcher(); assert_eq!(searcher.segment_readers().len(), 1); - assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 1); + assert_eq!(searcher.segment_reader(0u32).num_docs(), 1); let after_delete_opstamp = index.load_metas().unwrap().segments[0].delete_opstamp(); assert_eq!(after_delete_opstamp, previous_delete_opstamp); @@ -1644,7 +1644,7 @@ mod tests { let segment_reader = searcher.segment_reader(0); assert_eq!(segment_reader.max_doc(), 2); - assert_eq!(segment_reader.num_deleted_docs(), 1); + assert_eq!(segment_reader.num_docs(), 1); Ok(()) } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index fd9af3607..66bd30de7 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -479,7 +479,7 @@ impl IndexMerger { let mut num_docs = 0; for (reader, u64s_reader) in reader_and_field_accessors.iter() { if let Some(alive_bitset) = reader.alive_bitset() { - num_docs += reader.max_doc() as u64 - alive_bitset.num_deleted() as u64; + num_docs += alive_bitset.num_alive_docs() as u64; for doc in reader.doc_ids_alive() { let num_vals = u64s_reader.get_len(doc) as u64; total_num_vals += num_vals; @@ -1012,7 +1012,7 @@ impl IndexMerger { } else { for reader in &self.readers { let store_reader = reader.get_store_reader()?; - if reader.num_deleted_docs() > 0 + if reader.has_deletes() // If there is not enough data in the store, we avoid stacking in order to // avoid creating many small blocks in the doc store. Once we have 5 full blocks, // we start stacking. In the worst case 2/7 of the blocks would be very small. 
diff --git a/src/indexer/merger_sorted_index_test.rs b/src/indexer/merger_sorted_index_test.rs index f7af1c697..7443d1e38 100644 --- a/src/indexer/merger_sorted_index_test.rs +++ b/src/indexer/merger_sorted_index_test.rs @@ -257,7 +257,7 @@ mod tests { .unwrap(); assert_eq!(postings.doc_freq(), 2); - let fallback_bitset = AliveBitSet::for_test(&[0], 100); + let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100); assert_eq!( postings.doc_freq_given_deletes( segment_reader.alive_bitset().unwrap_or(&fallback_bitset) @@ -336,7 +336,7 @@ mod tests { .unwrap() .unwrap(); assert_eq!(postings.doc_freq(), 2); - let fallback_bitset = AliveBitSet::for_test(&[0], 100); + let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100); assert_eq!( postings.doc_freq_given_deletes( segment_reader.alive_bitset().unwrap_or(&fallback_bitset) @@ -446,7 +446,7 @@ mod tests { .unwrap(); assert_eq!(postings.doc_freq(), 2); - let fallback_bitset = AliveBitSet::for_test(&[0], 100); + let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100); assert_eq!( postings.doc_freq_given_deletes( segment_reader.alive_bitset().unwrap_or(&fallback_bitset) diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index f5e383c37..d42c46786 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -296,9 +296,10 @@ mod tests { fn test_doc_freq() { let docs = SegmentPostings::create_from_docs(&[0, 2, 10]); assert_eq!(docs.doc_freq(), 3); - let alive_bitset = AliveBitSet::for_test(&[2], 12); + let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[2], 12); assert_eq!(docs.doc_freq_given_deletes(&alive_bitset), 2); - let all_deleted = AliveBitSet::for_test(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12); + let all_deleted = + AliveBitSet::for_test_from_deleted_docs(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12); assert_eq!(docs.doc_freq_given_deletes(&all_deleted), 0); } } diff --git a/src/store/mod.rs 
b/src/store/mod.rs index 364ed5a92..c6a1dda7d 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -113,7 +113,8 @@ pub mod tests { fn test_doc_store_iter_with_delete_bug_1077() -> crate::Result<()> { // this will cover deletion of the first element in a checkpoint let deleted_docids = (200..300).collect::>(); - let alive_bitset = AliveBitSet::for_test(&deleted_docids, NUM_DOCS as u32); + let alive_bitset = + AliveBitSet::for_test_from_deleted_docs(&deleted_docids, NUM_DOCS as u32); let path = Path::new("store"); let directory = RamDirectory::create();