From 4ae1d87632428df06c6401bd303734dfae052270 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Wed, 15 Sep 2021 20:14:28 +0800 Subject: [PATCH 01/13] add DeleteBitSet iterator --- src/core/segment_reader.rs | 15 ++++++++-- src/fastfield/delete.rs | 53 ++++++++++++++++++++++++++++++++++-- src/indexer/merger.rs | 38 ++++++++++++-------------- src/indexer/segment_entry.rs | 2 -- 4 files changed, 81 insertions(+), 27 deletions(-) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 73de5fb4c..45ee859f1 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -287,8 +287,19 @@ impl SegmentReader { } /// Returns an iterator that will iterate over the alive document ids - pub fn doc_ids_alive(&self) -> impl Iterator + '_ { - (0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc)) + pub fn doc_ids_alive(&self) -> Box + '_> { + if let Some(delete_bitset) = &self.delete_bitset_opt { + Box::new( + delete_bitset + .iter() + .take(self.max_doc() as usize) + .enumerate() + .filter(|(_docid, is_deleted)| !is_deleted) + .map(|(docid, _is_deleted)| docid as DocId), + ) + } else { + Box::new(0u32..self.max_doc) + } } /// Summarize total space usage of this segment. diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index 421761d63..f695f9968 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -1,6 +1,5 @@ use crate::directory::FileSlice; use crate::directory::OwnedBytes; -use crate::directory::WritePtr; use crate::space_usage::ByteCount; use crate::DocId; use common::BitSet; @@ -16,7 +15,7 @@ use std::io::Write; pub fn write_delete_bitset( delete_bitset: &BitSet, max_doc: u32, - writer: &mut WritePtr, + writer: &mut dyn Write, ) -> io::Result<()> { let mut byte = 0u8; let mut shift = 0u8; @@ -79,6 +78,7 @@ impl DeleteBitSet { } /// Returns true iff the document is still "alive". In other words, if it has not been deleted. 
+ #[inline] pub fn is_alive(&self, doc: DocId) -> bool { !self.is_deleted(doc) } @@ -92,6 +92,22 @@ impl DeleteBitSet { b & (1u8 << shift) != 0 } + /// Returns true iff the document has been marked as deleted. + #[inline] + pub fn iter(&self) -> impl Iterator + '_ { + let data = self.data.as_slice(); + data.iter().flat_map(|el| { + (0..8).map(move |pos| { + let val = el >> pos; + if (val & 1) == 1 { + true + } else { + false + } + }) + }) + } + /// The number of deleted docs pub fn num_deleted(&self) -> usize { self.num_deleted @@ -110,6 +126,7 @@ impl HasLen for DeleteBitSet { #[cfg(test)] mod tests { + use super::DeleteBitSet; use common::HasLen; @@ -141,4 +158,36 @@ mod tests { } assert_eq!(delete_bitset.len(), 2); } + + #[test] + fn test_delete_bitset_iter_small() { + let delete_bitset = DeleteBitSet::for_test(&[0, 2, 3, 6], 7); + + let data: Vec<_> = delete_bitset.iter().collect(); + assert!(data[0]); + assert!(!data[1]); + assert!(data[2]); + assert!(data[3]); + assert!(!data[4]); + assert!(!data[5]); + assert!(data[6]); + } + #[test] + fn test_delete_bitset_iter() { + let delete_bitset = DeleteBitSet::for_test(&[1, 2, 3, 5, 10], 11); + + let data: Vec<_> = delete_bitset.iter().collect(); + assert!(!data[0]); + assert!(data[1]); + assert!(data[2]); + assert!(data[3]); + assert!(!data[4]); + assert!(data[5]); + assert!(!data[6]); + assert!(!data[7]); + assert!(!data[8]); + assert!(!data[9]); + assert!(data[10]); + assert!(!data[11]); + } } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index ef3874256..d603f2900 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -98,27 +98,25 @@ pub struct IndexMerger { fn compute_min_max_val( u64_reader: &impl FastFieldReader, + segment_reader: &SegmentReader, max_doc: DocId, delete_bitset_opt: Option<&DeleteBitSet>, ) -> Option<(u64, u64)> { if max_doc == 0 { None } else { - match delete_bitset_opt { - Some(delete_bitset) => { - // some deleted documents, - // we need to recompute the max / min - 
minmax( - (0..max_doc) - .filter(|doc_id| delete_bitset.is_alive(*doc_id)) - .map(|doc_id| u64_reader.get(doc_id)), - ) - } - None => { - // no deleted documents, - // we can use the previous min_val, max_val. - Some((u64_reader.min_value(), u64_reader.max_value())) - } + if delete_bitset_opt.is_some() { + // some deleted documents, + // we need to recompute the max / min + minmax( + segment_reader + .doc_ids_alive() + .map(|doc_id| u64_reader.get(doc_id)), + ) + } else { + // no deleted documents, + // we can use the previous min_val, max_val. + Some((u64_reader.min_value(), u64_reader.max_value())) } } } @@ -326,7 +324,7 @@ impl IndexMerger { .fast_fields() .typed_fast_field_reader(field) .expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen."); - compute_min_max_val(&u64_reader, reader.max_doc(), reader.delete_bitset()) + compute_min_max_val(&u64_reader, reader, reader.max_doc(), reader.delete_bitset()) }) .flatten() .reduce(|a, b| { @@ -505,11 +503,9 @@ impl IndexMerger { for (reader, u64s_reader) in reader_and_field_accessors.iter() { if let Some(delete_bitset) = reader.delete_bitset() { num_docs += reader.max_doc() as u64 - delete_bitset.len() as u64; - for doc in 0u32..reader.max_doc() { - if delete_bitset.is_alive(doc) { - let num_vals = u64s_reader.get_len(doc) as u64; - total_num_vals += num_vals; - } + for doc in reader.doc_ids_alive() { + let num_vals = u64s_reader.get_len(doc) as u64; + total_num_vals += num_vals; } } else { num_docs += reader.max_doc() as u64; diff --git a/src/indexer/segment_entry.rs b/src/indexer/segment_entry.rs index 4ac352e50..e0beb2179 100644 --- a/src/indexer/segment_entry.rs +++ b/src/indexer/segment_entry.rs @@ -9,8 +9,6 @@ use std::fmt; /// /// In addition to segment `meta`, /// it contains a few transient states -/// - `state` expresses whether the segment is already in the -/// middle of a merge /// - `delete_bitset` is a bitset describing /// documents that were deleted 
during the commit /// itself. From 4da71273e1408ba844f8ce6868ec7ab366086afb Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 17 Sep 2021 10:28:12 +0800 Subject: [PATCH 02/13] add de/serialization for bitset remove len footgun --- common/src/bitset.rs | 61 +++++++++++++++++++++++++++++++++++++++-- src/fastfield/delete.rs | 12 ++------ src/indexer/merger.rs | 3 +- 3 files changed, 61 insertions(+), 15 deletions(-) diff --git a/common/src/bitset.rs b/common/src/bitset.rs index 942a94269..c5f741829 100644 --- a/common/src/bitset.rs +++ b/common/src/bitset.rs @@ -1,5 +1,7 @@ -use std::fmt; +use std::convert::TryInto; +use std::io::Write; use std::u64; +use std::{fmt, io}; #[derive(Clone, Copy, Eq, PartialEq)] pub struct TinySet(u64); @@ -28,6 +30,15 @@ impl IntoIterator for TinySet { } impl TinySet { + pub fn serialize(&self, writer: &mut dyn Write) -> io::Result<()> { + writer.write_all(self.0.to_le_bytes().as_ref()) + } + + pub fn deserialize(data: &[u8]) -> io::Result { + let val: u64 = u64::from_le_bytes(data[..8].try_into().unwrap()); + Ok(TinySet(val)) + } + /// Returns an empty `TinySet`. 
pub fn empty() -> TinySet { TinySet(0u64) @@ -123,7 +134,7 @@ impl TinySet { #[derive(Clone)] pub struct BitSet { tinysets: Box<[TinySet]>, - len: usize, + len: u64, max_value: u32, } @@ -132,6 +143,41 @@ fn num_buckets(max_val: u32) -> u32 { } impl BitSet { + /// Write a `BitSet` + /// + pub fn serialize(&mut self, writer: &mut dyn Write) -> io::Result<()> { + writer.write_all(self.len.to_le_bytes().as_ref())?; + writer.write_all(self.max_value.to_le_bytes().as_ref())?; + + for tinyset in self.tinysets.iter() { + tinyset.serialize(writer)?; + } + writer.flush()?; + Ok(()) + } + + /// UnWrite a `BitSet` + /// + pub fn deserialize(&mut self, mut data: &[u8]) -> io::Result { + let len: u64 = u64::from_le_bytes(data[..8].try_into().unwrap()); + data = &data[8..]; + + let max_value: u32 = u32::from_le_bytes(data[..4].try_into().unwrap()); + data = &data[4..]; + + let mut tinysets = vec![]; + while !data.is_empty() { + let tinyset = TinySet::deserialize(data)?; + tinysets.push(tinyset); + data = &data[8..]; + } + Ok(BitSet { + tinysets: tinysets.into_boxed_slice(), + len, + max_value, + }) + } + /// Create a new `BitSet` that may contain elements /// within `[0, max_val[`. pub fn with_max_value(max_value: u32) -> BitSet { @@ -153,7 +199,7 @@ impl BitSet { /// Returns the number of elements in the `BitSet`. 
pub fn len(&self) -> usize { - self.len + self.len as usize } /// Inserts an element in the `BitSet` @@ -249,6 +295,15 @@ mod tests { assert_eq!(hashset.contains(&el), bitset.contains(el)); } assert_eq!(bitset.max_value(), max_value); + + // test deser + let mut data = vec![]; + bitset.serialize(&mut data).unwrap(); + let bitset = bitset.deserialize(&data).unwrap(); + for el in 0..max_value { + assert_eq!(hashset.contains(&el), bitset.contains(el)); + } + assert_eq!(bitset.max_value(), max_value); }; test_against_hashset(&[], 0); diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index f695f9968..eff577d2b 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -3,7 +3,6 @@ use crate::directory::OwnedBytes; use crate::space_usage::ByteCount; use crate::DocId; use common::BitSet; -use common::HasLen; use std::io; use std::io::Write; @@ -118,17 +117,10 @@ impl DeleteBitSet { } } -impl HasLen for DeleteBitSet { - fn len(&self) -> usize { - self.num_deleted - } -} - #[cfg(test)] mod tests { use super::DeleteBitSet; - use common::HasLen; #[test] fn test_delete_bitset_empty() { @@ -136,7 +128,7 @@ mod tests { for doc in 0..10 { assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc)); } - assert_eq!(delete_bitset.len(), 0); + assert_eq!(delete_bitset.num_deleted(), 0); } #[test] @@ -156,7 +148,7 @@ mod tests { for doc in 0..10 { assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc)); } - assert_eq!(delete_bitset.len(), 2); + assert_eq!(delete_bitset.num_deleted(), 2); } #[test] diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index d603f2900..06d02859a 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -29,7 +29,6 @@ use crate::{ SegmentOrdinal, }; use crate::{DocId, InvertedIndexReader, SegmentComponent}; -use common::HasLen; use itertools::Itertools; use measure_time::debug_time; use std::cmp; @@ -502,7 +501,7 @@ impl IndexMerger { let mut num_docs = 0; for (reader, u64s_reader) in 
reader_and_field_accessors.iter() { if let Some(delete_bitset) = reader.delete_bitset() { - num_docs += reader.max_doc() as u64 - delete_bitset.len() as u64; + num_docs += reader.max_doc() as u64 - delete_bitset.num_deleted() as u64; for doc in reader.doc_ids_alive() { let num_vals = u64s_reader.get_len(doc) as u64; total_num_vals += num_vals; From c22177a0056b8578a056c73e8c6c67eea3a4f22b Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 17 Sep 2021 15:29:27 +0800 Subject: [PATCH 03/13] add iterator --- common/src/bitset.rs | 64 +++++++++++++++------- src/core/segment_reader.rs | 9 +--- src/fastfield/delete.rs | 105 ++++++++++++++++++------------------- 3 files changed, 97 insertions(+), 81 deletions(-) diff --git a/common/src/bitset.rs b/common/src/bitset.rs index c5f741829..f8bbb0fd5 100644 --- a/common/src/bitset.rs +++ b/common/src/bitset.rs @@ -34,8 +34,8 @@ impl TinySet { writer.write_all(self.0.to_le_bytes().as_ref()) } - pub fn deserialize(data: &[u8]) -> io::Result { - let val: u64 = u64::from_le_bytes(data[..8].try_into().unwrap()); + pub fn deserialize(data: [u8; 8]) -> io::Result { + let val: u64 = u64::from_le_bytes(data); Ok(TinySet(val)) } @@ -145,9 +145,9 @@ fn num_buckets(max_val: u32) -> u32 { impl BitSet { /// Write a `BitSet` /// - pub fn serialize(&mut self, writer: &mut dyn Write) -> io::Result<()> { - writer.write_all(self.len.to_le_bytes().as_ref())?; - writer.write_all(self.max_value.to_le_bytes().as_ref())?; + pub fn serialize(&self, writer: &mut dyn Write) -> io::Result<()> { + //writer.write_all(self.len.to_le_bytes().as_ref())?; + //writer.write_all(self.max_value.to_le_bytes().as_ref())?; for tinyset in self.tinysets.iter() { tinyset.serialize(writer)?; @@ -156,28 +156,41 @@ impl BitSet { Ok(()) } - /// UnWrite a `BitSet` + /// Deserialize a `BitSet`. BitSet is considered immutable after deserialization. 
/// - pub fn deserialize(&mut self, mut data: &[u8]) -> io::Result { - let len: u64 = u64::from_le_bytes(data[..8].try_into().unwrap()); - data = &data[8..]; + pub fn deserialize(data: &[u8]) -> io::Result { + //let len: u64 = u64::from_le_bytes(data[..8].try_into().unwrap()); + //data = &data[8..]; - let max_value: u32 = u32::from_le_bytes(data[..4].try_into().unwrap()); - data = &data[4..]; + //let max_value: u32 = u32::from_le_bytes(data[..4].try_into().unwrap()); + //data = &data[4..]; let mut tinysets = vec![]; - while !data.is_empty() { - let tinyset = TinySet::deserialize(data)?; + for chunk in data.chunks_exact(8) { + let tinyset = TinySet::deserialize(chunk.try_into().unwrap())?; tinysets.push(tinyset); - data = &data[8..]; } Ok(BitSet { tinysets: tinysets.into_boxed_slice(), - len, - max_value, + len: 0, + max_value: 0, }) } + /// Iterate over the positions of the set elements + #[inline] + pub fn iter_positions_from_bytes<'a>(data: &'a [u8]) -> impl Iterator + 'a { + data.chunks_exact(8) + .enumerate() + .filter(|(_, tinyset)| !tinyset.is_empty()) + .flat_map(|(chunk_num, chunk)| { + let tinyset = TinySet::deserialize(chunk.try_into().unwrap()).unwrap(); + tinyset + .into_iter() + .map(move |val| val + chunk_num as u32 * 64) + }) + } + /// Create a new `BitSet` that may contain elements /// within `[0, max_val[`. 
pub fn with_max_value(max_value: u32) -> BitSet { @@ -253,6 +266,7 @@ mod tests { use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use std::collections::HashSet; + use std::convert::TryInto; #[test] fn test_tiny_set() { @@ -279,6 +293,21 @@ mod tests { assert_eq!(u.pop_lowest(), Some(63u32)); assert!(u.pop_lowest().is_none()); } + { + let mut u = TinySet::empty().insert(63u32).insert(5); + assert_eq!(u.pop_lowest(), Some(5u32)); + assert_eq!(u.pop_lowest(), Some(63u32)); + assert!(u.pop_lowest().is_none()); + } + { + let u = TinySet::empty().insert(63u32).insert(5); + let mut data = vec![]; + u.serialize(&mut data).unwrap(); + let mut u = TinySet::deserialize(data[..8].try_into().unwrap()).unwrap(); + assert_eq!(u.pop_lowest(), Some(5u32)); + assert_eq!(u.pop_lowest(), Some(63u32)); + assert!(u.pop_lowest().is_none()); + } } #[test] @@ -299,11 +328,10 @@ mod tests { // test deser let mut data = vec![]; bitset.serialize(&mut data).unwrap(); - let bitset = bitset.deserialize(&data).unwrap(); + let bitset = BitSet::deserialize(&data).unwrap(); for el in 0..max_value { assert_eq!(hashset.contains(&el), bitset.contains(el)); } - assert_eq!(bitset.max_value(), max_value); }; test_against_hashset(&[], 0); diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 45ee859f1..c80471663 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -289,14 +289,7 @@ impl SegmentReader { /// Returns an iterator that will iterate over the alive document ids pub fn doc_ids_alive(&self) -> Box + '_> { if let Some(delete_bitset) = &self.delete_bitset_opt { - Box::new( - delete_bitset - .iter() - .take(self.max_doc() as usize) - .enumerate() - .filter(|(_docid, is_deleted)| !is_deleted) - .map(|(docid, _is_deleted)| docid as DocId), - ) + Box::new(delete_bitset.iter_positions()) } else { Box::new(0u32..self.max_doc) } diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index eff577d2b..bc1b286a1 100644 --- a/src/fastfield/delete.rs +++ 
b/src/fastfield/delete.rs @@ -3,6 +3,8 @@ use crate::directory::OwnedBytes; use crate::space_usage::ByteCount; use crate::DocId; use common::BitSet; +use common::TinySet; +use std::convert::TryInto; use std::io; use std::io::Write; @@ -16,23 +18,24 @@ pub fn write_delete_bitset( max_doc: u32, writer: &mut dyn Write, ) -> io::Result<()> { - let mut byte = 0u8; - let mut shift = 0u8; - for doc in 0..max_doc { - if delete_bitset.contains(doc) { - byte |= 1 << shift; - } - if shift == 7 { - writer.write_all(&[byte])?; - shift = 0; - byte = 0; - } else { - shift += 1; - } - } - if max_doc % 8 > 0 { - writer.write_all(&[byte])?; - } + delete_bitset.serialize(writer)?; + //let mut byte = 0u8; + //let mut shift = 0u8; + //for doc in 0..max_doc { + //if delete_bitset.contains(doc) { + //byte |= 1 << shift; + //} + //if shift == 7 { + //writer.write_all(&[byte])?; + //shift = 0; + //byte = 0; + //} else { + //shift += 1; + //} + //} + //if max_doc % 8 > 0 { + //writer.write_all(&[byte])?; + //} Ok(()) } @@ -65,11 +68,14 @@ impl DeleteBitSet { /// Opens a delete bitset given its file. pub fn open(file: FileSlice) -> crate::Result { let bytes = file.read_bytes()?; - let num_deleted: usize = bytes - .as_slice() - .iter() - .map(|b| b.count_ones() as usize) + let num_deleted = bytes + .chunks_exact(8) + .map(|chunk| { + let tinyset = TinySet::deserialize(chunk.try_into().unwrap()).unwrap(); + tinyset.len() as usize + }) .sum(); + Ok(DeleteBitSet { data: bytes, num_deleted, @@ -91,20 +97,11 @@ impl DeleteBitSet { b & (1u8 << shift) != 0 } - /// Returns true iff the document has been marked as deleted. 
+ /// Iterate over the positions of the set elements #[inline] - pub fn iter(&self) -> impl Iterator + '_ { + pub fn iter_positions(&self) -> impl Iterator + '_ { let data = self.data.as_slice(); - data.iter().flat_map(|el| { - (0..8).map(move |pos| { - let val = el >> pos; - if (val & 1) == 1 { - true - } else { - false - } - }) - }) + BitSet::iter_positions_from_bytes(data) } /// The number of deleted docs @@ -151,35 +148,33 @@ mod tests { assert_eq!(delete_bitset.num_deleted(), 2); } + #[test] + fn test_delete_bitset_iter_minimal() { + let delete_bitset = DeleteBitSet::for_test(&[7], 8); + + let data: Vec<_> = delete_bitset.iter_positions().collect(); + assert_eq!(data, vec![7]); + } + #[test] fn test_delete_bitset_iter_small() { let delete_bitset = DeleteBitSet::for_test(&[0, 2, 3, 6], 7); - let data: Vec<_> = delete_bitset.iter().collect(); - assert!(data[0]); - assert!(!data[1]); - assert!(data[2]); - assert!(data[3]); - assert!(!data[4]); - assert!(!data[5]); - assert!(data[6]); + let data: Vec<_> = delete_bitset.iter_positions().collect(); + assert_eq!(data, vec![0, 2, 3, 6]); } #[test] fn test_delete_bitset_iter() { - let delete_bitset = DeleteBitSet::for_test(&[1, 2, 3, 5, 10], 11); + let delete_bitset = DeleteBitSet::for_test(&[1, 2, 3, 5, 10, 64, 65, 66, 100], 110); - let data: Vec<_> = delete_bitset.iter().collect(); - assert!(!data[0]); - assert!(data[1]); - assert!(data[2]); - assert!(data[3]); - assert!(!data[4]); - assert!(data[5]); - assert!(!data[6]); - assert!(!data[7]); - assert!(!data[8]); - assert!(!data[9]); - assert!(data[10]); - assert!(!data[11]); + let data: Vec<_> = delete_bitset.iter_positions().collect(); + assert_eq!(data, vec![1, 2, 3, 5, 10, 64, 65, 66, 100]); + } + #[test] + fn test_delete_bitset_iter_empty_blocks() { + let delete_bitset = DeleteBitSet::for_test(&[1, 2, 3, 5, 10, 64, 65, 66, 100, 1000], 1010); + + let data: Vec<_> = delete_bitset.iter_positions().collect(); + assert_eq!(data, vec![1, 2, 3, 5, 10, 64, 65, 66, 100, 
1000]); } } From 93cbd52bf06002d626b11a2ea3ad1d6e55723298 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Sat, 18 Sep 2021 17:35:22 +0800 Subject: [PATCH 04/13] move code to biset, add inline, add benchmark --- common/src/bitset.rs | 57 +++++++++++----- src/core/segment_reader.rs | 2 +- src/fastfield/delete.rs | 126 +++++++++++++++++++++--------------- src/indexer/index_writer.rs | 2 +- 4 files changed, 118 insertions(+), 69 deletions(-) diff --git a/common/src/bitset.rs b/common/src/bitset.rs index f8bbb0fd5..df4dda632 100644 --- a/common/src/bitset.rs +++ b/common/src/bitset.rs @@ -16,6 +16,7 @@ pub struct TinySetIterator(TinySet); impl Iterator for TinySetIterator { type Item = u32; + #[inline] fn next(&mut self) -> Option { self.0.pop_lowest() } @@ -34,6 +35,7 @@ impl TinySet { writer.write_all(self.0.to_le_bytes().as_ref()) } + #[inline] pub fn deserialize(data: [u8; 8]) -> io::Result { let val: u64 = u64::from_le_bytes(data); Ok(TinySet(val)) @@ -48,21 +50,25 @@ impl TinySet { self.0 = 0u64; } + #[inline] /// Returns the complement of the set in `[0, 64[`. fn complement(self) -> TinySet { TinySet(!self.0) } + #[inline] /// Returns true iff the `TinySet` contains the element `el`. pub fn contains(self, el: u32) -> bool { !self.intersect(TinySet::singleton(el)).is_empty() } + #[inline] /// Returns the number of elements in the TinySet. pub fn len(self) -> u32 { self.0.count_ones() } + #[inline] /// Returns the intersection of `self` and `other` pub fn intersect(self, other: TinySet) -> TinySet { TinySet(self.0 & other.0) @@ -146,8 +152,7 @@ impl BitSet { /// Write a `BitSet` /// pub fn serialize(&self, writer: &mut dyn Write) -> io::Result<()> { - //writer.write_all(self.len.to_le_bytes().as_ref())?; - //writer.write_all(self.max_value.to_le_bytes().as_ref())?; + writer.write_all(self.max_value.to_le_bytes().as_ref())?; for tinyset in self.tinysets.iter() { tinyset.serialize(writer)?; @@ -158,12 +163,9 @@ impl BitSet { /// Deserialize a `BitSet`. 
BitSet is considered immutable after deserialization. /// - pub fn deserialize(data: &[u8]) -> io::Result { - //let len: u64 = u64::from_le_bytes(data[..8].try_into().unwrap()); - //data = &data[8..]; - - //let max_value: u32 = u32::from_le_bytes(data[..4].try_into().unwrap()); - //data = &data[4..]; + pub fn deserialize(mut data: &[u8]) -> io::Result { + let max_value: u32 = u32::from_le_bytes(data[..4].try_into().unwrap()); + data = &data[4..]; let mut tinysets = vec![]; for chunk in data.chunks_exact(8) { @@ -173,21 +175,35 @@ impl BitSet { Ok(BitSet { tinysets: tinysets.into_boxed_slice(), len: 0, - max_value: 0, + max_value, }) } - /// Iterate over the positions of the set elements + /// Iterate the tinyset on the fly from serialized data. + /// #[inline] - pub fn iter_positions_from_bytes<'a>(data: &'a [u8]) -> impl Iterator + 'a { - data.chunks_exact(8) + pub fn iter_from_bytes<'a>(data: &'a [u8]) -> impl Iterator + 'a { + data[4..].chunks_exact(8).map(move |chunk| { + let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap(); + tinyset + }) + } + + /// Iterate over the positions of the unset elements. + /// + /// max_val needs to be provided, since the last 64 bits may + #[inline] + pub fn iter_unset_from_bytes<'a>(data: &'a [u8]) -> impl Iterator + 'a { + let max_val: u32 = u32::from_le_bytes(data[..4].try_into().unwrap()); + Self::iter_from_bytes(data) + .map(|tinyset| tinyset.complement()) .enumerate() - .filter(|(_, tinyset)| !tinyset.is_empty()) - .flat_map(|(chunk_num, chunk)| { - let tinyset = TinySet::deserialize(chunk.try_into().unwrap()).unwrap(); + .flat_map(move |(chunk_num, tinyset)| { + let chunk_base_val = chunk_num as u32 * 64; tinyset .into_iter() - .map(move |val| val + chunk_num as u32 * 64) + .map(move |val| val + chunk_base_val) + .take_while(move |doc| *doc < max_val) }) } @@ -227,6 +243,15 @@ impl BitSet { }; } + /// Returns true iff the elements is in the `BitSet`. 
+ #[inline] + pub fn contains_from_bytes(el: u32, data: &[u8]) -> bool { + let byte_offset = 4 + el / 8u32; + let b: u8 = data[byte_offset as usize]; + let shift = (el & 7u32) as u8; + b & (1u8 << shift) != 0 + } + /// Returns true iff the elements is in the `BitSet`. pub fn contains(&self, el: u32) -> bool { self.tinyset(el / 64u32).contains(el % 64) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index c80471663..5504f8c60 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -289,7 +289,7 @@ impl SegmentReader { /// Returns an iterator that will iterate over the alive document ids pub fn doc_ids_alive(&self) -> Box + '_> { if let Some(delete_bitset) = &self.delete_bitset_opt { - Box::new(delete_bitset.iter_positions()) + Box::new(delete_bitset.iter_unset()) } else { Box::new(0u32..self.max_doc) } diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index bc1b286a1..22af3a68b 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -3,8 +3,6 @@ use crate::directory::OwnedBytes; use crate::space_usage::ByteCount; use crate::DocId; use common::BitSet; -use common::TinySet; -use std::convert::TryInto; use std::io; use std::io::Write; @@ -13,29 +11,8 @@ use std::io::Write; /// where `delete_bitset` is the set of deleted `DocId`. /// Warning: this function does not call terminate. The caller is in charge of /// closing the writer properly. 
-pub fn write_delete_bitset( - delete_bitset: &BitSet, - max_doc: u32, - writer: &mut dyn Write, -) -> io::Result<()> { +pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut dyn Write) -> io::Result<()> { delete_bitset.serialize(writer)?; - //let mut byte = 0u8; - //let mut shift = 0u8; - //for doc in 0..max_doc { - //if delete_bitset.contains(doc) { - //byte |= 1 << shift; - //} - //if shift == 7 { - //writer.write_all(&[byte])?; - //shift = 0; - //byte = 0; - //} else { - //shift += 1; - //} - //} - //if max_doc % 8 > 0 { - //writer.write_all(&[byte])?; - //} Ok(()) } @@ -59,7 +36,7 @@ impl DeleteBitSet { let directory = RamDirectory::create(); let path = Path::new("dummydeletebitset"); let mut wrt = directory.open_write(path).unwrap(); - write_delete_bitset(&bitset, max_doc, &mut wrt).unwrap(); + write_delete_bitset(&bitset, &mut wrt).unwrap(); wrt.terminate().unwrap(); let file = directory.open_read(path).unwrap(); Self::open(file).unwrap() @@ -68,12 +45,8 @@ impl DeleteBitSet { /// Opens a delete bitset given its file. pub fn open(file: FileSlice) -> crate::Result { let bytes = file.read_bytes()?; - let num_deleted = bytes - .chunks_exact(8) - .map(|chunk| { - let tinyset = TinySet::deserialize(chunk.try_into().unwrap()).unwrap(); - tinyset.len() as usize - }) + let num_deleted = BitSet::iter_from_bytes(bytes.as_slice()) + .map(|tinyset| tinyset.len() as usize) .sum(); Ok(DeleteBitSet { @@ -91,17 +64,15 @@ impl DeleteBitSet { /// Returns true iff the document has been marked as deleted. 
#[inline] pub fn is_deleted(&self, doc: DocId) -> bool { - let byte_offset = doc / 8u32; - let b: u8 = self.data.as_slice()[byte_offset as usize]; - let shift = (doc & 7u32) as u8; - b & (1u8 << shift) != 0 + let data = self.data.as_slice(); + BitSet::contains_from_bytes(doc, data) } /// Iterate over the positions of the set elements #[inline] - pub fn iter_positions(&self) -> impl Iterator + '_ { + pub fn iter_unset(&self) -> impl Iterator + '_ { let data = self.data.as_slice(); - BitSet::iter_positions_from_bytes(data) + BitSet::iter_unset_from_bytes(data) } /// The number of deleted docs @@ -152,29 +123,82 @@ mod tests { fn test_delete_bitset_iter_minimal() { let delete_bitset = DeleteBitSet::for_test(&[7], 8); - let data: Vec<_> = delete_bitset.iter_positions().collect(); - assert_eq!(data, vec![7]); + let data: Vec<_> = delete_bitset.iter_unset().collect(); + assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]); } #[test] fn test_delete_bitset_iter_small() { let delete_bitset = DeleteBitSet::for_test(&[0, 2, 3, 6], 7); - let data: Vec<_> = delete_bitset.iter_positions().collect(); - assert_eq!(data, vec![0, 2, 3, 6]); + let data: Vec<_> = delete_bitset.iter_unset().collect(); + assert_eq!(data, vec![1, 4, 5]); } #[test] fn test_delete_bitset_iter() { - let delete_bitset = DeleteBitSet::for_test(&[1, 2, 3, 5, 10, 64, 65, 66, 100], 110); + let delete_bitset = DeleteBitSet::for_test(&[0, 1, 1000], 1001); - let data: Vec<_> = delete_bitset.iter_positions().collect(); - assert_eq!(data, vec![1, 2, 3, 5, 10, 64, 65, 66, 100]); - } - #[test] - fn test_delete_bitset_iter_empty_blocks() { - let delete_bitset = DeleteBitSet::for_test(&[1, 2, 3, 5, 10, 64, 65, 66, 100, 1000], 1010); - - let data: Vec<_> = delete_bitset.iter_positions().collect(); - assert_eq!(data, vec![1, 2, 3, 5, 10, 64, 65, 66, 100, 1000]); + let data: Vec<_> = delete_bitset.iter_unset().collect(); + assert_eq!(data, (2..=999).collect::>()); + } +} + +#[cfg(all(test, feature = "unstable"))] +mod bench { + + 
use super::DeleteBitSet; + use common::BitSet; + use rand::prelude::IteratorRandom; + use rand::prelude::SliceRandom; + use rand::thread_rng; + use test::Bencher; + + fn get_many_deleted() -> Vec { + let mut data = (0..1_000_000_u32).collect::>(); + for _ in 0..(1_000_000) * 7 / 8 { + remove_rand(&mut data); + } + data + } + + fn remove_rand(raw: &mut Vec) { + let i = (0..raw.len()).choose(&mut thread_rng()).unwrap(); + raw.remove(i); + } + + #[bench] + fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) { + let delete_bitset = DeleteBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000); + + bench.iter(|| delete_bitset.iter_unset().collect::>()); + } + + #[bench] + fn bench_deletebitset_access(bench: &mut Bencher) { + let delete_bitset = DeleteBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000); + + bench.iter(|| { + (0..1_000_000_u32) + .filter(|doc| delete_bitset.is_alive(*doc)) + .collect::>() + }); + } + + #[bench] + fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) { + let delete_bitset = DeleteBitSet::for_test(&get_many_deleted(), 1_000_000); + + bench.iter(|| delete_bitset.iter_unset().collect::>()); + } + + #[bench] + fn bench_deletebitset_access_1_8_alive(bench: &mut Bencher) { + let delete_bitset = DeleteBitSet::for_test(&get_many_deleted(), 1_000_000); + + bench.iter(|| { + (0..1_000_000_u32) + .filter(|doc| delete_bitset.is_alive(*doc)) + .collect::>() + }); } } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 30dd7f4f1..c42b87080 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -180,7 +180,7 @@ pub(crate) fn advance_deletes( // There are new deletes. We need to write a new delete file. 
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp); let mut delete_file = segment.open_write(SegmentComponent::Delete)?; - write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?; + write_delete_bitset(&delete_bitset, &mut delete_file)?; delete_file.terminate()?; } From beb3a5bd7325e95d5c7b150cde597b673994fd87 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Sat, 18 Sep 2021 17:58:15 +0800 Subject: [PATCH 05/13] fix len --- common/src/bitset.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/common/src/bitset.rs b/common/src/bitset.rs index df4dda632..6d9ffb109 100644 --- a/common/src/bitset.rs +++ b/common/src/bitset.rs @@ -149,7 +149,7 @@ fn num_buckets(max_val: u32) -> u32 { } impl BitSet { - /// Write a `BitSet` + /// serialize a `BitSet`. /// pub fn serialize(&self, writer: &mut dyn Write) -> io::Result<()> { writer.write_all(self.max_value.to_le_bytes().as_ref())?; @@ -161,20 +161,22 @@ impl BitSet { Ok(()) } - /// Deserialize a `BitSet`. BitSet is considered immutable after deserialization. + /// Deserialize a `BitSet`. 
/// pub fn deserialize(mut data: &[u8]) -> io::Result { let max_value: u32 = u32::from_le_bytes(data[..4].try_into().unwrap()); data = &data[4..]; + let mut len: u64 = 0; let mut tinysets = vec![]; for chunk in data.chunks_exact(8) { let tinyset = TinySet::deserialize(chunk.try_into().unwrap())?; + len += tinyset.len() as u64; tinysets.push(tinyset); } Ok(BitSet { tinysets: tinysets.into_boxed_slice(), - len: 0, + len, max_value, }) } @@ -357,6 +359,8 @@ mod tests { for el in 0..max_value { assert_eq!(hashset.contains(&el), bitset.contains(el)); } + assert_eq!(bitset.max_value(), max_value); + assert_eq!(bitset.len(), els.len()); }; test_against_hashset(&[], 0); From 4583fa270b24e812c37a22e7100a0c6d110f1d0a Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 23 Sep 2021 10:39:53 +0800 Subject: [PATCH 06/13] fixes --- common/src/bitset.rs | 3 ++- src/indexer/merger.rs | 8 +++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/common/src/bitset.rs b/common/src/bitset.rs index 6d9ffb109..6b0d99c9a 100644 --- a/common/src/bitset.rs +++ b/common/src/bitset.rs @@ -185,6 +185,7 @@ impl BitSet { /// #[inline] pub fn iter_from_bytes<'a>(data: &'a [u8]) -> impl Iterator + 'a { + assert!((data.len() - 4) % 8 == 0); data[4..].chunks_exact(8).map(move |chunk| { let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap(); tinyset @@ -250,7 +251,7 @@ impl BitSet { pub fn contains_from_bytes(el: u32, data: &[u8]) -> bool { let byte_offset = 4 + el / 8u32; let b: u8 = data[byte_offset as usize]; - let shift = (el & 7u32) as u8; + let shift = (el % 8) as u8; b & (1u8 << shift) != 0 } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 06d02859a..4d69b0915 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -98,13 +98,11 @@ pub struct IndexMerger { fn compute_min_max_val( u64_reader: &impl FastFieldReader, segment_reader: &SegmentReader, - max_doc: DocId, - delete_bitset_opt: Option<&DeleteBitSet>, ) -> Option<(u64, 
u64)> { - if max_doc == 0 { + if segment_reader.max_doc() == 0 { None } else { - if delete_bitset_opt.is_some() { + if segment_reader.delete_bitset().is_some() { // some deleted documents, // we need to recompute the max / min minmax( @@ -323,7 +321,7 @@ impl IndexMerger { .fast_fields() .typed_fast_field_reader(field) .expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen."); - compute_min_max_val(&u64_reader, reader, reader.max_doc(), reader.delete_bitset()) + compute_min_max_val(&u64_reader, reader) }) .flatten() .reduce(|a, b| { From a1f5cead96f8f3b4321a9b068bbde0ff49275863 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 23 Sep 2021 20:03:57 +0800 Subject: [PATCH 07/13] AliveBitSet instead of DeleteBitSet --- common/src/bitset.rs | 133 ++++++++++++++++--- src/core/segment_reader.rs | 10 +- src/docset.rs | 8 +- src/fastfield/{delete.rs => alive_bitset.rs} | 54 ++++---- src/fastfield/mod.rs | 6 +- src/indexer/index_writer.rs | 11 +- src/indexer/merger.rs | 1 - src/indexer/merger_sorted_index_test.rs | 8 +- src/postings/segment_postings.rs | 10 +- src/query/bitset/mod.rs | 4 +- src/query/boost_query.rs | 4 +- src/store/mod.rs | 4 +- src/store/reader.rs | 6 +- 13 files changed, 177 insertions(+), 82 deletions(-) rename src/fastfield/{delete.rs => alive_bitset.rs} (75%) diff --git a/common/src/bitset.rs b/common/src/bitset.rs index 6b0d99c9a..527abed9b 100644 --- a/common/src/bitset.rs +++ b/common/src/bitset.rs @@ -31,7 +31,7 @@ impl IntoIterator for TinySet { } impl TinySet { - pub fn serialize(&self, writer: &mut dyn Write) -> io::Result<()> { + pub fn serialize(&self, writer: &mut T) -> io::Result<()> { writer.write_all(self.0.to_le_bytes().as_ref()) } @@ -42,17 +42,24 @@ impl TinySet { } /// Returns an empty `TinySet`. + #[inline] pub fn empty() -> TinySet { TinySet(0u64) } + /// Returns a full `TinySet`. 
+    #[inline]
+    pub fn full() -> TinySet {
+        TinySet::empty().complement()
+    }
+
     pub fn clear(&mut self) {
         self.0 = 0u64;
     }
 
     #[inline]
     /// Returns the complement of the set in `[0, 64[`.
-    fn complement(self) -> TinySet {
+    pub fn complement(self) -> TinySet {
         TinySet(!self.0)
     }
 
@@ -68,6 +75,12 @@ impl TinySet {
         self.0.count_ones()
     }
 
+    #[inline]
+    /// Returns the number of unset bits in the TinySet.
+    pub fn num_unset(self) -> u32 {
+        self.0.count_zeros()
+    }
+
     #[inline]
     /// Returns the intersection of `self` and `other`
     pub fn intersect(self, other: TinySet) -> TinySet {
@@ -81,13 +94,21 @@
         TinySet(1u64 << u64::from(el))
     }
 
-    /// Insert a new element within [0..64[
+    /// Insert a new element within [0..64)
     #[inline]
     pub fn insert(self, el: u32) -> TinySet {
         self.union(TinySet::singleton(el))
     }
 
-    /// Insert a new element within [0..64[
+    /// Removes an element within [0..64)
+    #[inline]
+    pub fn remove(self, el: u32) -> TinySet {
+        self.intersect(TinySet::singleton(el).complement())
+    }
+
+    /// Insert a new element within [0..64)
+    ///
+    /// returns true if the bit changed
     #[inline]
     pub fn insert_mut(&mut self, el: u32) -> bool {
         let old = *self;
@@ -95,6 +116,16 @@
         old != *self
     }
 
+    /// Remove an element within [0..64)
+    ///
+    /// returns true if the bit changed
+    #[inline]
+    pub fn remove_mut(&mut self, el: u32) -> bool {
+        let old = *self;
+        *self = old.remove(el);
+        old != *self
+    }
+
     /// Returns the union of two tinysets
     #[inline]
     pub fn union(self, other: TinySet) -> TinySet {
@@ -151,7 +182,7 @@ fn num_buckets(max_val: u32) -> u32 {
 
 impl BitSet {
     /// serialize a `BitSet`.
     ///
-    pub fn serialize(&self, writer: &mut dyn Write) -> io::Result<()> {
+    pub fn serialize(&self, writer: &mut T) -> io::Result<()> {
         writer.write_all(self.max_value.to_le_bytes().as_ref())?;
 
         for tinyset in self.tinysets.iter() {
@@ -163,6 +194,7 @@ impl BitSet {
 
     /// Deserialize a `BitSet`.
/// + #[cfg(test)] pub fn deserialize(mut data: &[u8]) -> io::Result { let max_value: u32 = u32::from_le_bytes(data[..4].try_into().unwrap()); data = &data[4..]; @@ -181,10 +213,19 @@ impl BitSet { }) } + /// Count the number of unset bits from serialized data. + /// + #[inline] + pub fn count_unset_from_bytes<'a>(data: &'a [u8]) -> usize { + BitSet::iter_tinysets_from_bytes(data) + .map(|tinyset| tinyset.num_unset() as usize) + .sum() + } + /// Iterate the tinyset on the fly from serialized data. /// #[inline] - pub fn iter_from_bytes<'a>(data: &'a [u8]) -> impl Iterator + 'a { + fn iter_tinysets_from_bytes<'a>(data: &'a [u8]) -> impl Iterator + 'a { assert!((data.len() - 4) % 8 == 0); data[4..].chunks_exact(8).map(move |chunk| { let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap(); @@ -198,8 +239,7 @@ impl BitSet { #[inline] pub fn iter_unset_from_bytes<'a>(data: &'a [u8]) -> impl Iterator + 'a { let max_val: u32 = u32::from_le_bytes(data[..4].try_into().unwrap()); - Self::iter_from_bytes(data) - .map(|tinyset| tinyset.complement()) + Self::iter_tinysets_from_bytes(data) .enumerate() .flat_map(move |(chunk_num, tinyset)| { let chunk_base_val = chunk_num as u32 * 64; @@ -211,7 +251,7 @@ impl BitSet { } /// Create a new `BitSet` that may contain elements - /// within `[0, max_val[`. + /// within `[0, max_val)`. pub fn with_max_value(max_value: u32) -> BitSet { let num_buckets = num_buckets(max_value); let tinybisets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice(); @@ -222,6 +262,18 @@ impl BitSet { } } + /// Create a new `BitSet` that may contain elements + /// within `[0, max_val)`. + pub fn with_max_value_and_filled(max_value: u32) -> BitSet { + let num_buckets = num_buckets(max_value); + let tinybisets = vec![TinySet::full(); num_buckets as usize].into_boxed_slice(); + BitSet { + tinysets: tinybisets, + len: max_value as u64, + max_value, + } + } + /// Removes all elements from the `BitSet`. 
    pub fn clear(&mut self) {
        for tinyset in self.tinysets.iter_mut() {
@@ -230,7 +282,7 @@ impl BitSet {
     }
 
     /// Returns the number of elements in the `BitSet`.
-    pub fn len(&self) -> usize {
+    pub fn num_set_bits(&self) -> usize {
         self.len as usize
     }
 
@@ -246,6 +298,18 @@
         };
     }
 
+    /// Removes an element from the `BitSet`
+    pub fn remove(&mut self, el: u32) {
+        // we do not check saturated els.
+        let higher = el / 64u32;
+        let lower = el % 64u32;
+        self.len -= if self.tinysets[higher as usize].remove_mut(lower) {
+            1
+        } else {
+            0
+        };
+    }
+
     /// Returns true iff the elements is in the `BitSet`.
     #[inline]
     pub fn contains_from_bytes(el: u32, data: &[u8]) -> bool {
@@ -296,6 +360,33 @@ mod tests {
     use std::collections::HashSet;
     use std::convert::TryInto;
 
+    #[test]
+    fn test_tiny_set_remove() {
+        {
+            let mut u = TinySet::empty().insert(63u32).insert(5).remove(63u32);
+            assert_eq!(u.pop_lowest(), Some(5u32));
+            assert!(u.pop_lowest().is_none());
+        }
+        {
+            let mut u = TinySet::empty()
+                .insert(63u32)
+                .insert(1)
+                .insert(5)
+                .remove(63u32);
+            assert_eq!(u.pop_lowest(), Some(1u32));
+            assert_eq!(u.pop_lowest(), Some(5u32));
+            assert!(u.pop_lowest().is_none());
+        }
+        {
+            let mut u = TinySet::empty().insert(1).remove(63u32);
+            assert_eq!(u.pop_lowest(), Some(1u32));
+            assert!(u.pop_lowest().is_none());
+        }
+        {
+            let mut u = TinySet::empty().insert(1).remove(1u32);
+            assert!(u.pop_lowest().is_none());
+        }
+    }
     #[test]
     fn test_tiny_set() {
         assert!(TinySet::empty().is_empty());
@@ -361,7 +452,7 @@
             assert_eq!(hashset.contains(&el), bitset.contains(el));
         }
         assert_eq!(bitset.max_value(), max_value);
-        assert_eq!(bitset.len(), els.len());
+        assert_eq!(bitset.num_set_bits(), els.len());
     };
 
     test_against_hashset(&[], 0);
@@ -415,17 +506,25 @@
     #[test]
     fn test_bitset_len() {
         let mut bitset = BitSet::with_max_value(1_000);
-        assert_eq!(bitset.len(), 0);
+        assert_eq!(bitset.num_set_bits(), 0);
         bitset.insert(3u32);
-        assert_eq!(bitset.len(), 1);
+
assert_eq!(bitset.num_set_bits(), 1); bitset.insert(103u32); - assert_eq!(bitset.len(), 2); + assert_eq!(bitset.num_set_bits(), 2); bitset.insert(3u32); - assert_eq!(bitset.len(), 2); + assert_eq!(bitset.num_set_bits(), 2); bitset.insert(103u32); - assert_eq!(bitset.len(), 2); + assert_eq!(bitset.num_set_bits(), 2); bitset.insert(104u32); - assert_eq!(bitset.len(), 3); + assert_eq!(bitset.num_set_bits(), 3); + bitset.remove(105u32); + assert_eq!(bitset.num_set_bits(), 3); + bitset.remove(104u32); + assert_eq!(bitset.num_set_bits(), 2); + bitset.remove(3u32); + assert_eq!(bitset.num_set_bits(), 1); + bitset.remove(103u32); + assert_eq!(bitset.num_set_bits(), 0); } pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec { diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 5504f8c60..10b224e33 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -5,7 +5,7 @@ use crate::core::SegmentId; use crate::directory::CompositeFile; use crate::directory::FileSlice; use crate::error::DataCorruption; -use crate::fastfield::DeleteBitSet; +use crate::fastfield::AliveBitSet; use crate::fastfield::FacetReader; use crate::fastfield::FastFieldReaders; use crate::fieldnorm::{FieldNormReader, FieldNormReaders}; @@ -47,7 +47,7 @@ pub struct SegmentReader { fieldnorm_readers: FieldNormReaders, store_file: FileSlice, - delete_bitset_opt: Option, + delete_bitset_opt: Option, schema: Schema, } @@ -172,7 +172,7 @@ impl SegmentReader { let delete_bitset_opt = if segment.meta().has_deletes() { let delete_data = segment.open_read(SegmentComponent::Delete)?; - let delete_bitset = DeleteBitSet::open(delete_data)?; + let delete_bitset = AliveBitSet::open(delete_data)?; Some(delete_bitset) } else { None @@ -274,7 +274,7 @@ impl SegmentReader { /// Returns the bitset representing /// the documents that have been deleted. 
- pub fn delete_bitset(&self) -> Option<&DeleteBitSet> { + pub fn delete_bitset(&self) -> Option<&AliveBitSet> { self.delete_bitset_opt.as_ref() } @@ -307,7 +307,7 @@ impl SegmentReader { self.get_store_reader()?.space_usage(), self.delete_bitset_opt .as_ref() - .map(DeleteBitSet::space_usage) + .map(AliveBitSet::space_usage) .unwrap_or(0), )) } diff --git a/src/docset.rs b/src/docset.rs index 3c5dfdd31..72352e689 100644 --- a/src/docset.rs +++ b/src/docset.rs @@ -1,4 +1,4 @@ -use crate::fastfield::DeleteBitSet; +use crate::fastfield::AliveBitSet; use crate::DocId; use std::borrow::Borrow; use std::borrow::BorrowMut; @@ -85,7 +85,7 @@ pub trait DocSet: Send { /// Returns the number documents matching. /// Calling this method consumes the `DocSet`. - fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 { + fn count(&mut self, delete_bitset: &AliveBitSet) -> u32 { let mut count = 0u32; let mut doc = self.doc(); while doc != TERMINATED { @@ -130,7 +130,7 @@ impl<'a> DocSet for &'a mut dyn DocSet { (**self).size_hint() } - fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 { + fn count(&mut self, delete_bitset: &AliveBitSet) -> u32 { (**self).count(delete_bitset) } @@ -160,7 +160,7 @@ impl DocSet for Box { unboxed.size_hint() } - fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 { + fn count(&mut self, delete_bitset: &AliveBitSet) -> u32 { let unboxed: &mut TDocSet = self.borrow_mut(); unboxed.count(delete_bitset) } diff --git a/src/fastfield/delete.rs b/src/fastfield/alive_bitset.rs similarity index 75% rename from src/fastfield/delete.rs rename to src/fastfield/alive_bitset.rs index 22af3a68b..7cc098004 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/alive_bitset.rs @@ -11,27 +11,27 @@ use std::io::Write; /// where `delete_bitset` is the set of deleted `DocId`. /// Warning: this function does not call terminate. The caller is in charge of /// closing the writer properly. 
-pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut dyn Write) -> io::Result<()> { +pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut T) -> io::Result<()> { delete_bitset.serialize(writer)?; Ok(()) } /// Set of deleted `DocId`s. #[derive(Clone)] -pub struct DeleteBitSet { +pub struct AliveBitSet { data: OwnedBytes, num_deleted: usize, } -impl DeleteBitSet { +impl AliveBitSet { #[cfg(test)] - pub(crate) fn for_test(docs: &[DocId], max_doc: u32) -> DeleteBitSet { + pub(crate) fn for_test(not_alive_docs: &[DocId], max_doc: u32) -> AliveBitSet { use crate::directory::{Directory, RamDirectory, TerminatingWrite}; use std::path::Path; - assert!(docs.iter().all(|&doc| doc < max_doc)); - let mut bitset = BitSet::with_max_value(max_doc); - for &doc in docs { - bitset.insert(doc); + assert!(not_alive_docs.iter().all(|&doc| doc < max_doc)); + let mut bitset = BitSet::with_max_value_and_filled(max_doc); + for &doc in not_alive_docs { + bitset.remove(doc); } let directory = RamDirectory::create(); let path = Path::new("dummydeletebitset"); @@ -43,13 +43,11 @@ impl DeleteBitSet { } /// Opens a delete bitset given its file. 
- pub fn open(file: FileSlice) -> crate::Result { + pub fn open(file: FileSlice) -> crate::Result { let bytes = file.read_bytes()?; - let num_deleted = BitSet::iter_from_bytes(bytes.as_slice()) - .map(|tinyset| tinyset.len() as usize) - .sum(); + let num_deleted = BitSet::count_unset_from_bytes(bytes.as_slice()); - Ok(DeleteBitSet { + Ok(AliveBitSet { data: bytes, num_deleted, }) @@ -65,7 +63,7 @@ impl DeleteBitSet { #[inline] pub fn is_deleted(&self, doc: DocId) -> bool { let data = self.data.as_slice(); - BitSet::contains_from_bytes(doc, data) + !BitSet::contains_from_bytes(doc, data) } /// Iterate over the positions of the set elements @@ -88,11 +86,11 @@ impl DeleteBitSet { #[cfg(test)] mod tests { - use super::DeleteBitSet; + use super::AliveBitSet; #[test] fn test_delete_bitset_empty() { - let delete_bitset = DeleteBitSet::for_test(&[], 10); + let delete_bitset = AliveBitSet::for_test(&[], 10); for doc in 0..10 { assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc)); } @@ -101,7 +99,7 @@ mod tests { #[test] fn test_delete_bitset() { - let delete_bitset = DeleteBitSet::for_test(&[1, 9], 10); + let delete_bitset = AliveBitSet::for_test(&[1, 9], 10); assert!(delete_bitset.is_alive(0)); assert!(delete_bitset.is_deleted(1)); assert!(delete_bitset.is_alive(2)); @@ -121,7 +119,7 @@ mod tests { #[test] fn test_delete_bitset_iter_minimal() { - let delete_bitset = DeleteBitSet::for_test(&[7], 8); + let delete_bitset = AliveBitSet::for_test(&[7], 8); let data: Vec<_> = delete_bitset.iter_unset().collect(); assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]); @@ -129,14 +127,14 @@ mod tests { #[test] fn test_delete_bitset_iter_small() { - let delete_bitset = DeleteBitSet::for_test(&[0, 2, 3, 6], 7); + let delete_bitset = AliveBitSet::for_test(&[0, 2, 3, 6], 7); let data: Vec<_> = delete_bitset.iter_unset().collect(); assert_eq!(data, vec![1, 4, 5]); } #[test] fn test_delete_bitset_iter() { - let delete_bitset = DeleteBitSet::for_test(&[0, 1, 1000], 1001); + let 
delete_bitset = AliveBitSet::for_test(&[0, 1, 1000], 1001); let data: Vec<_> = delete_bitset.iter_unset().collect(); assert_eq!(data, (2..=999).collect::>()); @@ -146,16 +144,14 @@ mod tests { #[cfg(all(test, feature = "unstable"))] mod bench { - use super::DeleteBitSet; - use common::BitSet; + use super::AliveBitSet; use rand::prelude::IteratorRandom; - use rand::prelude::SliceRandom; use rand::thread_rng; use test::Bencher; - fn get_many_deleted() -> Vec { + fn get_alive() -> Vec { let mut data = (0..1_000_000_u32).collect::>(); - for _ in 0..(1_000_000) * 7 / 8 { + for _ in 0..(1_000_000) * 1 / 8 { remove_rand(&mut data); } data @@ -168,14 +164,14 @@ mod bench { #[bench] fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) { - let delete_bitset = DeleteBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000); + let delete_bitset = AliveBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000); bench.iter(|| delete_bitset.iter_unset().collect::>()); } #[bench] fn bench_deletebitset_access(bench: &mut Bencher) { - let delete_bitset = DeleteBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000); + let delete_bitset = AliveBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000); bench.iter(|| { (0..1_000_000_u32) @@ -186,14 +182,14 @@ mod bench { #[bench] fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) { - let delete_bitset = DeleteBitSet::for_test(&get_many_deleted(), 1_000_000); + let delete_bitset = AliveBitSet::for_test(&get_alive(), 1_000_000); bench.iter(|| delete_bitset.iter_unset().collect::>()); } #[bench] fn bench_deletebitset_access_1_8_alive(bench: &mut Bencher) { - let delete_bitset = DeleteBitSet::for_test(&get_many_deleted(), 1_000_000); + let delete_bitset = AliveBitSet::for_test(&get_alive(), 1_000_000); bench.iter(|| { (0..1_000_000_u32) diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index a3dc8c17f..73a3a475c 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -23,9 +23,9 @@ values stored. 
Read access performance is comparable to that of an array lookup. */ +pub use self::alive_bitset::write_delete_bitset; +pub use self::alive_bitset::AliveBitSet; pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter}; -pub use self::delete::write_delete_bitset; -pub use self::delete::DeleteBitSet; pub use self::error::{FastFieldNotAvailableError, Result}; pub use self::facet_reader::FacetReader; pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter}; @@ -46,8 +46,8 @@ use crate::{ schema::Type, }; +mod alive_bitset; mod bytes; -mod delete; mod error; mod facet_reader; mod multivalued; diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index c42b87080..e4623e548 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -114,7 +114,7 @@ fn compute_deleted_bitset( let mut doc_matching_deleted_term = docset.doc(); while doc_matching_deleted_term != TERMINATED { if doc_opstamps.is_deleted(doc_matching_deleted_term, delete_op.opstamp) { - delete_bitset.insert(doc_matching_deleted_term); + delete_bitset.remove(doc_matching_deleted_term); might_have_changed = true; } doc_matching_deleted_term = docset.advance(); @@ -151,7 +151,7 @@ pub(crate) fn advance_deletes( let max_doc = segment_reader.max_doc(); let mut delete_bitset: BitSet = match segment_entry.delete_bitset() { Some(previous_delete_bitset) => (*previous_delete_bitset).clone(), - None => BitSet::with_max_value(max_doc), + None => BitSet::with_max_value_and_filled(max_doc), }; let num_deleted_docs_before = segment.meta().num_deleted_docs(); @@ -170,12 +170,13 @@ pub(crate) fn advance_deletes( if let Some(seg_delete_bitset) = segment_reader.delete_bitset() { for doc in 0u32..max_doc { if seg_delete_bitset.is_deleted(doc) { - delete_bitset.insert(doc); + delete_bitset.remove(doc); } } } - let num_deleted_docs: u32 = delete_bitset.len() as u32; + let num_alive_docs: u32 = delete_bitset.num_set_bits() as u32; + let num_deleted_docs = max_doc 
- num_alive_docs; if num_deleted_docs > num_deleted_docs_before { // There are new deletes. We need to write a new delete file. segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp); @@ -259,7 +260,7 @@ fn apply_deletes( let doc_to_opstamps = DocToOpstampMapping::WithMap(doc_opstamps); let max_doc = segment.meta().max_doc(); - let mut deleted_bitset = BitSet::with_max_value(max_doc); + let mut deleted_bitset = BitSet::with_max_value_and_filled(max_doc); let may_have_deletes = compute_deleted_bitset( &mut deleted_bitset, &segment_reader, diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 4d69b0915..a34fd541c 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -1,6 +1,5 @@ use crate::error::DataCorruption; use crate::fastfield::CompositeFastFieldSerializer; -use crate::fastfield::DeleteBitSet; use crate::fastfield::DynamicFastFieldReader; use crate::fastfield::FastFieldDataAccess; use crate::fastfield::FastFieldReader; diff --git a/src/indexer/merger_sorted_index_test.rs b/src/indexer/merger_sorted_index_test.rs index d06f0b40d..a9950affa 100644 --- a/src/indexer/merger_sorted_index_test.rs +++ b/src/indexer/merger_sorted_index_test.rs @@ -1,6 +1,6 @@ #[cfg(test)] mod tests { - use crate::fastfield::{DeleteBitSet, FastFieldReader}; + use crate::fastfield::{AliveBitSet, FastFieldReader}; use crate::schema::IndexRecordOption; use crate::{ collector::TopDocs, @@ -257,7 +257,7 @@ mod tests { .unwrap(); assert_eq!(postings.doc_freq(), 2); - let fallback_bitset = DeleteBitSet::for_test(&[0], 100); + let fallback_bitset = AliveBitSet::for_test(&[0], 100); assert_eq!( postings.doc_freq_given_deletes( segment_reader.delete_bitset().unwrap_or(&fallback_bitset) @@ -336,7 +336,7 @@ mod tests { .unwrap() .unwrap(); assert_eq!(postings.doc_freq(), 2); - let fallback_bitset = DeleteBitSet::for_test(&[0], 100); + let fallback_bitset = AliveBitSet::for_test(&[0], 100); assert_eq!( postings.doc_freq_given_deletes( 
segment_reader.delete_bitset().unwrap_or(&fallback_bitset) @@ -446,7 +446,7 @@ mod tests { .unwrap(); assert_eq!(postings.doc_freq(), 2); - let fallback_bitset = DeleteBitSet::for_test(&[0], 100); + let fallback_bitset = AliveBitSet::for_test(&[0], 100); assert_eq!( postings.doc_freq_given_deletes( segment_reader.delete_bitset().unwrap_or(&fallback_bitset) diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index aa470d99f..753737d51 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -1,5 +1,5 @@ use crate::docset::DocSet; -use crate::fastfield::DeleteBitSet; +use crate::fastfield::AliveBitSet; use crate::positions::PositionReader; use crate::postings::branchless_binary_search; use crate::postings::compression::COMPRESSION_BLOCK_SIZE; @@ -34,7 +34,7 @@ impl SegmentPostings { /// /// This method will clone and scan through the posting lists. /// (this is a rather expensive operation). - pub fn doc_freq_given_deletes(&self, delete_bitset: &DeleteBitSet) -> u32 { + pub fn doc_freq_given_deletes(&self, delete_bitset: &AliveBitSet) -> u32 { let mut docset = self.clone(); let mut doc_freq = 0; loop { @@ -268,7 +268,7 @@ mod tests { use common::HasLen; use crate::docset::{DocSet, TERMINATED}; - use crate::fastfield::DeleteBitSet; + use crate::fastfield::AliveBitSet; use crate::postings::postings::Postings; #[test] @@ -296,9 +296,9 @@ mod tests { fn test_doc_freq() { let docs = SegmentPostings::create_from_docs(&[0, 2, 10]); assert_eq!(docs.doc_freq(), 3); - let delete_bitset = DeleteBitSet::for_test(&[2], 12); + let delete_bitset = AliveBitSet::for_test(&[2], 12); assert_eq!(docs.doc_freq_given_deletes(&delete_bitset), 2); - let all_deleted = DeleteBitSet::for_test(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12); + let all_deleted = AliveBitSet::for_test(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12); assert_eq!(docs.doc_freq_given_deletes(&all_deleted), 0); } } diff --git a/src/query/bitset/mod.rs 
b/src/query/bitset/mod.rs index 030fdeae7..ebd6e7b36 100644 --- a/src/query/bitset/mod.rs +++ b/src/query/bitset/mod.rs @@ -90,7 +90,7 @@ impl DocSet for BitSetDocSet { /// but we don't have access to any better /// value. fn size_hint(&self) -> u32 { - self.docs.len() as u32 + self.docs.num_set_bits() as u32 } } @@ -124,7 +124,7 @@ mod tests { for i in 0..100_000 { assert_eq!(btreeset.contains(&i), bitset.contains(i)); } - assert_eq!(btreeset.len(), bitset.len()); + assert_eq!(btreeset.len(), bitset.num_set_bits()); let mut bitset_docset = BitSetDocSet::from(bitset); let mut remaining = true; for el in btreeset.into_iter() { diff --git a/src/query/boost_query.rs b/src/query/boost_query.rs index 41c94e0f8..72bd3ebba 100644 --- a/src/query/boost_query.rs +++ b/src/query/boost_query.rs @@ -1,4 +1,4 @@ -use crate::fastfield::DeleteBitSet; +use crate::fastfield::AliveBitSet; use crate::query::explanation::does_not_match; use crate::query::{Explanation, Query, Scorer, Weight}; use crate::{DocId, DocSet, Score, Searcher, SegmentReader, Term}; @@ -118,7 +118,7 @@ impl DocSet for BoostScorer { self.underlying.size_hint() } - fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 { + fn count(&mut self, delete_bitset: &AliveBitSet) -> u32 { self.underlying.count(delete_bitset) } diff --git a/src/store/mod.rs b/src/store/mod.rs index 00e0c4b13..0ad341473 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -57,7 +57,7 @@ pub mod tests { use futures::executor::block_on; use super::*; - use crate::fastfield::DeleteBitSet; + use crate::fastfield::AliveBitSet; use crate::schema::{self, FieldValue, TextFieldIndexing, STORED, TEXT}; use crate::schema::{Document, TextOptions}; use crate::{ @@ -113,7 +113,7 @@ pub mod tests { fn test_doc_store_iter_with_delete_bug_1077() -> crate::Result<()> { // this will cover deletion of the first element in a checkpoint let deleted_docids = (200..300).collect::>(); - let delete_bitset = DeleteBitSet::for_test(&deleted_docids, NUM_DOCS as 
u32); + let delete_bitset = AliveBitSet::for_test(&deleted_docids, NUM_DOCS as u32); let path = Path::new("store"); let directory = RamDirectory::create(); diff --git a/src/store/reader.rs b/src/store/reader.rs index 3ff04f691..75012718d 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -5,7 +5,7 @@ use crate::schema::Document; use crate::space_usage::StoreSpaceUsage; use crate::store::index::Checkpoint; use crate::DocId; -use crate::{error::DataCorruption, fastfield::DeleteBitSet}; +use crate::{error::DataCorruption, fastfield::AliveBitSet}; use common::{BinarySerializable, HasLen, VInt}; use lru::LruCache; use std::io; @@ -136,7 +136,7 @@ impl StoreReader { /// The delete_bitset has to be forwarded from the `SegmentReader` or the results maybe wrong. pub fn iter<'a: 'b, 'b>( &'b self, - delete_bitset: Option<&'a DeleteBitSet>, + delete_bitset: Option<&'a AliveBitSet>, ) -> impl Iterator> + 'b { self.iter_raw(delete_bitset).map(|doc_bytes_res| { let mut doc_bytes = doc_bytes_res?; @@ -149,7 +149,7 @@ impl StoreReader { /// The delete_bitset has to be forwarded from the `SegmentReader` or the results maybe wrong. 
pub(crate) fn iter_raw<'a: 'b, 'b>( &'b self, - delete_bitset: Option<&'a DeleteBitSet>, + delete_bitset: Option<&'a AliveBitSet>, ) -> impl Iterator> + 'b { let last_docid = self .block_checkpoints() From d7a6a409a108a3c539ea074f1edc3e2c335a8687 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 23 Sep 2021 20:33:11 +0800 Subject: [PATCH 08/13] renames --- src/collector/mod.rs | 4 +- src/collector/top_score_collector.rs | 4 +- src/core/segment_reader.rs | 26 ++++---- src/docset.rs | 12 ++-- src/fastfield/alive_bitset.rs | 84 ++++++++++++------------- src/fastfield/mod.rs | 2 +- src/indexer/index_writer.rs | 30 ++++----- src/indexer/merger.rs | 16 ++--- src/indexer/merger_sorted_index_test.rs | 6 +- src/indexer/segment_entry.rs | 16 ++--- src/postings/segment_postings.rs | 8 +-- src/query/boost_query.rs | 4 +- src/query/term_query/term_weight.rs | 4 +- src/query/weight.rs | 4 +- src/store/mod.rs | 10 +-- src/store/reader.rs | 12 ++-- 16 files changed, 121 insertions(+), 121 deletions(-) diff --git a/src/collector/mod.rs b/src/collector/mod.rs index c7e64f004..bb409fb79 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -178,9 +178,9 @@ pub trait Collector: Sync + Send { ) -> crate::Result<::Fruit> { let mut segment_collector = self.for_segment(segment_ord as u32, reader)?; - if let Some(delete_bitset) = reader.delete_bitset() { + if let Some(alive_bitset) = reader.alive_bitset() { weight.for_each(reader, &mut |doc, score| { - if delete_bitset.is_alive(doc) { + if alive_bitset.is_alive(doc) { segment_collector.collect(doc, score); } })?; diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs index b1786b77c..51d0a5801 100644 --- a/src/collector/top_score_collector.rs +++ b/src/collector/top_score_collector.rs @@ -629,10 +629,10 @@ impl Collector for TopDocs { let heap_len = self.0.limit + self.0.offset; let mut heap: BinaryHeap> = BinaryHeap::with_capacity(heap_len); - if let Some(delete_bitset) = 
reader.delete_bitset() { + if let Some(alive_bitset) = reader.alive_bitset() { let mut threshold = Score::MIN; weight.for_each_pruning(threshold, reader, &mut |doc, score| { - if delete_bitset.is_deleted(doc) { + if alive_bitset.is_deleted(doc) { return threshold; } let heap_item = ComparableDoc { diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 10b224e33..8ec303b6e 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -47,7 +47,7 @@ pub struct SegmentReader { fieldnorm_readers: FieldNormReaders, store_file: FileSlice, - delete_bitset_opt: Option, + alive_bitset_opt: Option, schema: Schema, } @@ -72,14 +72,14 @@ impl SegmentReader { /// Return the number of documents that have been /// deleted in the segment. pub fn num_deleted_docs(&self) -> DocId { - self.delete_bitset() + self.alive_bitset() .map(|delete_set| delete_set.num_deleted() as DocId) .unwrap_or(0u32) } /// Returns true iff some of the documents of the segment have been deleted. pub fn has_deletes(&self) -> bool { - self.delete_bitset().is_some() + self.alive_bitset().is_some() } /// Accessor to a segment's fast field reader given a field. @@ -170,10 +170,10 @@ impl SegmentReader { let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?; let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?; - let delete_bitset_opt = if segment.meta().has_deletes() { + let alive_bitset_opt = if segment.meta().has_deletes() { let delete_data = segment.open_read(SegmentComponent::Delete)?; - let delete_bitset = AliveBitSet::open(delete_data)?; - Some(delete_bitset) + let alive_bitset = AliveBitSet::open(delete_data)?; + Some(alive_bitset) } else { None }; @@ -188,7 +188,7 @@ impl SegmentReader { fieldnorm_readers, segment_id: segment.id(), store_file, - delete_bitset_opt, + alive_bitset_opt, positions_composite, schema, }) @@ -274,22 +274,22 @@ impl SegmentReader { /// Returns the bitset representing /// the documents that have been deleted. 
- pub fn delete_bitset(&self) -> Option<&AliveBitSet> { - self.delete_bitset_opt.as_ref() + pub fn alive_bitset(&self) -> Option<&AliveBitSet> { + self.alive_bitset_opt.as_ref() } /// Returns true iff the `doc` is marked /// as deleted. pub fn is_deleted(&self, doc: DocId) -> bool { - self.delete_bitset() + self.alive_bitset() .map(|delete_set| delete_set.is_deleted(doc)) .unwrap_or(false) } /// Returns an iterator that will iterate over the alive document ids pub fn doc_ids_alive(&self) -> Box + '_> { - if let Some(delete_bitset) = &self.delete_bitset_opt { - Box::new(delete_bitset.iter_unset()) + if let Some(alive_bitset) = &self.alive_bitset_opt { + Box::new(alive_bitset.iter_unset()) } else { Box::new(0u32..self.max_doc) } @@ -305,7 +305,7 @@ impl SegmentReader { self.fast_fields_readers.space_usage(), self.fieldnorm_readers.space_usage(), self.get_store_reader()?.space_usage(), - self.delete_bitset_opt + self.alive_bitset_opt .as_ref() .map(AliveBitSet::space_usage) .unwrap_or(0), diff --git a/src/docset.rs b/src/docset.rs index 72352e689..0df231e23 100644 --- a/src/docset.rs +++ b/src/docset.rs @@ -85,11 +85,11 @@ pub trait DocSet: Send { /// Returns the number documents matching. /// Calling this method consumes the `DocSet`. 
- fn count(&mut self, delete_bitset: &AliveBitSet) -> u32 { + fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 { let mut count = 0u32; let mut doc = self.doc(); while doc != TERMINATED { - if !delete_bitset.is_deleted(doc) { + if !alive_bitset.is_deleted(doc) { count += 1u32; } doc = self.advance(); @@ -130,8 +130,8 @@ impl<'a> DocSet for &'a mut dyn DocSet { (**self).size_hint() } - fn count(&mut self, delete_bitset: &AliveBitSet) -> u32 { - (**self).count(delete_bitset) + fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 { + (**self).count(alive_bitset) } fn count_including_deleted(&mut self) -> u32 { @@ -160,9 +160,9 @@ impl DocSet for Box { unboxed.size_hint() } - fn count(&mut self, delete_bitset: &AliveBitSet) -> u32 { + fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 { let unboxed: &mut TDocSet = self.borrow_mut(); - unboxed.count(delete_bitset) + unboxed.count(alive_bitset) } fn count_including_deleted(&mut self) -> u32 { diff --git a/src/fastfield/alive_bitset.rs b/src/fastfield/alive_bitset.rs index 7cc098004..3f8136c23 100644 --- a/src/fastfield/alive_bitset.rs +++ b/src/fastfield/alive_bitset.rs @@ -6,17 +6,17 @@ use common::BitSet; use std::io; use std::io::Write; -/// Write a delete `BitSet` +/// Write a alive `BitSet` /// -/// where `delete_bitset` is the set of deleted `DocId`. +/// where `alive_bitset` is the set of alive `DocId`. /// Warning: this function does not call terminate. The caller is in charge of /// closing the writer properly. -pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut T) -> io::Result<()> { - delete_bitset.serialize(writer)?; +pub fn write_alive_bitset(alive_bitset: &BitSet, writer: &mut T) -> io::Result<()> { + alive_bitset.serialize(writer)?; Ok(()) } -/// Set of deleted `DocId`s. +/// Set of alive `DocId`s. 
#[derive(Clone)] pub struct AliveBitSet { data: OwnedBytes, @@ -36,7 +36,7 @@ impl AliveBitSet { let directory = RamDirectory::create(); let path = Path::new("dummydeletebitset"); let mut wrt = directory.open_write(path).unwrap(); - write_delete_bitset(&bitset, &mut wrt).unwrap(); + write_alive_bitset(&bitset, &mut wrt).unwrap(); wrt.terminate().unwrap(); let file = directory.open_read(path).unwrap(); Self::open(file).unwrap() @@ -89,54 +89,54 @@ mod tests { use super::AliveBitSet; #[test] - fn test_delete_bitset_empty() { - let delete_bitset = AliveBitSet::for_test(&[], 10); + fn test_alive_bitset_empty() { + let alive_bitset = AliveBitSet::for_test(&[], 10); for doc in 0..10 { - assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc)); + assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc)); } - assert_eq!(delete_bitset.num_deleted(), 0); + assert_eq!(alive_bitset.num_deleted(), 0); } #[test] - fn test_delete_bitset() { - let delete_bitset = AliveBitSet::for_test(&[1, 9], 10); - assert!(delete_bitset.is_alive(0)); - assert!(delete_bitset.is_deleted(1)); - assert!(delete_bitset.is_alive(2)); - assert!(delete_bitset.is_alive(3)); - assert!(delete_bitset.is_alive(4)); - assert!(delete_bitset.is_alive(5)); - assert!(delete_bitset.is_alive(6)); - assert!(delete_bitset.is_alive(6)); - assert!(delete_bitset.is_alive(7)); - assert!(delete_bitset.is_alive(8)); - assert!(delete_bitset.is_deleted(9)); + fn test_alive_bitset() { + let alive_bitset = AliveBitSet::for_test(&[1, 9], 10); + assert!(alive_bitset.is_alive(0)); + assert!(alive_bitset.is_deleted(1)); + assert!(alive_bitset.is_alive(2)); + assert!(alive_bitset.is_alive(3)); + assert!(alive_bitset.is_alive(4)); + assert!(alive_bitset.is_alive(5)); + assert!(alive_bitset.is_alive(6)); + assert!(alive_bitset.is_alive(6)); + assert!(alive_bitset.is_alive(7)); + assert!(alive_bitset.is_alive(8)); + assert!(alive_bitset.is_deleted(9)); for doc in 0..10 { - 
assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc)); + assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc)); } - assert_eq!(delete_bitset.num_deleted(), 2); + assert_eq!(alive_bitset.num_deleted(), 2); } #[test] - fn test_delete_bitset_iter_minimal() { - let delete_bitset = AliveBitSet::for_test(&[7], 8); + fn test_alive_bitset_iter_minimal() { + let alive_bitset = AliveBitSet::for_test(&[7], 8); - let data: Vec<_> = delete_bitset.iter_unset().collect(); + let data: Vec<_> = alive_bitset.iter_unset().collect(); assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]); } #[test] - fn test_delete_bitset_iter_small() { - let delete_bitset = AliveBitSet::for_test(&[0, 2, 3, 6], 7); + fn test_alive_bitset_iter_small() { + let alive_bitset = AliveBitSet::for_test(&[0, 2, 3, 6], 7); - let data: Vec<_> = delete_bitset.iter_unset().collect(); + let data: Vec<_> = alive_bitset.iter_unset().collect(); assert_eq!(data, vec![1, 4, 5]); } #[test] - fn test_delete_bitset_iter() { - let delete_bitset = AliveBitSet::for_test(&[0, 1, 1000], 1001); + fn test_alive_bitset_iter() { + let alive_bitset = AliveBitSet::for_test(&[0, 1, 1000], 1001); - let data: Vec<_> = delete_bitset.iter_unset().collect(); + let data: Vec<_> = alive_bitset.iter_unset().collect(); assert_eq!(data, (2..=999).collect::>()); } } @@ -164,36 +164,36 @@ mod bench { #[bench] fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) { - let delete_bitset = AliveBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000); + let alive_bitset = AliveBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000); - bench.iter(|| delete_bitset.iter_unset().collect::>()); + bench.iter(|| alive_bitset.iter_unset().collect::>()); } #[bench] fn bench_deletebitset_access(bench: &mut Bencher) { - let delete_bitset = AliveBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000); + let alive_bitset = AliveBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000); bench.iter(|| { (0..1_000_000_u32) - .filter(|doc| 
delete_bitset.is_alive(*doc)) + .filter(|doc| alive_bitset.is_alive(*doc)) .collect::>() }); } #[bench] fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) { - let delete_bitset = AliveBitSet::for_test(&get_alive(), 1_000_000); + let alive_bitset = AliveBitSet::for_test(&get_alive(), 1_000_000); - bench.iter(|| delete_bitset.iter_unset().collect::>()); + bench.iter(|| alive_bitset.iter_unset().collect::>()); } #[bench] fn bench_deletebitset_access_1_8_alive(bench: &mut Bencher) { - let delete_bitset = AliveBitSet::for_test(&get_alive(), 1_000_000); + let alive_bitset = AliveBitSet::for_test(&get_alive(), 1_000_000); bench.iter(|| { (0..1_000_000_u32) - .filter(|doc| delete_bitset.is_alive(*doc)) + .filter(|doc| alive_bitset.is_alive(*doc)) .collect::>() }); } diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 73a3a475c..dd100074c 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -23,7 +23,7 @@ values stored. Read access performance is comparable to that of an array lookup. 
*/ -pub use self::alive_bitset::write_delete_bitset; +pub use self::alive_bitset::write_alive_bitset; pub use self::alive_bitset::AliveBitSet; pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter}; pub use self::error::{FastFieldNotAvailableError, Result}; diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index e4623e548..ff45df9d3 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -11,7 +11,7 @@ use crate::directory::TerminatingWrite; use crate::directory::{DirectoryLock, GarbageCollectionResult}; use crate::docset::{DocSet, TERMINATED}; use crate::error::TantivyError; -use crate::fastfield::write_delete_bitset; +use crate::fastfield::write_alive_bitset; use crate::indexer::delete_queue::{DeleteCursor, DeleteQueue}; use crate::indexer::doc_opstamp_mapping::DocToOpstampMapping; use crate::indexer::operation::DeleteOperation; @@ -93,7 +93,7 @@ pub struct IndexWriter { } fn compute_deleted_bitset( - delete_bitset: &mut BitSet, + alive_bitset: &mut BitSet, segment_reader: &SegmentReader, delete_cursor: &mut DeleteCursor, doc_opstamps: &DocToOpstampMapping, @@ -114,7 +114,7 @@ fn compute_deleted_bitset( let mut doc_matching_deleted_term = docset.doc(); while doc_matching_deleted_term != TERMINATED { if doc_opstamps.is_deleted(doc_matching_deleted_term, delete_op.opstamp) { - delete_bitset.remove(doc_matching_deleted_term); + alive_bitset.remove(doc_matching_deleted_term); might_have_changed = true; } doc_matching_deleted_term = docset.advance(); @@ -141,7 +141,7 @@ pub(crate) fn advance_deletes( return Ok(()); } - if segment_entry.delete_bitset().is_none() && segment_entry.delete_cursor().get().is_none() { + if segment_entry.alive_bitset().is_none() && segment_entry.delete_cursor().get().is_none() { // There has been no `DeleteOperation` between the segment status and `target_opstamp`. 
return Ok(()); } @@ -149,15 +149,15 @@ pub(crate) fn advance_deletes( let segment_reader = SegmentReader::open(&segment)?; let max_doc = segment_reader.max_doc(); - let mut delete_bitset: BitSet = match segment_entry.delete_bitset() { - Some(previous_delete_bitset) => (*previous_delete_bitset).clone(), + let mut alive_bitset: BitSet = match segment_entry.alive_bitset() { + Some(previous_alive_bitset) => (*previous_alive_bitset).clone(), None => BitSet::with_max_value_and_filled(max_doc), }; let num_deleted_docs_before = segment.meta().num_deleted_docs(); compute_deleted_bitset( - &mut delete_bitset, + &mut alive_bitset, &segment_reader, segment_entry.delete_cursor(), &DocToOpstampMapping::None, @@ -167,21 +167,21 @@ pub(crate) fn advance_deletes( // TODO optimize // It should be possible to do something smarter by manipulation bitsets directly // to compute this union. - if let Some(seg_delete_bitset) = segment_reader.delete_bitset() { + if let Some(seg_alive_bitset) = segment_reader.alive_bitset() { for doc in 0u32..max_doc { - if seg_delete_bitset.is_deleted(doc) { - delete_bitset.remove(doc); + if seg_alive_bitset.is_deleted(doc) { + alive_bitset.remove(doc); } } } - let num_alive_docs: u32 = delete_bitset.num_set_bits() as u32; + let num_alive_docs: u32 = alive_bitset.num_set_bits() as u32; let num_deleted_docs = max_doc - num_alive_docs; if num_deleted_docs > num_deleted_docs_before { // There are new deletes. We need to write a new delete file. 
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp); let mut delete_file = segment.open_write(SegmentComponent::Delete)?; - write_delete_bitset(&delete_bitset, &mut delete_file)?; + write_alive_bitset(&alive_bitset, &mut delete_file)?; delete_file.terminate()?; } @@ -227,13 +227,13 @@ fn index_documents( let segment_with_max_doc = segment.with_max_doc(max_doc); - let delete_bitset_opt = + let alive_bitset_opt = apply_deletes(&segment_with_max_doc, &mut delete_cursor, &doc_opstamps)?; let meta = segment_with_max_doc.meta().clone(); meta.untrack_temp_docstore(); // update segment_updater inventory to remove tempstore - let segment_entry = SegmentEntry::new(meta, delete_cursor, delete_bitset_opt); + let segment_entry = SegmentEntry::new(meta, delete_cursor, alive_bitset_opt); block_on(segment_updater.schedule_add_segment(segment_entry))?; Ok(true) } @@ -1514,7 +1514,7 @@ mod tests { for segment_reader in searcher.segment_readers().iter() { let store_reader = segment_reader.get_store_reader().unwrap(); // test store iterator - for doc in store_reader.iter(segment_reader.delete_bitset()) { + for doc in store_reader.iter(segment_reader.alive_bitset()) { let id = doc .unwrap() .get_first(id_field) diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index a34fd541c..84151c8b1 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -101,7 +101,7 @@ fn compute_min_max_val( if segment_reader.max_doc() == 0 { None } else { - if segment_reader.delete_bitset().is_some() { + if segment_reader.alive_bitset().is_some() { // some deleted documents, // we need to recompute the max / min minmax( @@ -497,8 +497,8 @@ impl IndexMerger { // what should be the bit length use for bitpacking. 
let mut num_docs = 0; for (reader, u64s_reader) in reader_and_field_accessors.iter() { - if let Some(delete_bitset) = reader.delete_bitset() { - num_docs += reader.max_doc() as u64 - delete_bitset.num_deleted() as u64; + if let Some(alive_bitset) = reader.alive_bitset() { + num_docs += reader.max_doc() as u64 - alive_bitset.num_deleted() as u64; for doc in reader.doc_ids_alive() { let num_vals = u64s_reader.get_len(doc) as u64; total_num_vals += num_vals; @@ -888,9 +888,9 @@ impl IndexMerger { let inverted_index: &InvertedIndexReader = &*field_readers[segment_ord]; let segment_postings = inverted_index .read_postings_from_terminfo(&term_info, segment_postings_option)?; - let delete_bitset_opt = segment_reader.delete_bitset(); - let doc_freq = if let Some(delete_bitset) = delete_bitset_opt { - segment_postings.doc_freq_given_deletes(delete_bitset) + let alive_bitset_opt = segment_reader.alive_bitset(); + let doc_freq = if let Some(alive_bitset) = alive_bitset_opt { + segment_postings.doc_freq_given_deletes(alive_bitset) } else { segment_postings.doc_freq() }; @@ -1010,7 +1010,7 @@ impl IndexMerger { let mut document_iterators: Vec<_> = store_readers .iter() .enumerate() - .map(|(i, store)| store.iter_raw(self.readers[i].delete_bitset())) + .map(|(i, store)| store.iter_raw(self.readers[i].alive_bitset())) .collect(); if !doc_id_mapping.is_trivial() { for (old_doc_id, reader_with_ordinal) in doc_id_mapping.iter() { @@ -1046,7 +1046,7 @@ impl IndexMerger { || store_reader.block_checkpoints().take(7).count() < 6 || store_reader.compressor() != store_writer.compressor() { - for doc_bytes_res in store_reader.iter_raw(reader.delete_bitset()) { + for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) { let doc_bytes = doc_bytes_res?; store_writer.store_bytes(&doc_bytes)?; } diff --git a/src/indexer/merger_sorted_index_test.rs b/src/indexer/merger_sorted_index_test.rs index a9950affa..fd9b4883b 100644 --- a/src/indexer/merger_sorted_index_test.rs +++ 
b/src/indexer/merger_sorted_index_test.rs @@ -260,7 +260,7 @@ mod tests { let fallback_bitset = AliveBitSet::for_test(&[0], 100); assert_eq!( postings.doc_freq_given_deletes( - segment_reader.delete_bitset().unwrap_or(&fallback_bitset) + segment_reader.alive_bitset().unwrap_or(&fallback_bitset) ), 2 ); @@ -339,7 +339,7 @@ mod tests { let fallback_bitset = AliveBitSet::for_test(&[0], 100); assert_eq!( postings.doc_freq_given_deletes( - segment_reader.delete_bitset().unwrap_or(&fallback_bitset) + segment_reader.alive_bitset().unwrap_or(&fallback_bitset) ), 2 ); @@ -449,7 +449,7 @@ mod tests { let fallback_bitset = AliveBitSet::for_test(&[0], 100); assert_eq!( postings.doc_freq_given_deletes( - segment_reader.delete_bitset().unwrap_or(&fallback_bitset) + segment_reader.alive_bitset().unwrap_or(&fallback_bitset) ), 2 ); diff --git a/src/indexer/segment_entry.rs b/src/indexer/segment_entry.rs index e0beb2179..b7cae25c1 100644 --- a/src/indexer/segment_entry.rs +++ b/src/indexer/segment_entry.rs @@ -9,16 +9,16 @@ use std::fmt; /// /// In addition to segment `meta`, /// it contains a few transient states -/// - `delete_bitset` is a bitset describing -/// documents that were deleted during the commit +/// - `alive_bitset` is a bitset describing +/// documents that were alive during the commit /// itself. /// - `delete_cursor` is the position in the delete queue. /// Deletes happening before the cursor are reflected either -/// in the .del file or in the `delete_bitset`. +/// in the .del file or in the `alive_bitset`. 
#[derive(Clone)] pub struct SegmentEntry { meta: SegmentMeta, - delete_bitset: Option, + alive_bitset: Option, delete_cursor: DeleteCursor, } @@ -27,11 +27,11 @@ impl SegmentEntry { pub fn new( segment_meta: SegmentMeta, delete_cursor: DeleteCursor, - delete_bitset: Option, + alive_bitset: Option, ) -> SegmentEntry { SegmentEntry { meta: segment_meta, - delete_bitset, + alive_bitset, delete_cursor, } } @@ -39,8 +39,8 @@ impl SegmentEntry { /// Return a reference to the segment entry deleted bitset. /// /// `DocId` in this bitset are flagged as deleted. - pub fn delete_bitset(&self) -> Option<&BitSet> { - self.delete_bitset.as_ref() + pub fn alive_bitset(&self) -> Option<&BitSet> { + self.alive_bitset.as_ref() } /// Set the `SegmentMeta` for this segment. diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 753737d51..f5e383c37 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -34,7 +34,7 @@ impl SegmentPostings { /// /// This method will clone and scan through the posting lists. /// (this is a rather expensive operation). 
- pub fn doc_freq_given_deletes(&self, delete_bitset: &AliveBitSet) -> u32 { + pub fn doc_freq_given_deletes(&self, alive_bitset: &AliveBitSet) -> u32 { let mut docset = self.clone(); let mut doc_freq = 0; loop { @@ -42,7 +42,7 @@ impl SegmentPostings { if doc == TERMINATED { return doc_freq; } - if delete_bitset.is_alive(doc) { + if alive_bitset.is_alive(doc) { doc_freq += 1u32; } docset.advance(); @@ -296,8 +296,8 @@ mod tests { fn test_doc_freq() { let docs = SegmentPostings::create_from_docs(&[0, 2, 10]); assert_eq!(docs.doc_freq(), 3); - let delete_bitset = AliveBitSet::for_test(&[2], 12); - assert_eq!(docs.doc_freq_given_deletes(&delete_bitset), 2); + let alive_bitset = AliveBitSet::for_test(&[2], 12); + assert_eq!(docs.doc_freq_given_deletes(&alive_bitset), 2); let all_deleted = AliveBitSet::for_test(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12); assert_eq!(docs.doc_freq_given_deletes(&all_deleted), 0); } diff --git a/src/query/boost_query.rs b/src/query/boost_query.rs index 72bd3ebba..d7eee9efe 100644 --- a/src/query/boost_query.rs +++ b/src/query/boost_query.rs @@ -118,8 +118,8 @@ impl DocSet for BoostScorer { self.underlying.size_hint() } - fn count(&mut self, delete_bitset: &AliveBitSet) -> u32 { - self.underlying.count(delete_bitset) + fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 { + self.underlying.count(alive_bitset) } fn count_including_deleted(&mut self) -> u32 { diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 877243f8a..51779124b 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -40,8 +40,8 @@ impl Weight for TermWeight { } fn count(&self, reader: &SegmentReader) -> crate::Result { - if let Some(delete_bitset) = reader.delete_bitset() { - Ok(self.scorer(reader, 1.0)?.count(delete_bitset)) + if let Some(alive_bitset) = reader.alive_bitset() { + Ok(self.scorer(reader, 1.0)?.count(alive_bitset)) } else { let field = self.term.field(); let inv_index = 
reader.inverted_index(field)?; diff --git a/src/query/weight.rs b/src/query/weight.rs index 772846e3e..3a2ff3d33 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -59,8 +59,8 @@ pub trait Weight: Send + Sync + 'static { /// Returns the number documents within the given `SegmentReader`. fn count(&self, reader: &SegmentReader) -> crate::Result { let mut scorer = self.scorer(reader, 1.0)?; - if let Some(delete_bitset) = reader.delete_bitset() { - Ok(scorer.count(delete_bitset)) + if let Some(alive_bitset) = reader.alive_bitset() { + Ok(scorer.count(alive_bitset)) } else { Ok(scorer.count_including_deleted()) } diff --git a/src/store/mod.rs b/src/store/mod.rs index 0ad341473..364ed5a92 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -113,7 +113,7 @@ pub mod tests { fn test_doc_store_iter_with_delete_bug_1077() -> crate::Result<()> { // this will cover deletion of the first element in a checkpoint let deleted_docids = (200..300).collect::>(); - let delete_bitset = AliveBitSet::for_test(&deleted_docids, NUM_DOCS as u32); + let alive_bitset = AliveBitSet::for_test(&deleted_docids, NUM_DOCS as u32); let path = Path::new("store"); let directory = RamDirectory::create(); @@ -134,7 +134,7 @@ pub mod tests { ); } - for (_, doc) in store.iter(Some(&delete_bitset)).enumerate() { + for (_, doc) in store.iter(Some(&alive_bitset)).enumerate() { let doc = doc?; let title_content = doc.get_first(field_title).unwrap().text().unwrap(); if !title_content.starts_with("Doc ") { @@ -146,7 +146,7 @@ pub mod tests { .unwrap() .parse::() .unwrap(); - if delete_bitset.is_deleted(id) { + if alive_bitset.is_deleted(id) { panic!("unexpected deleted document {}", id); } } @@ -230,7 +230,7 @@ pub mod tests { let searcher = index.reader().unwrap().searcher(); let reader = searcher.segment_reader(0); let store = reader.get_store_reader().unwrap(); - for doc in store.iter(reader.delete_bitset()) { + for doc in store.iter(reader.alive_bitset()) { assert_eq!( 
*doc?.get_first(text_field).unwrap().text().unwrap(), "deletemenot".to_string() @@ -288,7 +288,7 @@ pub mod tests { let reader = searcher.segment_readers().iter().last().unwrap(); let store = reader.get_store_reader().unwrap(); - for doc in store.iter(reader.delete_bitset()).take(50) { + for doc in store.iter(reader.alive_bitset()).take(50) { assert_eq!( *doc?.get_first(text_field).unwrap().text().unwrap(), LOREM.to_string() diff --git a/src/store/reader.rs b/src/store/reader.rs index 75012718d..98c127d97 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -133,12 +133,12 @@ impl StoreReader { /// Iterator over all Documents in their order as they are stored in the doc store. /// Use this, if you want to extract all Documents from the doc store. - /// The delete_bitset has to be forwarded from the `SegmentReader` or the results maybe wrong. + /// The alive_bitset has to be forwarded from the `SegmentReader` or the results maybe wrong. pub fn iter<'a: 'b, 'b>( &'b self, - delete_bitset: Option<&'a AliveBitSet>, + alive_bitset: Option<&'a AliveBitSet>, ) -> impl Iterator> + 'b { - self.iter_raw(delete_bitset).map(|doc_bytes_res| { + self.iter_raw(alive_bitset).map(|doc_bytes_res| { let mut doc_bytes = doc_bytes_res?; Ok(Document::deserialize(&mut doc_bytes)?) }) @@ -146,10 +146,10 @@ impl StoreReader { /// Iterator over all RawDocuments in their order as they are stored in the doc store. /// Use this, if you want to extract all Documents from the doc store. - /// The delete_bitset has to be forwarded from the `SegmentReader` or the results maybe wrong. + /// The alive_bitset has to be forwarded from the `SegmentReader` or the results maybe wrong. 
pub(crate) fn iter_raw<'a: 'b, 'b>( &'b self, - delete_bitset: Option<&'a AliveBitSet>, + alive_bitset: Option<&'a AliveBitSet>, ) -> impl Iterator> + 'b { let last_docid = self .block_checkpoints() @@ -179,7 +179,7 @@ impl StoreReader { num_skipped = 0; } - let alive = delete_bitset.map_or(true, |bitset| bitset.is_alive(doc_id)); + let alive = alive_bitset.map_or(true, |bitset| bitset.is_alive(doc_id)); if alive { let ret = Some((curr_block.clone(), num_skipped, reset_block_pos)); // the map block will move over the num_skipped, so we reset to 0 From c27ccd3e241edee74a44b073b1717459e0f96aaf Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 23 Sep 2021 21:02:09 +0800 Subject: [PATCH 09/13] improve naming --- common/src/bitset.rs | 20 ++++++++++---------- src/query/union.rs | 8 ++++++-- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/common/src/bitset.rs b/common/src/bitset.rs index 527abed9b..84c936a61 100644 --- a/common/src/bitset.rs +++ b/common/src/bitset.rs @@ -64,19 +64,19 @@ impl TinySet { } #[inline] - /// Returns true iff the `TinySet` contains the element `el`. - pub fn contains(self, el: u32) -> bool { - !self.intersect(TinySet::singleton(el)).is_empty() + /// Returns true iff the `TinySet` bit is set at position `pos`. + pub fn contains(self, pos: u32) -> bool { + !self.intersect(TinySet::singleton(pos)).is_empty() } #[inline] - /// Returns the number of elements in the TinySet. - pub fn len(self) -> u32 { + /// Returns the number of set bits in the TinySet. + pub fn num_set(self) -> u32 { self.0.count_ones() } #[inline] - /// Returns the number of elements in the TinySet. + /// Returns the number of unset bits in the TinySet. pub fn num_unset(self) -> u32 { self.0.count_zeros() } @@ -87,11 +87,11 @@ impl TinySet { TinySet(self.0 & other.0) } - /// Creates a new `TinySet` containing only one element + /// Creates a new `TinySet` with only one bit set at `pos`. 
/// within `[0; 64[` #[inline] - pub fn singleton(el: u32) -> TinySet { - TinySet(1u64 << u64::from(el)) + pub fn singleton(pos: u32) -> TinySet { + TinySet(1u64 << u64::from(pos)) } /// Insert a new element within [0..64) @@ -203,7 +203,7 @@ impl BitSet { let mut tinysets = vec![]; for chunk in data.chunks_exact(8) { let tinyset = TinySet::deserialize(chunk.try_into().unwrap())?; - len += tinyset.len() as u64; + len += tinyset.num_set() as u64; tinysets.push(tinyset); } Ok(BitSet { diff --git a/src/query/union.rs b/src/query/union.rs index cf7b4d956..da6da15c0 100644 --- a/src/query/union.rs +++ b/src/query/union.rs @@ -219,14 +219,18 @@ where } let mut count = self.bitsets[self.cursor..HORIZON_NUM_TINYBITSETS] .iter() - .map(|bitset| bitset.len()) + .map(|bitset| bitset.num_set()) .sum::() + 1; for bitset in self.bitsets.iter_mut() { bitset.clear(); } while self.refill() { - count += self.bitsets.iter().map(|bitset| bitset.len()).sum::(); + count += self + .bitsets + .iter() + .map(|bitset| bitset.num_set()) + .sum::(); for bitset in self.bitsets.iter_mut() { bitset.clear(); } From c217bfed1e10155a816fcb15ad5808a3cba9278c Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 23 Sep 2021 21:02:19 +0800 Subject: [PATCH 10/13] cargo fmt --- src/indexer/index_writer.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index ff45df9d3..c37d41b0a 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -227,8 +227,7 @@ fn index_documents( let segment_with_max_doc = segment.with_max_doc(max_doc); - let alive_bitset_opt = - apply_deletes(&segment_with_max_doc, &mut delete_cursor, &doc_opstamps)?; + let alive_bitset_opt = apply_deletes(&segment_with_max_doc, &mut delete_cursor, &doc_opstamps)?; let meta = segment_with_max_doc.meta().clone(); meta.untrack_temp_docstore(); From 5ee5037934ec48bb3a06124bb9a8b0e2a0a1990e Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 24 
Sep 2021 12:53:33 +0800 Subject: [PATCH 11/13] create and use ReadSerializedBitSet --- common/Cargo.toml | 1 + common/src/bitset.rs | 222 ++++++++++++++++++++++------------ src/core/segment_reader.rs | 4 +- src/docset.rs | 2 +- src/fastfield/alive_bitset.rs | 42 ++++--- src/indexer/index_writer.rs | 6 +- src/indexer/merger.rs | 29 +++-- src/query/bitset/mod.rs | 4 +- src/query/union.rs | 8 +- 9 files changed, 195 insertions(+), 123 deletions(-) diff --git a/common/Cargo.toml b/common/Cargo.toml index 94b40a459..1a6703c1e 100644 --- a/common/Cargo.toml +++ b/common/Cargo.toml @@ -10,6 +10,7 @@ description = "common traits and utility functions used by multiple tantivy subc [dependencies] byteorder = "1.4.3" +ownedbytes = { version="0.1", path="../ownedbytes" } [dev-dependencies] proptest = "1.0.0" diff --git a/common/src/bitset.rs b/common/src/bitset.rs index 84c936a61..a05f786ef 100644 --- a/common/src/bitset.rs +++ b/common/src/bitset.rs @@ -1,3 +1,4 @@ +use ownedbytes::OwnedBytes; use std::convert::TryInto; use std::io::Write; use std::u64; @@ -64,34 +65,28 @@ impl TinySet { } #[inline] - /// Returns true iff the `TinySet` bit is set at position `pos`. - pub fn contains(self, pos: u32) -> bool { - !self.intersect(TinySet::singleton(pos)).is_empty() + /// Returns true iff the `TinySet` contains the element `el`. + pub fn contains(self, el: u32) -> bool { + !self.intersect(TinySet::singleton(el)).is_empty() } #[inline] - /// Returns the number of set bits in the TinySet. - pub fn num_set(self) -> u32 { + /// Returns the number of elements in the TinySet. + pub fn len(self) -> u32 { self.0.count_ones() } - #[inline] - /// Returns the number of unset bits in the TinySet. - pub fn num_unset(self) -> u32 { - self.0.count_zeros() - } - #[inline] /// Returns the intersection of `self` and `other` pub fn intersect(self, other: TinySet) -> TinySet { TinySet(self.0 & other.0) } - /// Creates a new `TinySet` with only one bit set at `pos`. 
+ /// Creates a new `TinySet` containing only one element /// within `[0; 64[` #[inline] - pub fn singleton(pos: u32) -> TinySet { - TinySet(1u64 << u64::from(pos)) + pub fn singleton(el: u32) -> TinySet { + TinySet(1u64 << u64::from(el)) } /// Insert a new element within [0..64) @@ -108,7 +103,7 @@ impl TinySet { /// Insert a new element within [0..64) /// - /// returns true if the bit changed + /// returns true if the set changed #[inline] pub fn insert_mut(&mut self, el: u32) -> bool { let old = *self; @@ -116,9 +111,9 @@ impl TinySet { old != *self } - /// Remove a new element within [0..64) + /// Remove a element within [0..64) /// - /// returns true if the bit changed + /// returns true if the set changed #[inline] pub fn remove_mut(&mut self, el: u32) -> bool { let old = *self; @@ -203,7 +198,7 @@ impl BitSet { let mut tinysets = vec![]; for chunk in data.chunks_exact(8) { let tinyset = TinySet::deserialize(chunk.try_into().unwrap())?; - len += tinyset.num_set() as u64; + len += tinyset.len() as u64; tinysets.push(tinyset); } Ok(BitSet { @@ -213,43 +208,6 @@ impl BitSet { }) } - /// Count the number of unset bits from serialized data. - /// - #[inline] - pub fn count_unset_from_bytes<'a>(data: &'a [u8]) -> usize { - BitSet::iter_tinysets_from_bytes(data) - .map(|tinyset| tinyset.num_unset() as usize) - .sum() - } - - /// Iterate the tinyset on the fly from serialized data. - /// - #[inline] - fn iter_tinysets_from_bytes<'a>(data: &'a [u8]) -> impl Iterator + 'a { - assert!((data.len() - 4) % 8 == 0); - data[4..].chunks_exact(8).map(move |chunk| { - let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap(); - tinyset - }) - } - - /// Iterate over the positions of the unset elements. 
- /// - /// max_val needs to be provided, since the last 64 bits may - #[inline] - pub fn iter_unset_from_bytes<'a>(data: &'a [u8]) -> impl Iterator + 'a { - let max_val: u32 = u32::from_le_bytes(data[..4].try_into().unwrap()); - Self::iter_tinysets_from_bytes(data) - .enumerate() - .flat_map(move |(chunk_num, tinyset)| { - let chunk_base_val = chunk_num as u32 * 64; - tinyset - .into_iter() - .map(move |val| val + chunk_base_val) - .take_while(move |doc| *doc < max_val) - }) - } - /// Create a new `BitSet` that may contain elements /// within `[0, max_val)`. pub fn with_max_value(max_value: u32) -> BitSet { @@ -262,9 +220,9 @@ impl BitSet { } } - /// Create a new `BitSet` that may contain elements + /// Create a new `BitSet` that may contain elements. Initially all values will be set. /// within `[0, max_val)`. - pub fn with_max_value_and_filled(max_value: u32) -> BitSet { + pub fn with_max_value_and_full(max_value: u32) -> BitSet { let num_buckets = num_buckets(max_value); let tinybisets = vec![TinySet::full(); num_buckets as usize].into_boxed_slice(); BitSet { @@ -282,11 +240,12 @@ impl BitSet { } /// Returns the number of elements in the `BitSet`. - pub fn num_set_bits(&self) -> usize { + pub fn len(&self) -> usize { self.len as usize } /// Inserts an element in the `BitSet` + #[inline] pub fn insert(&mut self, el: u32) { // we do not check saturated els. let higher = el / 64u32; @@ -299,6 +258,7 @@ impl BitSet { } /// Inserts an element in the `BitSet` + #[inline] pub fn remove(&mut self, el: u32) { // we do not check saturated els. let higher = el / 64u32; @@ -312,14 +272,6 @@ impl BitSet { /// Returns true iff the elements is in the `BitSet`. #[inline] - pub fn contains_from_bytes(el: u32, data: &[u8]) -> bool { - let byte_offset = 4 + el / 8u32; - let b: u8 = data[byte_offset as usize]; - let shift = (el % 8) as u8; - b & (1u8 << shift) != 0 - } - - /// Returns true iff the elements is in the `BitSet`. 
pub fn contains(&self, el: u32) -> bool { self.tinyset(el / 64u32).contains(el % 64) } @@ -349,17 +301,133 @@ impl BitSet { } } +/// Lazy Read a serialized BitSet. +#[derive(Clone)] +pub struct ReadSerializedBitSet { + data: OwnedBytes, + max_value: u32, +} + +impl ReadSerializedBitSet { + pub fn new(data: OwnedBytes) -> Self { + let (max_value_data, data) = data.split(4); + let max_value: u32 = u32::from_le_bytes(max_value_data.as_ref().try_into().unwrap()); + ReadSerializedBitSet { data, max_value } + } + + /// Count the number of unset bits from serialized data. + /// + #[inline] + pub fn count_unset(&self) -> usize { + let lower = self.max_value % 64u32; + + let num_set: usize = self + .iter_tinysets() + .map(|(tinyset, is_last)| { + if is_last { + tinyset.intersect(TinySet::range_lower(lower)).len() as usize + } else { + tinyset.len() as usize + } + }) + .sum(); + self.max_value as usize - num_set + } + + /// Iterate the tinyset on the fly from serialized data. + /// + /// Iterator returns (TinySet, is_last) element, so the consumer can ignore up to max_doc in the + /// last block. + /// + #[inline] + fn iter_tinysets<'a>(&'a self) -> impl Iterator + 'a { + assert!((self.data.len()) % 8 == 0); + self.data + .chunks_exact(8) + .enumerate() + .map(move |(chunk_num, chunk)| { + let is_last = (chunk_num + 1) * 8 == self.data.len(); + + let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap(); + (tinyset, is_last) + }) + } + + /// Iterate over the positions of the unset elements. + /// + #[inline] + pub fn iter_unset<'a>(&'a self) -> impl Iterator + 'a { + self.iter_tinysets() + .enumerate() + .flat_map(move |(chunk_num, (tinyset, _))| { + let chunk_base_val = chunk_num as u32 * 64; + tinyset + .into_iter() + .map(move |val| val + chunk_base_val) + .take_while(move |doc| *doc < self.max_value) + }) + } + + /// Returns true iff the elements is in the `BitSet`. 
+ #[inline] + pub fn contains(&self, el: u32) -> bool { + let byte_offset = el / 8u32; + let b: u8 = self.data[byte_offset as usize]; + let shift = (el % 8) as u8; + b & (1u8 << shift) != 0 + } + + /// Returns the max_value. + #[inline] + pub fn max_value(&self) -> u32 { + self.max_value + } +} + #[cfg(test)] mod tests { use super::BitSet; + use super::ReadSerializedBitSet; use super::TinySet; + use ownedbytes::OwnedBytes; use rand::distributions::Bernoulli; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use std::collections::HashSet; use std::convert::TryInto; + #[test] + fn test_read_serialized_bitset_full() { + let mut bitset = BitSet::with_max_value_and_full(5); + bitset.remove(3); + let mut out = vec![]; + bitset.serialize(&mut out).unwrap(); + + let bitset = ReadSerializedBitSet::new(OwnedBytes::new(out)); + assert_eq!(bitset.count_unset(), 1); + } + + #[test] + fn test_read_serialized_bitset_empty() { + let mut bitset = BitSet::with_max_value(5); + bitset.insert(3); + let mut out = vec![]; + bitset.serialize(&mut out).unwrap(); + + let bitset = ReadSerializedBitSet::new(OwnedBytes::new(out)); + assert_eq!(bitset.count_unset(), 4); + + { + let bitset = BitSet::with_max_value(5); + let mut out = vec![]; + bitset.serialize(&mut out).unwrap(); + + let bitset = ReadSerializedBitSet::new(OwnedBytes::new(out)); + assert_eq!(bitset.count_unset(), 5); + } + } + #[test] fn test_tiny_set_remove() { { @@ -452,7 +520,7 @@ mod tests { assert_eq!(hashset.contains(&el), bitset.contains(el)); } assert_eq!(bitset.max_value(), max_value); - assert_eq!(bitset.num_set_bits(), els.len()); + assert_eq!(bitset.len(), els.len()); }; test_against_hashset(&[], 0); @@ -506,25 +574,25 @@ mod tests { #[test] fn test_bitset_len() { let mut bitset = BitSet::with_max_value(1_000); - assert_eq!(bitset.num_set_bits(), 0); + assert_eq!(bitset.len(), 0); bitset.insert(3u32); - assert_eq!(bitset.num_set_bits(), 1); + assert_eq!(bitset.len(), 1); bitset.insert(103u32); - 
assert_eq!(bitset.num_set_bits(), 2); + assert_eq!(bitset.len(), 2); bitset.insert(3u32); - assert_eq!(bitset.num_set_bits(), 2); + assert_eq!(bitset.len(), 2); bitset.insert(103u32); - assert_eq!(bitset.num_set_bits(), 2); + assert_eq!(bitset.len(), 2); bitset.insert(104u32); - assert_eq!(bitset.num_set_bits(), 3); + assert_eq!(bitset.len(), 3); bitset.remove(105u32); - assert_eq!(bitset.num_set_bits(), 3); + assert_eq!(bitset.len(), 3); bitset.remove(104u32); - assert_eq!(bitset.num_set_bits(), 2); + assert_eq!(bitset.len(), 2); bitset.remove(3u32); - assert_eq!(bitset.num_set_bits(), 1); + assert_eq!(bitset.len(), 1); bitset.remove(103u32); - assert_eq!(bitset.num_set_bits(), 0); + assert_eq!(bitset.len(), 0); } pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec { diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 8ec303b6e..b81155646 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -73,7 +73,7 @@ impl SegmentReader { /// deleted in the segment. 
pub fn num_deleted_docs(&self) -> DocId { self.alive_bitset() - .map(|delete_set| delete_set.num_deleted() as DocId) + .map(|alive_set| alive_set.num_deleted() as DocId) .unwrap_or(0u32) } @@ -289,7 +289,7 @@ impl SegmentReader { /// Returns an iterator that will iterate over the alive document ids pub fn doc_ids_alive(&self) -> Box + '_> { if let Some(alive_bitset) = &self.alive_bitset_opt { - Box::new(alive_bitset.iter_unset()) + Box::new(alive_bitset.iter_alive()) } else { Box::new(0u32..self.max_doc) } diff --git a/src/docset.rs b/src/docset.rs index 0df231e23..e5430b207 100644 --- a/src/docset.rs +++ b/src/docset.rs @@ -89,7 +89,7 @@ pub trait DocSet: Send { let mut count = 0u32; let mut doc = self.doc(); while doc != TERMINATED { - if !alive_bitset.is_deleted(doc) { + if alive_bitset.is_alive(doc) { count += 1u32; } doc = self.advance(); diff --git a/src/fastfield/alive_bitset.rs b/src/fastfield/alive_bitset.rs index 3f8136c23..108eb24eb 100644 --- a/src/fastfield/alive_bitset.rs +++ b/src/fastfield/alive_bitset.rs @@ -3,6 +3,7 @@ use crate::directory::OwnedBytes; use crate::space_usage::ByteCount; use crate::DocId; use common::BitSet; +use common::ReadSerializedBitSet; use std::io; use std::io::Write; @@ -21,16 +22,17 @@ pub fn write_alive_bitset(alive_bitset: &BitSet, writer: &mut T) -> io pub struct AliveBitSet { data: OwnedBytes, num_deleted: usize, + bitset: ReadSerializedBitSet, } impl AliveBitSet { #[cfg(test)] - pub(crate) fn for_test(not_alive_docs: &[DocId], max_doc: u32) -> AliveBitSet { + pub(crate) fn for_test(deleted_docs: &[DocId], max_doc: u32) -> AliveBitSet { use crate::directory::{Directory, RamDirectory, TerminatingWrite}; use std::path::Path; - assert!(not_alive_docs.iter().all(|&doc| doc < max_doc)); - let mut bitset = BitSet::with_max_value_and_filled(max_doc); - for &doc in not_alive_docs { + assert!(deleted_docs.iter().all(|&doc| doc < max_doc)); + let mut bitset = BitSet::with_max_value_and_full(max_doc); + for &doc in deleted_docs { 
bitset.remove(doc); } let directory = RamDirectory::create(); @@ -45,32 +47,38 @@ impl AliveBitSet { /// Opens a delete bitset given its file. pub fn open(file: FileSlice) -> crate::Result { let bytes = file.read_bytes()?; - let num_deleted = BitSet::count_unset_from_bytes(bytes.as_slice()); + let bitset = ReadSerializedBitSet::new(bytes.clone()); + let num_deleted = bitset.count_unset(); Ok(AliveBitSet { data: bytes, num_deleted, + bitset, }) } /// Returns true iff the document is still "alive". In other words, if it has not been deleted. #[inline] pub fn is_alive(&self, doc: DocId) -> bool { - !self.is_deleted(doc) + self.bitset.contains(doc) } /// Returns true iff the document has been marked as deleted. #[inline] pub fn is_deleted(&self, doc: DocId) -> bool { - let data = self.data.as_slice(); - !BitSet::contains_from_bytes(doc, data) + !self.is_alive(doc) } - /// Iterate over the positions of the set elements + /// Iterate over the alive docids. #[inline] - pub fn iter_unset(&self) -> impl Iterator + '_ { - let data = self.data.as_slice(); - BitSet::iter_unset_from_bytes(data) + pub fn iter_alive(&self) -> impl Iterator + '_ { + self.bitset.iter_unset() + } + + /// Get underlying bitset + #[inline] + pub fn bitset(&self) -> &ReadSerializedBitSet { + &self.bitset } /// The number of deleted docs @@ -121,7 +129,7 @@ mod tests { fn test_alive_bitset_iter_minimal() { let alive_bitset = AliveBitSet::for_test(&[7], 8); - let data: Vec<_> = alive_bitset.iter_unset().collect(); + let data: Vec<_> = alive_bitset.iter_alive().collect(); assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]); } @@ -129,14 +137,14 @@ mod tests { fn test_alive_bitset_iter_small() { let alive_bitset = AliveBitSet::for_test(&[0, 2, 3, 6], 7); - let data: Vec<_> = alive_bitset.iter_unset().collect(); + let data: Vec<_> = alive_bitset.iter_alive().collect(); assert_eq!(data, vec![1, 4, 5]); } #[test] fn test_alive_bitset_iter() { let alive_bitset = AliveBitSet::for_test(&[0, 1, 1000], 1001); - let data: 
Vec<_> = alive_bitset.iter_unset().collect(); + let data: Vec<_> = alive_bitset.iter_alive().collect(); assert_eq!(data, (2..=999).collect::>()); } } @@ -166,7 +174,7 @@ mod bench { fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) { let alive_bitset = AliveBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000); - bench.iter(|| alive_bitset.iter_unset().collect::>()); + bench.iter(|| alive_bitset.iter_alive().collect::>()); } #[bench] @@ -184,7 +192,7 @@ mod bench { fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) { let alive_bitset = AliveBitSet::for_test(&get_alive(), 1_000_000); - bench.iter(|| alive_bitset.iter_unset().collect::>()); + bench.iter(|| alive_bitset.iter_alive().collect::>()); } #[bench] diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index c37d41b0a..6250db86a 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -151,7 +151,7 @@ pub(crate) fn advance_deletes( let max_doc = segment_reader.max_doc(); let mut alive_bitset: BitSet = match segment_entry.alive_bitset() { Some(previous_alive_bitset) => (*previous_alive_bitset).clone(), - None => BitSet::with_max_value_and_filled(max_doc), + None => BitSet::with_max_value_and_full(max_doc), }; let num_deleted_docs_before = segment.meta().num_deleted_docs(); @@ -175,7 +175,7 @@ pub(crate) fn advance_deletes( } } - let num_alive_docs: u32 = alive_bitset.num_set_bits() as u32; + let num_alive_docs: u32 = alive_bitset.len() as u32; let num_deleted_docs = max_doc - num_alive_docs; if num_deleted_docs > num_deleted_docs_before { // There are new deletes. We need to write a new delete file. 
@@ -259,7 +259,7 @@ fn apply_deletes( let doc_to_opstamps = DocToOpstampMapping::WithMap(doc_opstamps); let max_doc = segment.meta().max_doc(); - let mut deleted_bitset = BitSet::with_max_value_and_filled(max_doc); + let mut deleted_bitset = BitSet::with_max_value_and_full(max_doc); let may_have_deletes = compute_deleted_bitset( &mut deleted_bitset, &segment_reader, diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 84151c8b1..8932c6790 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -99,22 +99,21 @@ fn compute_min_max_val( segment_reader: &SegmentReader, ) -> Option<(u64, u64)> { if segment_reader.max_doc() == 0 { - None - } else { - if segment_reader.alive_bitset().is_some() { - // some deleted documents, - // we need to recompute the max / min - minmax( - segment_reader - .doc_ids_alive() - .map(|doc_id| u64_reader.get(doc_id)), - ) - } else { - // no deleted documents, - // we can use the previous min_val, max_val. - Some((u64_reader.min_value(), u64_reader.max_value())) - } + return None; } + + if segment_reader.alive_bitset().is_none() { + // no deleted documents, + // we can use the previous min_val, max_val. + return Some((u64_reader.min_value(), u64_reader.max_value())); + } + // some deleted documents, + // we need to recompute the max / min + minmax( + segment_reader + .doc_ids_alive() + .map(|doc_id| u64_reader.get(doc_id)), + ) } struct TermOrdinalMapping { diff --git a/src/query/bitset/mod.rs b/src/query/bitset/mod.rs index ebd6e7b36..030fdeae7 100644 --- a/src/query/bitset/mod.rs +++ b/src/query/bitset/mod.rs @@ -90,7 +90,7 @@ impl DocSet for BitSetDocSet { /// but we don't have access to any better /// value. 
fn size_hint(&self) -> u32 { - self.docs.num_set_bits() as u32 + self.docs.len() as u32 } } @@ -124,7 +124,7 @@ mod tests { for i in 0..100_000 { assert_eq!(btreeset.contains(&i), bitset.contains(i)); } - assert_eq!(btreeset.len(), bitset.num_set_bits()); + assert_eq!(btreeset.len(), bitset.len()); let mut bitset_docset = BitSetDocSet::from(bitset); let mut remaining = true; for el in btreeset.into_iter() { diff --git a/src/query/union.rs b/src/query/union.rs index da6da15c0..cf7b4d956 100644 --- a/src/query/union.rs +++ b/src/query/union.rs @@ -219,18 +219,14 @@ where } let mut count = self.bitsets[self.cursor..HORIZON_NUM_TINYBITSETS] .iter() - .map(|bitset| bitset.num_set()) + .map(|bitset| bitset.len()) .sum::() + 1; for bitset in self.bitsets.iter_mut() { bitset.clear(); } while self.refill() { - count += self - .bitsets - .iter() - .map(|bitset| bitset.num_set()) - .sum::(); + count += self.bitsets.iter().map(|bitset| bitset.len()).sum::(); for bitset in self.bitsets.iter_mut() { bitset.clear(); } From 22bcc83d106ee77b666c9b1a812aee6be4e38a9b Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 24 Sep 2021 14:43:04 +0800 Subject: [PATCH 12/13] fix padding in initialization --- common/src/bitset.rs | 51 ++++++++++++++++------------------- src/fastfield/alive_bitset.rs | 4 +-- 2 files changed, 25 insertions(+), 30 deletions(-) diff --git a/common/src/bitset.rs b/common/src/bitset.rs index a05f786ef..8b024e3d4 100644 --- a/common/src/bitset.rs +++ b/common/src/bitset.rs @@ -60,7 +60,10 @@ impl TinySet { #[inline] /// Returns the complement of the set in `[0, 64[`. - pub fn complement(self) -> TinySet { + /// + /// Careful on making this function public, as it will break the padding handling in the last + /// bucket. + fn complement(self) -> TinySet { TinySet(!self.0) } @@ -224,7 +227,12 @@ impl BitSet { /// within `[0, max_val)`. 
pub fn with_max_value_and_full(max_value: u32) -> BitSet { let num_buckets = num_buckets(max_value); - let tinybisets = vec![TinySet::full(); num_buckets as usize].into_boxed_slice(); + let mut tinybisets = vec![TinySet::full(); num_buckets as usize].into_boxed_slice(); + + // Fix padding + let lower = max_value % 64u32; + tinybisets[tinybisets.len() - 1] = TinySet::range_lower(lower); + BitSet { tinysets: tinybisets, len: max_value as u64, @@ -309,7 +317,7 @@ pub struct ReadSerializedBitSet { } impl ReadSerializedBitSet { - pub fn new(data: OwnedBytes) -> Self { + pub fn open(data: OwnedBytes) -> Self { let (max_value_data, data) = data.split(4); let max_value: u32 = u32::from_le_bytes(max_value_data.as_ref().try_into().unwrap()); ReadSerializedBitSet { data, max_value } @@ -319,17 +327,9 @@ impl ReadSerializedBitSet { /// #[inline] pub fn count_unset(&self) -> usize { - let lower = self.max_value % 64u32; - let num_set: usize = self .iter_tinysets() - .map(|(tinyset, is_last)| { - if is_last { - tinyset.intersect(TinySet::range_lower(lower)).len() as usize - } else { - tinyset.len() as usize - } - }) + .map(|tinyset| tinyset.len() as usize) .sum(); self.max_value as usize - num_set } @@ -340,26 +340,21 @@ impl ReadSerializedBitSet { /// last block. /// #[inline] - fn iter_tinysets<'a>(&'a self) -> impl Iterator + 'a { + fn iter_tinysets<'a>(&'a self) -> impl Iterator + 'a { assert!((self.data.len()) % 8 == 0); - self.data - .chunks_exact(8) - .enumerate() - .map(move |(chunk_num, chunk)| { - let is_last = (chunk_num + 1) * 8 == self.data.len(); - - let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap(); - (tinyset, is_last) - }) + self.data.chunks_exact(8).map(move |chunk| { + let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap(); + tinyset + }) } - /// Iterate over the positions of the unset elements. + /// Iterate over the positions of the elements. 
/// #[inline] - pub fn iter_unset<'a>(&'a self) -> impl Iterator + 'a { + pub fn iter<'a>(&'a self) -> impl Iterator + 'a { self.iter_tinysets() .enumerate() - .flat_map(move |(chunk_num, (tinyset, _))| { + .flat_map(move |(chunk_num, tinyset)| { let chunk_base_val = chunk_num as u32 * 64; tinyset .into_iter() @@ -404,7 +399,7 @@ mod tests { let mut out = vec![]; bitset.serialize(&mut out).unwrap(); - let bitset = ReadSerializedBitSet::new(OwnedBytes::new(out)); + let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out)); assert_eq!(bitset.count_unset(), 1); } @@ -415,7 +410,7 @@ mod tests { let mut out = vec![]; bitset.serialize(&mut out).unwrap(); - let bitset = ReadSerializedBitSet::new(OwnedBytes::new(out)); + let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out)); assert_eq!(bitset.count_unset(), 4); { @@ -423,7 +418,7 @@ mod tests { let mut out = vec![]; bitset.serialize(&mut out).unwrap(); - let bitset = ReadSerializedBitSet::new(OwnedBytes::new(out)); + let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out)); assert_eq!(bitset.count_unset(), 5); } } diff --git a/src/fastfield/alive_bitset.rs b/src/fastfield/alive_bitset.rs index 108eb24eb..0ab4513a2 100644 --- a/src/fastfield/alive_bitset.rs +++ b/src/fastfield/alive_bitset.rs @@ -47,7 +47,7 @@ impl AliveBitSet { /// Opens a delete bitset given its file. pub fn open(file: FileSlice) -> crate::Result { let bytes = file.read_bytes()?; - let bitset = ReadSerializedBitSet::new(bytes.clone()); + let bitset = ReadSerializedBitSet::open(bytes.clone()); let num_deleted = bitset.count_unset(); Ok(AliveBitSet { @@ -72,7 +72,7 @@ impl AliveBitSet { /// Iterate over the alive docids. 
#[inline] pub fn iter_alive(&self) -> impl Iterator + '_ { - self.bitset.iter_unset() + self.bitset.iter() } /// Get underlying bitset From efc0d8341bc97b2a300b65a21502ec91145d5965 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 24 Sep 2021 15:09:21 +0800 Subject: [PATCH 13/13] fix comment --- common/src/bitset.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/common/src/bitset.rs b/common/src/bitset.rs index 8b024e3d4..6ff97e4a3 100644 --- a/common/src/bitset.rs +++ b/common/src/bitset.rs @@ -336,9 +336,6 @@ impl ReadSerializedBitSet { /// Iterate the tinyset on the fly from serialized data. /// - /// Iterator returns (TinySet, is_last) element, so the consumer can ignore up to max_doc in the - /// last block. - /// #[inline] fn iter_tinysets<'a>(&'a self) -> impl Iterator + 'a { assert!((self.data.len()) % 8 == 0);