Leaning more on the alive (vs delete) semantics. (#1164)

This commit is contained in:
Paul Masurel
2021-10-05 18:53:29 +09:00
committed by GitHub
parent d828e58903
commit 0855649986
8 changed files with 62 additions and 68 deletions

View File

@@ -309,7 +309,7 @@ impl BitSet {
}
}
/// Lazy Read a serialized BitSet.
/// Serialized BitSet.
#[derive(Clone)]
pub struct ReadSerializedBitSet {
data: OwnedBytes,
@@ -323,15 +323,12 @@ impl ReadSerializedBitSet {
ReadSerializedBitSet { data, max_value }
}
/// Count the number of unset bits from serialized data.
///
/// Number of elements in the bitset.
#[inline]
pub fn count_unset(&self) -> usize {
let num_set: usize = self
.iter_tinysets()
pub fn len(&self) -> usize {
self.iter_tinysets()
.map(|tinyset| tinyset.len() as usize)
.sum();
self.max_value as usize - num_set
.sum()
}
/// Iterate the tinyset on the fly from serialized data.
@@ -369,7 +366,11 @@ impl ReadSerializedBitSet {
b & (1u8 << shift) != 0
}
/// Returns the max_value.
/// Maximum value the bitset may contain.
/// (Note this is not the maximum value contained in the set.)
///
/// A bitset has an intrinsic capacity.
/// It only stores elements within [0..max_value).
#[inline]
pub fn max_value(&self) -> u32 {
self.max_value
@@ -397,7 +398,7 @@ mod tests {
bitset.serialize(&mut out).unwrap();
let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.count_unset(), 1);
assert_eq!(bitset.len(), 4);
}
#[test]
@@ -408,15 +409,14 @@ mod tests {
bitset.serialize(&mut out).unwrap();
let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.count_unset(), 4);
assert_eq!(bitset.len(), 1);
{
let bitset = BitSet::with_max_value(5);
let mut out = vec![];
bitset.serialize(&mut out).unwrap();
let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.count_unset(), 5);
assert_eq!(bitset.len(), 0);
}
}

View File

@@ -72,14 +72,12 @@ impl SegmentReader {
/// Return the number of documents that have been
/// deleted in the segment.
pub fn num_deleted_docs(&self) -> DocId {
self.alive_bitset()
.map(|alive_set| alive_set.num_deleted() as DocId)
.unwrap_or(0u32)
self.max_doc - self.num_docs
}
/// Returns true iff some of the documents of the segment have been deleted.
pub fn has_deletes(&self) -> bool {
self.alive_bitset().is_some()
self.num_deleted_docs() > 0
}
/// Accessor to a segment's fast field reader given a field.
@@ -171,8 +169,8 @@ impl SegmentReader {
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
let alive_bitset_opt = if segment.meta().has_deletes() {
let delete_data = segment.open_read(SegmentComponent::Delete)?;
let alive_bitset = AliveBitSet::open(delete_data)?;
let alive_bitset_bytes = segment.open_read(SegmentComponent::Delete)?.read_bytes()?;
let alive_bitset = AliveBitSet::open(alive_bitset_bytes);
Some(alive_bitset)
} else {
None

View File

@@ -1,9 +1,8 @@
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::space_usage::ByteCount;
use crate::DocId;
use common::BitSet;
use common::ReadSerializedBitSet;
use ownedbytes::OwnedBytes;
use std::io;
use std::io::Write;
@@ -20,41 +19,34 @@ pub fn write_alive_bitset<T: Write>(alive_bitset: &BitSet, writer: &mut T) -> io
/// Set of alive `DocId`s.
#[derive(Clone)]
pub struct AliveBitSet {
data: OwnedBytes,
num_deleted: usize,
num_alive_docs: usize,
bitset: ReadSerializedBitSet,
num_bytes: ByteCount,
}
impl AliveBitSet {
#[cfg(test)]
pub(crate) fn for_test(deleted_docs: &[DocId], max_doc: u32) -> AliveBitSet {
use crate::directory::{Directory, RamDirectory, TerminatingWrite};
use std::path::Path;
pub(crate) fn for_test_from_deleted_docs(deleted_docs: &[DocId], max_doc: u32) -> AliveBitSet {
assert!(deleted_docs.iter().all(|&doc| doc < max_doc));
let mut bitset = BitSet::with_max_value_and_full(max_doc);
for &doc in deleted_docs {
bitset.remove(doc);
}
let directory = RamDirectory::create();
let path = Path::new("dummydeletebitset");
let mut wrt = directory.open_write(path).unwrap();
write_alive_bitset(&bitset, &mut wrt).unwrap();
wrt.terminate().unwrap();
let file = directory.open_read(path).unwrap();
Self::open(file).unwrap()
let mut alive_bitset_buffer = Vec::new();
write_alive_bitset(&bitset, &mut alive_bitset_buffer).unwrap();
let alive_bitset_bytes = OwnedBytes::new(alive_bitset_buffer);
Self::open(alive_bitset_bytes)
}
/// Opens an alive bitset from its serialized bytes.
pub fn open(file: FileSlice) -> crate::Result<AliveBitSet> {
let bytes = file.read_bytes()?;
let bitset = ReadSerializedBitSet::open(bytes.clone());
let num_deleted = bitset.count_unset();
Ok(AliveBitSet {
data: bytes,
num_deleted,
pub fn open(bytes: OwnedBytes) -> AliveBitSet {
let num_bytes = bytes.len();
let bitset = ReadSerializedBitSet::open(bytes);
AliveBitSet {
num_alive_docs: bitset.len(),
bitset,
})
num_bytes,
}
}
/// Returns true iff the document is still "alive". In other words, if it has not been deleted.
@@ -82,12 +74,13 @@ impl AliveBitSet {
}
/// The number of alive (i.e. not deleted) docs
pub fn num_deleted(&self) -> usize {
self.num_deleted
pub fn num_alive_docs(&self) -> usize {
self.num_alive_docs
}
/// Summarize total space usage of this bitset.
pub fn space_usage(&self) -> ByteCount {
self.data.len()
self.num_bytes
}
}
@@ -98,16 +91,17 @@ mod tests {
#[test]
fn test_alive_bitset_empty() {
let alive_bitset = AliveBitSet::for_test(&[], 10);
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[], 10);
for doc in 0..10 {
assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc));
assert!(!alive_bitset.is_deleted(doc));
}
assert_eq!(alive_bitset.num_deleted(), 0);
assert_eq!(alive_bitset.num_alive_docs(), 10);
}
#[test]
fn test_alive_bitset() {
let alive_bitset = AliveBitSet::for_test(&[1, 9], 10);
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[1, 9], 10);
assert!(alive_bitset.is_alive(0));
assert!(alive_bitset.is_deleted(1));
assert!(alive_bitset.is_alive(2));
@@ -122,12 +116,12 @@ mod tests {
for doc in 0..10 {
assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc));
}
assert_eq!(alive_bitset.num_deleted(), 2);
assert_eq!(alive_bitset.num_alive_docs(), 8);
}
#[test]
fn test_alive_bitset_iter_minimal() {
let alive_bitset = AliveBitSet::for_test(&[7], 8);
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[7], 8);
let data: Vec<_> = alive_bitset.iter_alive().collect();
assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]);
@@ -135,14 +129,14 @@ mod tests {
#[test]
fn test_alive_bitset_iter_small() {
let alive_bitset = AliveBitSet::for_test(&[0, 2, 3, 6], 7);
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 2, 3, 6], 7);
let data: Vec<_> = alive_bitset.iter_alive().collect();
assert_eq!(data, vec![1, 4, 5]);
}
#[test]
fn test_alive_bitset_iter() {
let alive_bitset = AliveBitSet::for_test(&[0, 1, 1000], 1001);
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000], 1001);
let data: Vec<_> = alive_bitset.iter_alive().collect();
assert_eq!(data, (2..=999).collect::<Vec<_>>());
@@ -172,14 +166,14 @@ mod bench {
#[bench]
fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000);
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
}
#[bench]
fn bench_deletebitset_access(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000);
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
bench.iter(|| {
(0..1_000_000_u32)
@@ -190,14 +184,14 @@ mod bench {
#[bench]
fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test(&get_alive(), 1_000_000);
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
}
#[bench]
fn bench_deletebitset_access_1_8_alive(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test(&get_alive(), 1_000_000);
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
bench.iter(|| {
(0..1_000_000_u32)

View File

@@ -854,7 +854,7 @@ mod tests {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 0);
assert_eq!(searcher.segment_reader(0u32).num_docs(), 2);
index_writer.delete_term(Term::from_field_text(text_field, "hello1"));
assert!(index_writer.commit().is_ok());
@@ -862,7 +862,7 @@ mod tests {
assert!(reader.reload().is_ok());
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 1);
assert_eq!(searcher.segment_reader(0u32).num_docs(), 1);
let previous_delete_opstamp = index.load_metas().unwrap().segments[0].delete_opstamp();
@@ -874,7 +874,7 @@ mod tests {
assert!(reader.reload().is_ok());
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 1);
assert_eq!(searcher.segment_reader(0u32).num_docs(), 1);
let after_delete_opstamp = index.load_metas().unwrap().segments[0].delete_opstamp();
assert_eq!(after_delete_opstamp, previous_delete_opstamp);
@@ -1644,7 +1644,7 @@ mod tests {
let segment_reader = searcher.segment_reader(0);
assert_eq!(segment_reader.max_doc(), 2);
assert_eq!(segment_reader.num_deleted_docs(), 1);
assert_eq!(segment_reader.num_docs(), 1);
Ok(())
}

View File

@@ -479,7 +479,7 @@ impl IndexMerger {
let mut num_docs = 0;
for (reader, u64s_reader) in reader_and_field_accessors.iter() {
if let Some(alive_bitset) = reader.alive_bitset() {
num_docs += reader.max_doc() as u64 - alive_bitset.num_deleted() as u64;
num_docs += alive_bitset.num_alive_docs() as u64;
for doc in reader.doc_ids_alive() {
let num_vals = u64s_reader.get_len(doc) as u64;
total_num_vals += num_vals;
@@ -1012,7 +1012,7 @@ impl IndexMerger {
} else {
for reader in &self.readers {
let store_reader = reader.get_store_reader()?;
if reader.num_deleted_docs() > 0
if reader.has_deletes()
// If there is not enough data in the store, we avoid stacking in order to
// avoid creating many small blocks in the doc store. Once we have 5 full blocks,
// we start stacking. In the worst case 2/7 of the blocks would be very small.

View File

@@ -257,7 +257,7 @@ mod tests {
.unwrap();
assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = AliveBitSet::for_test(&[0], 100);
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
@@ -336,7 +336,7 @@ mod tests {
.unwrap()
.unwrap();
assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = AliveBitSet::for_test(&[0], 100);
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
@@ -446,7 +446,7 @@ mod tests {
.unwrap();
assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = AliveBitSet::for_test(&[0], 100);
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)

View File

@@ -296,9 +296,10 @@ mod tests {
fn test_doc_freq() {
let docs = SegmentPostings::create_from_docs(&[0, 2, 10]);
assert_eq!(docs.doc_freq(), 3);
let alive_bitset = AliveBitSet::for_test(&[2], 12);
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[2], 12);
assert_eq!(docs.doc_freq_given_deletes(&alive_bitset), 2);
let all_deleted = AliveBitSet::for_test(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12);
let all_deleted =
AliveBitSet::for_test_from_deleted_docs(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12);
assert_eq!(docs.doc_freq_given_deletes(&all_deleted), 0);
}
}

View File

@@ -113,7 +113,8 @@ pub mod tests {
fn test_doc_store_iter_with_delete_bug_1077() -> crate::Result<()> {
// this will cover deletion of the first element in a checkpoint
let deleted_docids = (200..300).collect::<Vec<_>>();
let alive_bitset = AliveBitSet::for_test(&deleted_docids, NUM_DOCS as u32);
let alive_bitset =
AliveBitSet::for_test_from_deleted_docs(&deleted_docids, NUM_DOCS as u32);
let path = Path::new("store");
let directory = RamDirectory::create();