mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-10 19:12:54 +00:00
Leaning more on the alive (vs delete) semantics. (#1164)
This commit is contained in:
@@ -309,7 +309,7 @@ impl BitSet {
|
||||
}
|
||||
}
|
||||
|
||||
/// Lazy Read a serialized BitSet.
|
||||
/// Serialized BitSet.
|
||||
#[derive(Clone)]
|
||||
pub struct ReadSerializedBitSet {
|
||||
data: OwnedBytes,
|
||||
@@ -323,15 +323,12 @@ impl ReadSerializedBitSet {
|
||||
ReadSerializedBitSet { data, max_value }
|
||||
}
|
||||
|
||||
/// Count the number of unset bits from serialized data.
|
||||
///
|
||||
/// Number of elements in the bitset.
|
||||
#[inline]
|
||||
pub fn count_unset(&self) -> usize {
|
||||
let num_set: usize = self
|
||||
.iter_tinysets()
|
||||
pub fn len(&self) -> usize {
|
||||
self.iter_tinysets()
|
||||
.map(|tinyset| tinyset.len() as usize)
|
||||
.sum();
|
||||
self.max_value as usize - num_set
|
||||
.sum()
|
||||
}
|
||||
|
||||
/// Iterate the tinyset on the fly from serialized data.
|
||||
@@ -369,7 +366,11 @@ impl ReadSerializedBitSet {
|
||||
b & (1u8 << shift) != 0
|
||||
}
|
||||
|
||||
/// Returns the max_value.
|
||||
/// Maximum value the bitset may contain.
|
||||
/// (Note this is not the maximum value contained in the set.)
|
||||
///
|
||||
/// A bitset has an intrinsic capacity.
|
||||
/// It only stores elements within [0..max_value).
|
||||
#[inline]
|
||||
pub fn max_value(&self) -> u32 {
|
||||
self.max_value
|
||||
@@ -397,7 +398,7 @@ mod tests {
|
||||
bitset.serialize(&mut out).unwrap();
|
||||
|
||||
let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out));
|
||||
assert_eq!(bitset.count_unset(), 1);
|
||||
assert_eq!(bitset.len(), 4);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -408,15 +409,14 @@ mod tests {
|
||||
bitset.serialize(&mut out).unwrap();
|
||||
|
||||
let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out));
|
||||
assert_eq!(bitset.count_unset(), 4);
|
||||
assert_eq!(bitset.len(), 1);
|
||||
|
||||
{
|
||||
let bitset = BitSet::with_max_value(5);
|
||||
let mut out = vec![];
|
||||
bitset.serialize(&mut out).unwrap();
|
||||
|
||||
let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out));
|
||||
assert_eq!(bitset.count_unset(), 5);
|
||||
assert_eq!(bitset.len(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -72,14 +72,12 @@ impl SegmentReader {
|
||||
/// Return the number of documents that have been
|
||||
/// deleted in the segment.
|
||||
pub fn num_deleted_docs(&self) -> DocId {
|
||||
self.alive_bitset()
|
||||
.map(|alive_set| alive_set.num_deleted() as DocId)
|
||||
.unwrap_or(0u32)
|
||||
self.max_doc - self.num_docs
|
||||
}
|
||||
|
||||
/// Returns true iff some of the documents of the segment have been deleted.
|
||||
pub fn has_deletes(&self) -> bool {
|
||||
self.alive_bitset().is_some()
|
||||
self.num_deleted_docs() > 0
|
||||
}
|
||||
|
||||
/// Accessor to a segment's fast field reader given a field.
|
||||
@@ -171,8 +169,8 @@ impl SegmentReader {
|
||||
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
|
||||
|
||||
let alive_bitset_opt = if segment.meta().has_deletes() {
|
||||
let delete_data = segment.open_read(SegmentComponent::Delete)?;
|
||||
let alive_bitset = AliveBitSet::open(delete_data)?;
|
||||
let alive_bitset_bytes = segment.open_read(SegmentComponent::Delete)?.read_bytes()?;
|
||||
let alive_bitset = AliveBitSet::open(alive_bitset_bytes);
|
||||
Some(alive_bitset)
|
||||
} else {
|
||||
None
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
use crate::directory::FileSlice;
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::space_usage::ByteCount;
|
||||
use crate::DocId;
|
||||
use common::BitSet;
|
||||
use common::ReadSerializedBitSet;
|
||||
use ownedbytes::OwnedBytes;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
@@ -20,41 +19,34 @@ pub fn write_alive_bitset<T: Write>(alive_bitset: &BitSet, writer: &mut T) -> io
|
||||
/// Set of alive `DocId`s.
|
||||
#[derive(Clone)]
|
||||
pub struct AliveBitSet {
|
||||
data: OwnedBytes,
|
||||
num_deleted: usize,
|
||||
num_alive_docs: usize,
|
||||
bitset: ReadSerializedBitSet,
|
||||
num_bytes: ByteCount,
|
||||
}
|
||||
|
||||
impl AliveBitSet {
|
||||
#[cfg(test)]
|
||||
pub(crate) fn for_test(deleted_docs: &[DocId], max_doc: u32) -> AliveBitSet {
|
||||
use crate::directory::{Directory, RamDirectory, TerminatingWrite};
|
||||
use std::path::Path;
|
||||
pub(crate) fn for_test_from_deleted_docs(deleted_docs: &[DocId], max_doc: u32) -> AliveBitSet {
|
||||
assert!(deleted_docs.iter().all(|&doc| doc < max_doc));
|
||||
let mut bitset = BitSet::with_max_value_and_full(max_doc);
|
||||
for &doc in deleted_docs {
|
||||
bitset.remove(doc);
|
||||
}
|
||||
let directory = RamDirectory::create();
|
||||
let path = Path::new("dummydeletebitset");
|
||||
let mut wrt = directory.open_write(path).unwrap();
|
||||
write_alive_bitset(&bitset, &mut wrt).unwrap();
|
||||
wrt.terminate().unwrap();
|
||||
let file = directory.open_read(path).unwrap();
|
||||
Self::open(file).unwrap()
|
||||
let mut alive_bitset_buffer = Vec::new();
|
||||
write_alive_bitset(&bitset, &mut alive_bitset_buffer).unwrap();
|
||||
let alive_bitset_bytes = OwnedBytes::new(alive_bitset_buffer);
|
||||
Self::open(alive_bitset_bytes)
|
||||
}
|
||||
|
||||
/// Opens a delete bitset given its file.
|
||||
pub fn open(file: FileSlice) -> crate::Result<AliveBitSet> {
|
||||
let bytes = file.read_bytes()?;
|
||||
let bitset = ReadSerializedBitSet::open(bytes.clone());
|
||||
let num_deleted = bitset.count_unset();
|
||||
|
||||
Ok(AliveBitSet {
|
||||
data: bytes,
|
||||
num_deleted,
|
||||
pub fn open(bytes: OwnedBytes) -> AliveBitSet {
|
||||
let num_bytes = bytes.len();
|
||||
let bitset = ReadSerializedBitSet::open(bytes);
|
||||
AliveBitSet {
|
||||
num_alive_docs: bitset.len(),
|
||||
bitset,
|
||||
})
|
||||
num_bytes,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true iff the document is still "alive". In other words, if it has not been deleted.
|
||||
@@ -82,12 +74,13 @@ impl AliveBitSet {
|
||||
}
|
||||
|
||||
/// The number of deleted docs
|
||||
pub fn num_deleted(&self) -> usize {
|
||||
self.num_deleted
|
||||
pub fn num_alive_docs(&self) -> usize {
|
||||
self.num_alive_docs
|
||||
}
|
||||
|
||||
/// Summarize total space usage of this bitset.
|
||||
pub fn space_usage(&self) -> ByteCount {
|
||||
self.data.len()
|
||||
self.num_bytes
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,16 +91,17 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_alive_bitset_empty() {
|
||||
let alive_bitset = AliveBitSet::for_test(&[], 10);
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[], 10);
|
||||
for doc in 0..10 {
|
||||
assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc));
|
||||
assert!(!alive_bitset.is_deleted(doc));
|
||||
}
|
||||
assert_eq!(alive_bitset.num_deleted(), 0);
|
||||
assert_eq!(alive_bitset.num_alive_docs(), 10);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_alive_bitset() {
|
||||
let alive_bitset = AliveBitSet::for_test(&[1, 9], 10);
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[1, 9], 10);
|
||||
assert!(alive_bitset.is_alive(0));
|
||||
assert!(alive_bitset.is_deleted(1));
|
||||
assert!(alive_bitset.is_alive(2));
|
||||
@@ -122,12 +116,12 @@ mod tests {
|
||||
for doc in 0..10 {
|
||||
assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc));
|
||||
}
|
||||
assert_eq!(alive_bitset.num_deleted(), 2);
|
||||
assert_eq!(alive_bitset.num_alive_docs(), 8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_alive_bitset_iter_minimal() {
|
||||
let alive_bitset = AliveBitSet::for_test(&[7], 8);
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[7], 8);
|
||||
|
||||
let data: Vec<_> = alive_bitset.iter_alive().collect();
|
||||
assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]);
|
||||
@@ -135,14 +129,14 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_alive_bitset_iter_small() {
|
||||
let alive_bitset = AliveBitSet::for_test(&[0, 2, 3, 6], 7);
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 2, 3, 6], 7);
|
||||
|
||||
let data: Vec<_> = alive_bitset.iter_alive().collect();
|
||||
assert_eq!(data, vec![1, 4, 5]);
|
||||
}
|
||||
#[test]
|
||||
fn test_alive_bitset_iter() {
|
||||
let alive_bitset = AliveBitSet::for_test(&[0, 1, 1000], 1001);
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000], 1001);
|
||||
|
||||
let data: Vec<_> = alive_bitset.iter_alive().collect();
|
||||
assert_eq!(data, (2..=999).collect::<Vec<_>>());
|
||||
@@ -172,14 +166,14 @@ mod bench {
|
||||
|
||||
#[bench]
|
||||
fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) {
|
||||
let alive_bitset = AliveBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000);
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
|
||||
|
||||
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_deletebitset_access(bench: &mut Bencher) {
|
||||
let alive_bitset = AliveBitSet::for_test(&[0, 1, 1000, 10000], 1_000_000);
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
|
||||
|
||||
bench.iter(|| {
|
||||
(0..1_000_000_u32)
|
||||
@@ -190,14 +184,14 @@ mod bench {
|
||||
|
||||
#[bench]
|
||||
fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
|
||||
let alive_bitset = AliveBitSet::for_test(&get_alive(), 1_000_000);
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
|
||||
|
||||
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_deletebitset_access_1_8_alive(bench: &mut Bencher) {
|
||||
let alive_bitset = AliveBitSet::for_test(&get_alive(), 1_000_000);
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
|
||||
|
||||
bench.iter(|| {
|
||||
(0..1_000_000_u32)
|
||||
|
||||
@@ -854,7 +854,7 @@ mod tests {
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 0);
|
||||
assert_eq!(searcher.segment_reader(0u32).num_docs(), 2);
|
||||
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "hello1"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
@@ -862,7 +862,7 @@ mod tests {
|
||||
assert!(reader.reload().is_ok());
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 1);
|
||||
assert_eq!(searcher.segment_reader(0u32).num_docs(), 1);
|
||||
|
||||
let previous_delete_opstamp = index.load_metas().unwrap().segments[0].delete_opstamp();
|
||||
|
||||
@@ -874,7 +874,7 @@ mod tests {
|
||||
assert!(reader.reload().is_ok());
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 1);
|
||||
assert_eq!(searcher.segment_reader(0u32).num_docs(), 1);
|
||||
|
||||
let after_delete_opstamp = index.load_metas().unwrap().segments[0].delete_opstamp();
|
||||
assert_eq!(after_delete_opstamp, previous_delete_opstamp);
|
||||
@@ -1644,7 +1644,7 @@ mod tests {
|
||||
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
assert_eq!(segment_reader.max_doc(), 2);
|
||||
assert_eq!(segment_reader.num_deleted_docs(), 1);
|
||||
assert_eq!(segment_reader.num_docs(), 1);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -479,7 +479,7 @@ impl IndexMerger {
|
||||
let mut num_docs = 0;
|
||||
for (reader, u64s_reader) in reader_and_field_accessors.iter() {
|
||||
if let Some(alive_bitset) = reader.alive_bitset() {
|
||||
num_docs += reader.max_doc() as u64 - alive_bitset.num_deleted() as u64;
|
||||
num_docs += alive_bitset.num_alive_docs() as u64;
|
||||
for doc in reader.doc_ids_alive() {
|
||||
let num_vals = u64s_reader.get_len(doc) as u64;
|
||||
total_num_vals += num_vals;
|
||||
@@ -1012,7 +1012,7 @@ impl IndexMerger {
|
||||
} else {
|
||||
for reader in &self.readers {
|
||||
let store_reader = reader.get_store_reader()?;
|
||||
if reader.num_deleted_docs() > 0
|
||||
if reader.has_deletes()
|
||||
// If there is not enough data in the store, we avoid stacking in order to
|
||||
// avoid creating many small blocks in the doc store. Once we have 5 full blocks,
|
||||
// we start stacking. In the worst case 2/7 of the blocks would be very small.
|
||||
|
||||
@@ -257,7 +257,7 @@ mod tests {
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(postings.doc_freq(), 2);
|
||||
let fallback_bitset = AliveBitSet::for_test(&[0], 100);
|
||||
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
|
||||
assert_eq!(
|
||||
postings.doc_freq_given_deletes(
|
||||
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
|
||||
@@ -336,7 +336,7 @@ mod tests {
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert_eq!(postings.doc_freq(), 2);
|
||||
let fallback_bitset = AliveBitSet::for_test(&[0], 100);
|
||||
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
|
||||
assert_eq!(
|
||||
postings.doc_freq_given_deletes(
|
||||
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
|
||||
@@ -446,7 +446,7 @@ mod tests {
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(postings.doc_freq(), 2);
|
||||
let fallback_bitset = AliveBitSet::for_test(&[0], 100);
|
||||
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
|
||||
assert_eq!(
|
||||
postings.doc_freq_given_deletes(
|
||||
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
|
||||
|
||||
@@ -296,9 +296,10 @@ mod tests {
|
||||
fn test_doc_freq() {
|
||||
let docs = SegmentPostings::create_from_docs(&[0, 2, 10]);
|
||||
assert_eq!(docs.doc_freq(), 3);
|
||||
let alive_bitset = AliveBitSet::for_test(&[2], 12);
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[2], 12);
|
||||
assert_eq!(docs.doc_freq_given_deletes(&alive_bitset), 2);
|
||||
let all_deleted = AliveBitSet::for_test(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12);
|
||||
let all_deleted =
|
||||
AliveBitSet::for_test_from_deleted_docs(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12);
|
||||
assert_eq!(docs.doc_freq_given_deletes(&all_deleted), 0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -113,7 +113,8 @@ pub mod tests {
|
||||
fn test_doc_store_iter_with_delete_bug_1077() -> crate::Result<()> {
|
||||
// this will cover deletion of the first element in a checkpoint
|
||||
let deleted_docids = (200..300).collect::<Vec<_>>();
|
||||
let alive_bitset = AliveBitSet::for_test(&deleted_docids, NUM_DOCS as u32);
|
||||
let alive_bitset =
|
||||
AliveBitSet::for_test_from_deleted_docs(&deleted_docids, NUM_DOCS as u32);
|
||||
|
||||
let path = Path::new("store");
|
||||
let directory = RamDirectory::create();
|
||||
|
||||
Reference in New Issue
Block a user