From 5ef2d56ec23ea2557c1a514d4030f67b5cc09031 Mon Sep 17 00:00:00 2001
From: PSeitz
Date: Mon, 24 May 2021 08:38:49 +0200
Subject: [PATCH] Avoid docstore stacking for small segments, fixes #1053 (#1055)

---
 src/indexer/merger.rs | 17 +++++++++++++++-
 src/store/mod.rs      | 45 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index b1ef2033c..93b11120b 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -1004,7 +1004,22 @@ impl IndexMerger {
         } else {
             for reader in &self.readers {
                 let store_reader = reader.get_store_reader()?;
-                if reader.num_deleted_docs() > 0 {
+                if reader.num_deleted_docs() > 0
+                    // If there is not enough data in the store, we avoid stacking in order to
+                    // avoid creating many small blocks in the doc store. Once we have 5 full blocks,
+                    // we start stacking. In the worst case 2/7 of the blocks would be very small.
+                    // [segment 1 - {1 doc}][segment 2 - {fullblock * 5}{1 doc}]
+                    // => 5 * full blocks, 2 * 1 document blocks
+                    //
+                    // In a more realistic scenario the segments are of the same size, so 1/6 of
+                    // the doc stores would be on average half full, given total randomness (which
+                    // is not the case here, but not sure how it behaves exactly).
+                    //
+                    // https://github.com/tantivy-search/tantivy/issues/1053
+                    //
+                    // `take(7)` bounds the scan so that we never walk over all checkpoints.
+                    || store_reader.block_checkpoints().take(7).count() < 6
+                {
                     for doc_bytes_res in store_reader.iter_raw(reader.delete_bitset()) {
                         let doc_bytes = doc_bytes_res?;
                         store_writer.store_bytes(&doc_bytes)?;
diff --git a/src/store/mod.rs b/src/store/mod.rs
index d9652c0af..2f04dcd46 100644
--- a/src/store/mod.rs
+++ b/src/store/mod.rs
@@ -97,8 +97,10 @@ use self::compression_snap::{compress, decompress};
 #[cfg(test)]
 pub mod tests {
 
+    use futures::executor::block_on;
+
     use super::*;
-    use crate::schema::{self, FieldValue, TextFieldIndexing};
+    use crate::schema::{self, FieldValue, TextFieldIndexing, STORED, TEXT};
     use crate::schema::{Document, TextOptions};
     use crate::{
         directory::{Directory, RamDirectory, WritePtr},
@@ -214,6 +216,47 @@ pub mod tests {
         }
         Ok(())
     }
+    #[test]
+    fn test_merge_of_small_segments() -> crate::Result<()> {
+        let mut schema_builder = schema::Schema::builder();
+
+        let text_field = schema_builder.add_text_field("text_field", TEXT | STORED);
+        let schema = schema_builder.build();
+        let index_builder = Index::builder().schema(schema);
+
+        let index = index_builder.create_in_ram().unwrap();
+
+        {
+            let mut index_writer = index.writer_for_tests().unwrap();
+
+            index_writer.add_document(doc!(text_field=> "1"));
+            assert!(index_writer.commit().is_ok());
+            index_writer.add_document(doc!(text_field=> "2"));
+            assert!(index_writer.commit().is_ok());
+            index_writer.add_document(doc!(text_field=> "3"));
+            assert!(index_writer.commit().is_ok());
+            index_writer.add_document(doc!(text_field=> "4"));
+            assert!(index_writer.commit().is_ok());
+            index_writer.add_document(doc!(text_field=> "5"));
+            assert!(index_writer.commit().is_ok());
+        }
+        // Merge every searchable segment and wait for the merging threads to finish.
+        {
+            let segment_ids = index
+                .searchable_segment_ids()
+                .expect("Searchable segments failed.");
+            let mut index_writer = index.writer_for_tests().unwrap();
+            assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
+            assert!(index_writer.wait_merging_threads().is_ok());
+        }
+
+        let searcher = index.reader().unwrap().searcher();
+        assert_eq!(searcher.segment_readers().len(), 1);
+        let reader = searcher.segment_readers().iter().last().unwrap();
+        let store = reader.get_store_reader().unwrap();
+        assert_eq!(store.block_checkpoints().count(), 1);
+        Ok(())
+    }
 }
 
 #[cfg(all(test, feature = "unstable"))]