Avoid docstore stacking for small segments, fixes #1053 (#1055)

Author: PSeitz
Date: 2021-05-24 08:38:49 +02:00
Committed by: GitHub
Parent: fd8e5bdf57
Commit: 5ef2d56ec2

2 changed files with 60 additions and 2 deletions

src/indexer/merger.rs

@@ -1004,7 +1004,22 @@ impl IndexMerger {
         } else {
             for reader in &self.readers {
                 let store_reader = reader.get_store_reader()?;
-                if reader.num_deleted_docs() > 0 {
+                if reader.num_deleted_docs() > 0
+                    // If there is not enough data in the store, we avoid stacking so that we don't
+                    // create many small blocks in the doc store. Once a segment has 5 full blocks,
+                    // we start stacking. In the worst case, 2/7 of the blocks would be very small:
+                    // [segment 1 - {1 doc}][segment 2 - {full block * 5}{1 doc}]
+                    // => 5 full blocks, 2 single-document blocks
+                    //
+                    // In a more realistic scenario the segments are of similar size, so on average
+                    // 1/6 of the doc store blocks would be half full, given total randomness (which
+                    // is not the case here, but not sure how it behaves exactly).
+                    //
+                    // https://github.com/tantivy-search/tantivy/issues/1053
+                    //
+                    // Take 7 in order to not walk over all checkpoints.
+                    || store_reader.block_checkpoints().take(7).count() < 6
+                {
                     for doc_bytes_res in store_reader.iter_raw(reader.delete_bitset()) {
                         let doc_bytes = doc_bytes_res?;
                         store_writer.store_bytes(&doc_bytes)?;

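For readers skimming the diff: below is a minimal, self-contained sketch of the decision above. The names should_stack and Checkpoint are invented here for illustration and are not tantivy's API; only the bounded take(7).count() check mirrors the committed condition, negated because this helper returns true when stacking is allowed. Walking at most 7 checkpoints keeps the check constant-time even for very large doc stores.

// Hypothetical stand-in for a doc-store block checkpoint; only its count matters here.
struct Checkpoint;

// Decide whether a segment's doc store may be stacked (copied block-for-block) during a
// merge, or must be re-written document by document so that small blocks get re-packed.
fn should_stack(
    block_checkpoints: impl Iterator<Item = Checkpoint>,
    num_deleted_docs: u32,
) -> bool {
    // Segments with deletes are always re-written; stores with fewer than 6 blocks are
    // re-written too, so their documents end up packed into full blocks.
    num_deleted_docs == 0 && block_checkpoints.take(7).count() >= 6
}

fn main() {
    let blocks = |n| std::iter::repeat_with(|| Checkpoint).take(n);
    assert!(!should_stack(blocks(1), 0)); // tiny segment: re-write and re-pack
    assert!(should_stack(blocks(100), 0)); // large, delete-free segment: stack as-is
    assert!(!should_stack(blocks(100), 3)); // deletes present: must re-write
}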
src/store/mod.rs

@@ -97,8 +97,10 @@ use self::compression_snap::{compress, decompress};
 #[cfg(test)]
 pub mod tests {
+    use futures::executor::block_on;
+
     use super::*;
-    use crate::schema::{self, FieldValue, TextFieldIndexing};
+    use crate::schema::{self, FieldValue, TextFieldIndexing, STORED, TEXT};
     use crate::schema::{Document, TextOptions};
     use crate::{
         directory::{Directory, RamDirectory, WritePtr},
@@ -214,6 +216,47 @@ pub mod tests {
         }
         Ok(())
     }
+
+    #[test]
+    fn test_merge_of_small_segments() -> crate::Result<()> {
+        let mut schema_builder = schema::Schema::builder();
+        let text_field = schema_builder.add_text_field("text_field", TEXT | STORED);
+        let schema = schema_builder.build();
+        let index_builder = Index::builder().schema(schema);
+        let index = index_builder.create_in_ram().unwrap();
+        {
+            let mut index_writer = index.writer_for_tests().unwrap();
+            index_writer.add_document(doc!(text_field=> "1"));
+            assert!(index_writer.commit().is_ok());
+            index_writer.add_document(doc!(text_field=> "2"));
+            assert!(index_writer.commit().is_ok());
+            index_writer.add_document(doc!(text_field=> "3"));
+            assert!(index_writer.commit().is_ok());
+            index_writer.add_document(doc!(text_field=> "4"));
+            assert!(index_writer.commit().is_ok());
+            index_writer.add_document(doc!(text_field=> "5"));
+            assert!(index_writer.commit().is_ok());
+        }
+        // Merging the segments
+        {
+            let segment_ids = index
+                .searchable_segment_ids()
+                .expect("Searchable segments failed.");
+            let mut index_writer = index.writer_for_tests().unwrap();
+            assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
+            assert!(index_writer.wait_merging_threads().is_ok());
+        }
+        let searcher = index.reader().unwrap().searcher();
+        assert_eq!(searcher.segment_readers().len(), 1);
+        let reader = searcher.segment_readers().iter().last().unwrap();
+        let store = reader.get_store_reader().unwrap();
+        // The five single-document segments must have been re-packed into one block.
+        assert_eq!(store.block_checkpoints().count(), 1);
+        Ok(())
+    }
 }
 
 #[cfg(all(test, feature = "unstable"))]