LogMergePolicy knob del_docs_percentage_before_merge (#1238)

Add a knob to LogMergePolicy to always merge segments that exceed a threshold of deleted docs

Closes #115
This commit is contained in:
Shikhar Bhushan
2021-12-19 23:14:56 -05:00
committed by GitHub
parent b2da82f151
commit e5e252cbc0
4 changed files with 95 additions and 7 deletions

View File

@@ -5,6 +5,7 @@ Tantivy 0.17
- Bugfix that could in theory impact durability in theory on some filesystems [#1224](https://github.com/quickwit-inc/tantivy/issues/1224)
- Reduce the number of fsync calls [#1225](https://github.com/quickwit-inc/tantivy/issues/1225)
- Schema now offers not indexing fieldnorms (@lpouget) [#922](https://github.com/quickwit-inc/tantivy/issues/922)
- LogMergePolicy now triggers merges if the ratio of deleted documents reaches a threshold (@shikhar) [#115](https://github.com/quickwit-inc/tantivy/issues/115)
Tantivy 0.16.2
================================

View File

@@ -189,6 +189,10 @@ impl SegmentMeta {
#[doc(hidden)]
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
assert!(
num_deleted_docs <= self.max_doc(),
"There cannot be more deleted docs than there are docs."
);
let delete_meta = DeleteMeta {
num_deleted_docs,
opstamp,

View File

@@ -974,7 +974,7 @@ mod tests {
assert_eq!(
format!("{:?}", index_writer.get_merge_policy()),
"LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, min_layer_size: 10000, \
level_log_size: 0.75 }"
level_log_size: 0.75, del_docs_ratio_before_merge: 1.0 }"
);
let merge_policy = Box::new(NoMergePolicy::default());
index_writer.set_merge_policy(merge_policy);

View File

@@ -2,12 +2,15 @@ use super::merge_policy::{MergeCandidate, MergePolicy};
use crate::core::SegmentMeta;
use itertools::Itertools;
use std::cmp;
use std::f64;
const DEFAULT_LEVEL_LOG_SIZE: f64 = 0.75;
const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
const DEFAULT_MIN_NUM_SEGMENTS_IN_MERGE: usize = 8;
const DEFAULT_MAX_DOCS_BEFORE_MERGE: usize = 10_000_000;
// The default value of 1 means that deletes are not taken in account when
// identifying merge candidates. This is not a very sensible default: it was
// set like that for backward compatibility and might change in the near future.
const DEFAULT_DEL_DOCS_RATIO_BEFORE_MERGE: f32 = 1.0f32;
/// `LogMergePolicy` tries to merge segments that have a similar number of
/// documents.
@@ -17,6 +20,7 @@ pub struct LogMergePolicy {
max_docs_before_merge: usize,
min_layer_size: u32,
level_log_size: f64,
del_docs_ratio_before_merge: f32,
}
impl LogMergePolicy {
@@ -52,19 +56,49 @@ impl LogMergePolicy {
pub fn set_level_log_size(&mut self, level_log_size: f64) {
self.level_log_size = level_log_size;
}
/// Set the ratio of deleted documents in a segment to tolerate.
///
/// If it is exceeded by any segment at a log level, a merge
/// will be triggered for that level.
///
/// If there is a single segment at a level, we effectively end up expunging
/// deleted documents from it.
///
/// # Panics
///
/// Panics if del_docs_ratio_before_merge is not within (0..1].
pub fn set_del_docs_ratio_before_merge(&mut self, del_docs_ratio_before_merge: f32) {
assert!(del_docs_ratio_before_merge <= 1.0f32);
assert!(del_docs_ratio_before_merge > 0f32);
self.del_docs_ratio_before_merge = del_docs_ratio_before_merge;
}
fn has_segment_above_deletes_threshold(&self, level: &[&SegmentMeta]) -> bool {
level
.iter()
.any(|segment| deletes_ratio(segment) > self.del_docs_ratio_before_merge)
}
}
fn deletes_ratio(segment: &SegmentMeta) -> f32 {
if segment.max_doc() == 0 {
return 0f32;
}
segment.num_deleted_docs() as f32 / segment.max_doc() as f32
}
impl MergePolicy for LogMergePolicy {
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
let mut size_sorted_segments = segments
let size_sorted_segments = segments
.iter()
.filter(|segment_meta| segment_meta.num_docs() <= (self.max_docs_before_merge as u32))
.filter(|seg| seg.num_docs() <= (self.max_docs_before_merge as u32))
.sorted_by_key(|seg| std::cmp::Reverse(seg.max_doc()))
.collect::<Vec<&SegmentMeta>>();
if size_sorted_segments.len() <= 1 {
if size_sorted_segments.is_empty() {
return vec![];
}
size_sorted_segments.sort_by_key(|seg| std::cmp::Reverse(seg.num_docs()));
let mut current_max_log_size = f64::MAX;
let mut levels = vec![];
@@ -82,7 +116,10 @@ impl MergePolicy for LogMergePolicy {
levels
.iter()
.filter(|level| level.len() >= self.min_num_segments)
.filter(|level| {
level.len() >= self.min_num_segments
|| self.has_segment_above_deletes_threshold(level)
})
.map(|segments| MergeCandidate(segments.iter().map(|&seg| seg.id()).collect()))
.collect()
}
@@ -95,6 +132,7 @@ impl Default for LogMergePolicy {
max_docs_before_merge: DEFAULT_MAX_DOCS_BEFORE_MERGE,
min_layer_size: DEFAULT_MIN_LAYER_SIZE,
level_log_size: DEFAULT_LEVEL_LOG_SIZE,
del_docs_ratio_before_merge: DEFAULT_DEL_DOCS_RATIO_BEFORE_MERGE,
}
}
}
@@ -288,4 +326,49 @@ mod tests {
assert_eq!(result_list[0].0[1], test_input[4].id());
assert_eq!(result_list[0].0[2], test_input[5].id());
}
#[test]
fn test_merge_single_segment_with_deletes_below_threshold() {
let mut test_merge_policy = test_merge_policy();
test_merge_policy.set_del_docs_ratio_before_merge(0.25f32);
let test_input = vec![create_random_segment_meta(40_000).with_delete_meta(10_000, 1)];
let merge_candidates = test_merge_policy.compute_merge_candidates(&test_input);
assert!(merge_candidates.is_empty());
}
#[test]
fn test_merge_single_segment_with_deletes_above_threshold() {
let mut test_merge_policy = test_merge_policy();
test_merge_policy.set_del_docs_ratio_before_merge(0.25f32);
let test_input = vec![create_random_segment_meta(40_000).with_delete_meta(10_001, 1)];
let merge_candidates = test_merge_policy.compute_merge_candidates(&test_input);
assert_eq!(merge_candidates.len(), 1);
}
#[test]
fn test_merge_segments_with_deletes_above_threshold_all_in_level() {
let mut test_merge_policy = test_merge_policy();
test_merge_policy.set_del_docs_ratio_before_merge(0.25f32);
let test_input = vec![
create_random_segment_meta(40_000).with_delete_meta(10_001, 1),
create_random_segment_meta(40_000),
];
let merge_candidates = test_merge_policy.compute_merge_candidates(&test_input);
assert_eq!(merge_candidates.len(), 1);
assert_eq!(merge_candidates[0].0.len(), 2);
}
#[test]
fn test_merge_segments_with_deletes_above_threshold_different_level_not_involved() {
let mut test_merge_policy = test_merge_policy();
test_merge_policy.set_del_docs_ratio_before_merge(0.25f32);
let test_input = vec![
create_random_segment_meta(100),
create_random_segment_meta(40_000).with_delete_meta(10_001, 1),
];
let merge_candidates = test_merge_policy.compute_merge_candidates(&test_input);
assert_eq!(merge_candidates.len(), 1);
assert_eq!(merge_candidates[0].0.len(), 1);
assert_eq!(merge_candidates[0].0[0], test_input[1].id());
}
}