Skip to main content

mito2/compaction/
run.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
//! This file contains code to find sorted runs in a set of ranged items,
//! along with the best way to merge these items to satisfy the desired run count.
17
18use bytes::{Buf, Bytes};
19use common_base::BitVec;
20use common_base::readable_size::ReadableSize;
21use common_time::Timestamp;
22use smallvec::{SmallVec, smallvec};
23
24use crate::sst::file::{FileHandle, RegionFileId};
25
/// Default max compaction output file size when not specified.
///
/// `merge_seq_files` uses this value (`ReadableSize::gb(2)`, in bytes) as the upper clamp of the
/// target size it derives from the average file size when no explicit limit is passed.
const DEFAULT_MAX_OUTPUT_SIZE: u64 = ReadableSize::gb(2).as_bytes();
28
/// An item covering a closed interval: both `start` and `end` belong to the range.
pub trait Ranged {
    /// Type of the interval endpoints.
    type BoundType: Ord + Copy;

    /// Returns the `(start, end)` pair of the item; both ends are inclusive.
    fn range(&self) -> (Self::BoundType, Self::BoundType);

    /// Returns `true` when the two ranges share more than a single boundary point.
    ///
    /// Ranges that merely touch (one ends exactly where the other starts) are
    /// reported as non-overlapping; use [`Ranged::overlap_inclusive`] to count them.
    fn overlap(&self, other: &Self) -> bool {
        let (own_start, own_end) = self.range();
        let (other_start, other_end) = other.range();

        let latest_start = std::cmp::max(own_start, other_start);
        let earliest_end = std::cmp::min(own_end, other_end);
        latest_start < earliest_end
    }

    /// Same as [`Ranged::overlap`], except that ranges touching at a shared
    /// boundary are also considered overlapping.
    /// `find_overlapping_items` relies on this inclusive flavour.
    fn overlap_inclusive(&self, other: &Self) -> bool {
        let (own_start, own_end) = self.range();
        let (other_start, other_end) = other.range();

        let latest_start = std::cmp::max(own_start, other_start);
        let earliest_end = std::cmp::min(own_end, other_end);
        latest_start <= earliest_end
    }
}
52
53pub(crate) fn primary_key_ranges_overlap(lhs: &(Bytes, Bytes), rhs: &(Bytes, Bytes)) -> bool {
54    lhs.0.chunk().max(rhs.0.chunk()) <= lhs.1.chunk().min(rhs.1.chunk())
55}
56
57pub(crate) fn merge_primary_key_ranges(
58    lhs: Option<(Bytes, Bytes)>,
59    rhs: Option<(Bytes, Bytes)>,
60) -> Option<(Bytes, Bytes)> {
61    match (lhs, rhs) {
62        (Some((lhs_min, lhs_max)), Some((rhs_min, rhs_max))) => {
63            Some((lhs_min.min(rhs_min), lhs_max.max(rhs_max)))
64        }
65        _ => None,
66    }
67}
68
69pub fn find_overlapping_items<T: Item + Clone>(
70    l: &mut SortedRun<T>,
71    r: &mut SortedRun<T>,
72    result: &mut Vec<T>,
73) {
74    if l.items.is_empty() || r.items.is_empty() {
75        return;
76    }
77
78    result.clear();
79    result.reserve(l.items.len() + r.items.len());
80
81    // Sort both arrays by start boundary for more efficient overlap detection
82    if !l.sorted {
83        sort_ranged_items(&mut l.items);
84        l.sorted = true;
85    }
86    if !r.sorted {
87        sort_ranged_items(&mut r.items);
88        r.sorted = true;
89    }
90
91    let mut r_idx = 0;
92
93    let mut selected = BitVec::repeat(false, r.items().len() + l.items.len());
94
95    for (lhs_idx, lhs) in l.items.iter().enumerate() {
96        let (lhs_start, lhs_end) = lhs.range();
97
98        // Skip right elements that end before current left element starts
99        while r_idx < r.items.len() {
100            let (_, rhs_end) = r.items[r_idx].range();
101            if rhs_end < lhs_start {
102                r_idx += 1;
103            } else {
104                break;
105            }
106        }
107
108        // Check for overlaps with remaining right elements
109        let mut j = r_idx;
110        while j < r.items.len() {
111            let (rhs_start, _rhs_end) = r.items[j].range();
112
113            // If right element starts after left element ends, no more overlaps possible
114            if rhs_start > lhs_end {
115                break;
116            }
117
118            // We have an overlap (inclusive: touching boundaries count)
119            if lhs.overlap_inclusive(&r.items[j]) {
120                if !selected[lhs_idx] {
121                    result.push(lhs.clone());
122                    selected.set(lhs_idx, true);
123                }
124
125                let rhs_selected_idx = l.items.len() + j;
126                if !selected[rhs_selected_idx] {
127                    result.push(r.items[j].clone());
128                    selected.set(rhs_selected_idx, true);
129                }
130            }
131
132            j += 1;
133        }
134    }
135}
136
137// Sorts ranges by start asc and end desc.
138fn sort_ranged_items<T: Ranged>(values: &mut [T]) {
139    values.sort_unstable_by(|l, r| {
140        let (l_start, l_end) = l.range();
141        let (r_start, r_end) = r.range();
142        l_start.cmp(&r_start).then(r_end.cmp(&l_end))
143    });
144}
145
/// Trait for items to merge.
///
/// An item is a [`Ranged`] value that can be cloned and reports a size.
pub trait Item: Ranged + Clone {
    /// Size is used to calculate the cost of merging items.
    ///
    /// `reduce_runs` sums it up as the merge penalty and `merge_seq_files` compares
    /// sums of it against the target output size.
    fn size(&self) -> usize;
}
151
/// A group of files that are created by the same compaction task.
///
/// The aggregate fields below are maintained by `new_with_file` and `add_file`.
#[derive(Debug, Clone)]
pub struct FileGroup {
    /// Files in the group, in insertion order.
    files: SmallVec<[FileHandle; 2]>,
    /// Sum of the sizes of all files in the group.
    size: usize,
    /// Sum of the row counts of all files in the group.
    num_rows: usize,
    /// Smallest start of the files' time ranges.
    min_timestamp: Timestamp,
    /// Largest end of the files' time ranges.
    max_timestamp: Timestamp,
    /// Merged `(min, max)` primary key range of the files; `None` once any file in the
    /// group reports no primary key range.
    primary_key_range: Option<(Bytes, Bytes)>,
}
162
163impl FileGroup {
164    pub(crate) fn new_with_file(file: FileHandle) -> Self {
165        let size = file.size() as usize;
166        let (min_timestamp, max_timestamp) = file.time_range();
167        let num_rows = file.num_rows();
168        let primary_key_range = file.primary_key_range();
169        Self {
170            files: smallvec![file],
171            size,
172            num_rows,
173            min_timestamp,
174            max_timestamp,
175            primary_key_range,
176        }
177    }
178
179    pub(crate) fn num_rows(&self) -> usize {
180        self.num_rows
181    }
182
183    pub(crate) fn add_file(&mut self, file: FileHandle) {
184        self.size += file.size() as usize;
185        self.num_rows += file.num_rows();
186        let (min_timestamp, max_timestamp) = file.time_range();
187        self.min_timestamp = self.min_timestamp.min(min_timestamp);
188        self.max_timestamp = self.max_timestamp.max(max_timestamp);
189        self.primary_key_range =
190            merge_primary_key_ranges(self.primary_key_range.take(), file.primary_key_range());
191        self.files.push(file);
192    }
193
194    pub(crate) fn num_files(&self) -> usize {
195        self.files.len()
196    }
197
198    #[cfg(test)]
199    pub(crate) fn files(&self) -> &[FileHandle] {
200        &self.files[..]
201    }
202
203    pub(crate) fn file_ids(&self) -> SmallVec<[RegionFileId; 2]> {
204        SmallVec::from_iter(self.files.iter().map(|f| f.file_id()))
205    }
206
207    pub(crate) fn into_files(self) -> impl Iterator<Item = FileHandle> {
208        self.files.into_iter()
209    }
210}
211
212impl Ranged for FileGroup {
213    type BoundType = Timestamp;
214
215    fn range(&self) -> (Self::BoundType, Self::BoundType) {
216        (self.min_timestamp, self.max_timestamp)
217    }
218
219    fn overlap(&self, other: &Self) -> bool {
220        let (lhs_start, lhs_end) = self.range();
221        let (rhs_start, rhs_end) = other.range();
222        if lhs_start.max(rhs_start) >= lhs_end.min(rhs_end) {
223            return false;
224        }
225
226        match (&self.primary_key_range, &other.primary_key_range) {
227            (Some(lhs), Some(rhs)) => primary_key_ranges_overlap(lhs, rhs),
228            _ => true,
229        }
230    }
231
232    fn overlap_inclusive(&self, other: &Self) -> bool {
233        let (lhs_start, lhs_end) = self.range();
234        let (rhs_start, rhs_end) = other.range();
235        if lhs_start.max(rhs_start) > lhs_end.min(rhs_end) {
236            return false;
237        }
238
239        match (&self.primary_key_range, &other.primary_key_range) {
240            (Some(lhs), Some(rhs)) => primary_key_ranges_overlap(lhs, rhs),
241            _ => true,
242        }
243    }
244}
245
impl Item for FileGroup {
    /// Returns the accumulated size of all files in the group.
    fn size(&self) -> usize {
        self.size
    }
}
251
/// A set of files with non-overlapping time ranges.
///
/// NOTE(review): `find_sorted_runs` groups items using `Ranged::overlap`, which for
/// `FileGroup` also consults primary key ranges, so members of a run may share time ranges
/// when their key ranges are disjoint. `From<Vec<T>>` performs no overlap check at all.
#[derive(Debug, Clone)]
pub struct SortedRun<T: Item> {
    /// Items to merge
    items: Vec<T>,
    /// The total size of all items.
    size: usize,
    /// The lower bound of all items. `None` while the run is empty.
    start: Option<T::BoundType>,
    /// The upper bound of all items. `None` while the run is empty.
    end: Option<T::BoundType>,
    /// Whether items are sorted. Set by `find_overlapping_items` after it sorts `items`.
    sorted: bool,
}
266
267impl<T: Item> From<Vec<T>> for SortedRun<T> {
268    fn from(items: Vec<T>) -> Self {
269        let mut r = Self {
270            items: Vec::with_capacity(items.len()),
271            size: 0,
272            start: None,
273            end: None,
274            sorted: false,
275        };
276        for item in items {
277            r.push_item(item);
278        }
279
280        r
281    }
282}
283
284impl<T> Default for SortedRun<T>
285where
286    T: Item,
287{
288    fn default() -> Self {
289        Self {
290            items: vec![],
291            size: 0,
292            start: None,
293            end: None,
294            sorted: false,
295        }
296    }
297}
298
299impl<T> SortedRun<T>
300where
301    T: Item,
302{
303    pub fn items(&self) -> &[T] {
304        &self.items
305    }
306
307    fn push_item(&mut self, t: T) {
308        let (file_start, file_end) = t.range();
309        self.size += t.size();
310        self.items.push(t);
311        self.start = Some(self.start.map_or(file_start, |v| v.min(file_start)));
312        self.end = Some(self.end.map_or(file_end, |v| v.max(file_end)));
313    }
314}
315
/// Finds sorted runs in given items.
///
/// `items` is sorted in place (start ascending, end descending). Each pass of the outer loop
/// then walks the still-unassigned items in that order and greedily builds one run: an item
/// joins the current run when it does not `overlap` any item already in the run. Passes
/// repeat until every item is assigned, and each pass yields one entry of the returned vector.
///
/// Returns an empty vector for empty input. Items are cloned into the runs.
pub fn find_sorted_runs<T>(items: &mut [T]) -> Vec<SortedRun<T>>
where
    T: Item,
{
    if items.is_empty() {
        return vec![];
    }
    // sort files
    sort_ranged_items(items);

    let mut current_run = SortedRun::default();
    let mut runs = vec![];
    // Indices into `current_run.items` of the members that still need to be checked
    // against upcoming items; members that can no longer overlap are pruned below.
    let mut active_run_item_indices = Vec::new();

    // One bit per item: set once the item has been assigned to a run.
    let mut selection = BitVec::repeat(false, items.len());
    while !selection.all() {
        // until all items are assigned to some sorted run.
        // Start bound for which the active set was last pruned; avoids pruning again
        // for consecutive items that share the same start.
        let mut last_pruned_start = None;
        for (item, mut selected) in items.iter().zip(selection.iter_mut()) {
            if *selected {
                // item is already assigned.
                continue;
            }
            if current_run.items.is_empty() {
                // current run is empty, just add current_item
                selected.set(true);
                current_run.push_item(item.clone());
                active_run_item_indices.push(current_run.items.len() - 1);
            } else {
                // the current item does not overlap with any item in current run,
                // then it belongs to current run. Because now we introduced primary
                // key range, we cannot simply use timestamps to check overlapping.
                let (item_start, _) = item.range();
                if last_pruned_start != Some(item_start) {
                    // Items are visited by ascending start, so a run member ending at or
                    // before this start fails the strict time-range test of `overlap` for
                    // this item and every later one; drop it from the active set.
                    active_run_item_indices.retain(|idx| {
                        let (_, run_item_end) = current_run.items[*idx].range();
                        run_item_end > item_start
                    });
                    last_pruned_start = Some(item_start);
                }

                // Only the remaining active members need the full overlap check.
                let mut overlaps_any = false;
                for idx in &active_run_item_indices {
                    let run_item = &current_run.items[*idx];
                    if run_item.overlap(item) {
                        overlaps_any = true;
                        break;
                    }
                }
                if !overlaps_any {
                    // does not overlap, push to current run
                    selected.set(true);
                    let item_idx = current_run.items.len();
                    current_run.push_item(item.clone());
                    active_run_item_indices.push(item_idx);
                }
            }
        }
        // finished an iteration, we've found a new run.
        // `mem::take` leaves an empty default run behind for the next pass.
        runs.push(std::mem::take(&mut current_run));
        active_run_item_indices.clear();
    }
    runs
}
381
382#[cfg(any(test, feature = "test", feature = "testing"))]
383pub fn find_sorted_runs_original<T>(items: &mut [T]) -> Vec<SortedRun<T>>
384where
385    T: Item,
386{
387    if items.is_empty() {
388        return vec![];
389    }
390    // sort files
391    sort_ranged_items(items);
392
393    let mut current_run = SortedRun::default();
394    let mut runs = vec![];
395
396    let mut selection = BitVec::repeat(false, items.len());
397    while !selection.all() {
398        // until all items are assigned to some sorted run.
399        for (item, mut selected) in items.iter().zip(selection.iter_mut()) {
400            if *selected {
401                // item is already assigned.
402                continue;
403            }
404            if current_run.items.is_empty() {
405                // current run is empty, just add current_item
406                selected.set(true);
407                current_run.push_item(item.clone());
408            } else {
409                // the current item does not overlap with any item in current run,
410                // then it belongs to current run. Because now we introduced primary
411                // key range, we cannot simply use timestamps to check overlapping.
412                let overlaps_any = current_run.items.iter().any(|i| i.overlap(item));
413                if !overlaps_any {
414                    // does not overlap, push to current run
415                    selected.set(true);
416                    current_run.push_item(item.clone());
417                }
418            }
419        }
420        // finished an iteration, we've found a new run.
421        runs.push(std::mem::take(&mut current_run));
422    }
423    runs
424}
425
426/// Finds a set of files with minimum penalty to merge that can reduce the total num of runs.
427/// The penalty of merging is defined as the size of all overlapping files between two runs.
428pub fn reduce_runs<T: Item>(mut runs: Vec<SortedRun<T>>) -> Vec<T> {
429    assert!(runs.len() > 1);
430    // sort runs by size
431    runs.sort_unstable_by_key(|a| a.size);
432    // limit max probe runs to 100
433    let probe_end = runs.len().min(100);
434    let mut min_penalty = usize::MAX;
435    let mut files = vec![];
436    let mut temp_files = vec![];
437    for i in 0..probe_end {
438        for j in i + 1..probe_end {
439            let (a, b) = runs.split_at_mut(j);
440            find_overlapping_items(&mut a[i], &mut b[0], &mut temp_files);
441            let penalty = temp_files.iter().map(|e| e.size()).sum();
442            if penalty < min_penalty {
443                min_penalty = penalty;
444                files.clear();
445                files.extend_from_slice(&temp_files);
446            }
447        }
448    }
449    files
450}
451
452/// Finds the optimal set of adjacent files to merge based on a scoring system.
453///
454/// This function evaluates all possible contiguous subsets of files to find the best
455/// candidates for merging, considering:
456///
457/// 1. File reduction - prioritizes merging more files to reduce the total count
458/// 2. Write amplification - minimizes the ratio of largest file to total size
459/// 3. Size efficiency - prefers merges that utilize available space effectively
460///
461/// When multiple merge candidates have the same score, older files (those with lower indices)
462/// are preferred.
463///
464/// # Arguments
465/// * `input_files` - Slice of files to consider for merging
466/// * `max_file_size` - Optional maximum size constraint for the merged file.
467///   If None, uses 1.5 times the average file size.
468///
469/// # Returns
470/// A vector containing the best set of adjacent files to merge.
471/// Returns an empty vector if input is empty or contains only one file.
472pub fn merge_seq_files<T: Item>(input_files: &[T], max_file_size: Option<u64>) -> Vec<T> {
473    if input_files.is_empty() || input_files.len() == 1 {
474        return vec![];
475    }
476
477    // Limit the number of files to process to 100 to control time complexity
478    let files_to_process = if input_files.len() > 100 {
479        &input_files[0..100]
480    } else {
481        input_files
482    };
483
484    // Calculate target size based on max_file_size or average file size
485    let target_size = match max_file_size {
486        Some(size) => size as usize,
487        None => {
488            // Calculate 1.5*average_file_size if max_file_size is not provided and clamp to 2GB
489            let total_size: usize = files_to_process.iter().map(|f| f.size()).sum();
490            ((((total_size as f64) / (files_to_process.len() as f64)) * 1.5) as usize)
491                .min(DEFAULT_MAX_OUTPUT_SIZE as usize)
492        }
493    };
494
495    // Find the best group of adjacent files to merge
496    let mut best_group = Vec::new();
497    let mut best_score = f64::NEG_INFINITY;
498
499    // Try different starting positions - iterate from end to start to prefer older files
500    for start_idx in (0..files_to_process.len()).rev() {
501        // Try different ending positions - also iterate from end to start
502        for end_idx in (start_idx + 1..files_to_process.len()).rev() {
503            let group = &files_to_process[start_idx..=end_idx];
504            let total_size: usize = group.iter().map(|f| f.size()).sum();
505
506            // Skip if total size exceeds target size
507            if total_size > target_size {
508                continue; // Use continue instead of break to check smaller ranges
509            }
510
511            // Calculate amplification factor (largest file size / total size)
512            let largest_file_size = group.iter().map(|f| f.size()).max().unwrap_or(0);
513            let amplification_factor = largest_file_size as f64 / total_size as f64;
514
515            // Calculate file reduction (number of files that will be reduced)
516            let file_reduction = group.len() - 1;
517
518            // Calculate score based on multiple factors:
519            // 1. File reduction (higher is better)
520            // 2. Amplification factor (lower is better)
521            // 3. Size efficiency (how close to target size)
522            let file_reduction_score = file_reduction as f64 / files_to_process.len() as f64;
523            let amp_factor_score = (1.0 - amplification_factor) * 1.5; // Lower amplification is better
524            let size_efficiency = (total_size as f64 / target_size as f64).min(1.0); // Reward using available space
525
526            let score = file_reduction_score + amp_factor_score + size_efficiency;
527
528            // Check if this group is better than our current best
529            // Use >= instead of > to prefer older files (which we encounter first due to reverse iteration)
530            if score >= best_score {
531                best_score = score;
532                best_group = group.to_vec();
533            }
534        }
535    }
536
537    best_group
538}
539
540#[cfg(test)]
541mod tests {
542    use std::collections::HashSet;
543
544    use bytes::Bytes;
545    use store_api::storage::FileId;
546
547    use super::*;
548    use crate::compaction::test_util::new_file_handle_with_size_sequence_and_primary_key_range;
549
550    #[derive(Clone, Debug, PartialEq)]
551    struct MockFile {
552        start: i64,
553        end: i64,
554        size: usize,
555    }
556
557    impl Ranged for MockFile {
558        type BoundType = i64;
559
560        fn range(&self) -> (Self::BoundType, Self::BoundType) {
561            (self.start, self.end)
562        }
563    }
564
565    impl Item for MockFile {
566        fn size(&self) -> usize {
567            self.size
568        }
569    }
570
571    fn build_items(ranges: &[(i64, i64)]) -> Vec<MockFile> {
572        ranges
573            .iter()
574            .map(|(start, end)| MockFile {
575                start: *start,
576                end: *end,
577                size: (*end - *start) as usize,
578            })
579            .collect()
580    }
581
582    fn build_items_with_size(items: &[(i64, i64, usize)]) -> Vec<MockFile> {
583        items
584            .iter()
585            .map(|(start, end, size)| MockFile {
586                start: *start,
587                end: *end,
588                size: *size,
589            })
590            .collect()
591    }
592
593    fn pk_range(min: &'static [u8], max: &'static [u8]) -> Option<(Bytes, Bytes)> {
594        Some((Bytes::from_static(min), Bytes::from_static(max)))
595    }
596
597    fn check_sorted_runs(
598        ranges: &[(i64, i64)],
599        expected_runs: &[Vec<(i64, i64)>],
600    ) -> Vec<SortedRun<MockFile>> {
601        let mut files = build_items(ranges);
602        let runs = find_sorted_runs(&mut files);
603
604        let result_file_ranges: Vec<Vec<_>> = runs
605            .iter()
606            .map(|r| r.items.iter().map(|f| f.range()).collect())
607            .collect();
608        assert_eq!(&expected_runs, &result_file_ranges);
609        runs
610    }
611
612    fn sorted_run_ranges<T: Item>(runs: &[SortedRun<T>]) -> Vec<Vec<T::BoundType>> {
613        runs.iter()
614            .map(|r| {
615                r.items
616                    .iter()
617                    .flat_map(|f| {
618                        let (start, end) = f.range();
619                        [start, end]
620                    })
621                    .collect()
622            })
623            .collect()
624    }
625
626    fn check_find_sorted_runs_consistency(ranges: &[(i64, i64)]) {
627        let mut files = build_items(ranges);
628        let mut files_for_original = files.clone();
629
630        let runs = find_sorted_runs(&mut files);
631        let original_runs = find_sorted_runs_original(&mut files_for_original);
632
633        assert_eq!(sorted_run_ranges(&original_runs), sorted_run_ranges(&runs));
634    }
635
636    #[test]
637    fn test_find_sorted_runs() {
638        check_sorted_runs(&[], &[]);
639        check_sorted_runs(&[(1, 1), (2, 2)], &[vec![(1, 1), (2, 2)]]);
640        check_sorted_runs(&[(1, 2)], &[vec![(1, 2)]]);
641        check_sorted_runs(&[(1, 2), (2, 3)], &[vec![(1, 2), (2, 3)]]);
642        check_sorted_runs(&[(1, 2), (3, 4)], &[vec![(1, 2), (3, 4)]]);
643        check_sorted_runs(&[(2, 4), (1, 3)], &[vec![(1, 3)], vec![(2, 4)]]);
644        check_sorted_runs(
645            &[(1, 3), (2, 4), (4, 5)],
646            &[vec![(1, 3), (4, 5)], vec![(2, 4)]],
647        );
648
649        check_sorted_runs(
650            &[(1, 2), (3, 4), (3, 5)],
651            &[vec![(1, 2), (3, 5)], vec![(3, 4)]],
652        );
653
654        check_sorted_runs(
655            &[(1, 3), (2, 4), (5, 6)],
656            &[vec![(1, 3), (5, 6)], vec![(2, 4)]],
657        );
658
659        check_sorted_runs(
660            &[(1, 2), (3, 5), (4, 6)],
661            &[vec![(1, 2), (3, 5)], vec![(4, 6)]],
662        );
663
664        check_sorted_runs(
665            &[(1, 2), (3, 4), (4, 6), (7, 8)],
666            &[vec![(1, 2), (3, 4), (4, 6), (7, 8)]],
667        );
668        check_sorted_runs(
669            &[(1, 2), (3, 4), (5, 6), (3, 6), (7, 8), (8, 9)],
670            &[vec![(1, 2), (3, 6), (7, 8), (8, 9)], vec![(3, 4), (5, 6)]],
671        );
672
673        check_sorted_runs(
674            &[(10, 19), (20, 21), (20, 29), (30, 39)],
675            &[vec![(10, 19), (20, 29), (30, 39)], vec![(20, 21)]],
676        );
677
678        check_sorted_runs(
679            &[(10, 19), (20, 29), (21, 22), (30, 39), (31, 32), (32, 42)],
680            &[
681                vec![(10, 19), (20, 29), (30, 39)],
682                vec![(21, 22), (31, 32), (32, 42)],
683            ],
684        );
685    }
686
687    #[test]
688    fn test_find_sorted_runs_matches_original_impl() {
689        for ranges in [
690            &[][..],
691            &[(1, 1), (2, 2)],
692            &[(1, 2), (2, 3)],
693            &[(2, 4), (1, 3)],
694            &[(1, 3), (2, 4), (4, 5)],
695            &[(1, 2), (3, 4), (3, 5)],
696            &[(1, 3), (2, 4), (5, 6)],
697            &[(1, 2), (3, 5), (4, 6)],
698            &[(1, 2), (3, 4), (4, 6), (7, 8)],
699            &[(1, 2), (3, 4), (5, 6), (3, 6), (7, 8), (8, 9)],
700            &[(10, 19), (20, 21), (20, 29), (30, 39)],
701            &[(10, 19), (20, 29), (21, 22), (30, 39), (31, 32), (32, 42)],
702            &[(32, 42), (10, 19), (31, 32), (20, 29), (21, 22), (30, 39)],
703        ] {
704            check_find_sorted_runs_consistency(ranges);
705        }
706    }
707
708    fn check_reduce_runs(
709        files: &[(i64, i64)],
710        expected_runs: &[Vec<(i64, i64)>],
711        expected: &[(i64, i64)],
712    ) {
713        let runs = check_sorted_runs(files, expected_runs);
714        if runs.len() <= 1 {
715            assert!(expected.is_empty());
716            return;
717        }
718        let files_to_merge = reduce_runs(runs);
719        let file_to_merge_timestamps = files_to_merge
720            .into_iter()
721            .map(|f| (f.start, f.end))
722            .collect::<HashSet<_>>();
723
724        let expected = expected.iter().cloned().collect::<HashSet<_>>();
725        assert_eq!(&expected, &file_to_merge_timestamps);
726    }
727
728    #[test]
729    fn test_reduce_runs() {
730        // [1..3]   [5..6]
731        //   [2..4]
732        check_reduce_runs(
733            &[(1, 3), (2, 4), (5, 6)],
734            &[vec![(1, 3), (5, 6)], vec![(2, 4)]],
735            &[(1, 3), (2, 4)],
736        );
737
738        // [1..2][3..5]
739        //         [4..6]
740        check_reduce_runs(
741            &[(1, 2), (3, 5), (4, 6)],
742            &[vec![(1, 2), (3, 5)], vec![(4, 6)]],
743            &[(3, 5), (4, 6)],
744        );
745
746        // [1..2][3..4]    [7..8]
747        //          [4..6]
748        check_reduce_runs(
749            &[(1, 2), (3, 4), (4, 6), (7, 8)],
750            &[vec![(1, 2), (3, 4), (4, 6), (7, 8)]],
751            &[],
752        );
753
754        // [1..2][3........6][7..8][8..9]
755        //       [3..4][5..6]
756        check_reduce_runs(
757            &[(1, 2), (3, 4), (5, 6), (3, 6), (7, 8), (8, 9)],
758            &[vec![(1, 2), (3, 6), (7, 8), (8, 9)], vec![(3, 4), (5, 6)]],
759            &[(5, 6), (3, 4), (3, 6)], // already satisfied
760        );
761
762        // [1..2][3........6][7..8][8..9]
763        //       [3..4][5..6]
764        check_reduce_runs(
765            &[(1, 2), (3, 4), (5, 6), (3, 6), (7, 8), (8, 9)],
766            &[vec![(1, 2), (3, 6), (7, 8), (8, 9)], vec![(3, 4), (5, 6)]],
767            &[(3, 4), (3, 6), (5, 6)],
768        );
769
770        // [10..20] [30..40] [50........80][80...100][100..110]
771        //                   [50..60]  [80..90]
772        //
773        check_reduce_runs(
774            &[
775                (10, 20),
776                (30, 40),
777                (50, 60),
778                (50, 80),
779                (80, 90),
780                (80, 100),
781                (100, 110),
782            ],
783            &[
784                vec![(10, 20), (30, 40), (50, 80), (80, 100), (100, 110)],
785                vec![(50, 60), (80, 90)],
786            ],
787            &[(50, 80), (80, 100), (50, 60), (80, 90)],
788        );
789
790        // [0..10]
791        // [0...11]
792        // [0....12]
793        // [0.....13]
794        check_reduce_runs(
795            &[(0, 10), (0, 11), (0, 12), (0, 13)],
796            &[vec![(0, 13)], vec![(0, 12)], vec![(0, 11)], vec![(0, 10)]],
797            &[(0, 10), (0, 11)],
798        );
799    }
800
801    #[test]
802    fn test_find_overlapping_items() {
803        let mut result = Vec::new();
804
805        // Test empty inputs
806        find_overlapping_items(
807            &mut SortedRun::from(Vec::<MockFile>::new()),
808            &mut SortedRun::from(Vec::<MockFile>::new()),
809            &mut result,
810        );
811        assert_eq!(result, Vec::<MockFile>::new());
812
813        let files1 = build_items(&[(1, 3)]);
814        find_overlapping_items(
815            &mut SortedRun::from(files1.clone()),
816            &mut SortedRun::from(Vec::<MockFile>::new()),
817            &mut result,
818        );
819        assert_eq!(result, Vec::<MockFile>::new());
820
821        find_overlapping_items(
822            &mut SortedRun::from(Vec::<MockFile>::new()),
823            &mut SortedRun::from(files1.clone()),
824            &mut result,
825        );
826        assert_eq!(result, Vec::<MockFile>::new());
827
828        // Test non-overlapping ranges
829        let files1 = build_items(&[(1, 3), (5, 7)]);
830        let files2 = build_items(&[(10, 12), (15, 20)]);
831        find_overlapping_items(
832            &mut SortedRun::from(files1),
833            &mut SortedRun::from(files2),
834            &mut result,
835        );
836        assert_eq!(result, Vec::<MockFile>::new());
837
838        // Test simple overlap
839        let files1 = build_items(&[(1, 5)]);
840        let files2 = build_items(&[(3, 7)]);
841        find_overlapping_items(
842            &mut SortedRun::from(files1),
843            &mut SortedRun::from(files2),
844            &mut result,
845        );
846        assert_eq!(result.len(), 2);
847        assert_eq!(result[0].range(), (1, 5));
848        assert_eq!(result[1].range(), (3, 7));
849
850        // Test multiple overlaps
851        let files1 = build_items(&[(1, 5), (8, 12), (15, 20)]);
852        let files2 = build_items(&[(3, 6), (7, 10), (18, 25)]);
853        find_overlapping_items(
854            &mut SortedRun::from(files1),
855            &mut SortedRun::from(files2),
856            &mut result,
857        );
858        assert_eq!(result.len(), 6);
859
        // Test boundary cases (ranges touching at a shared endpoint)
861        let files1 = build_items(&[(1, 5)]);
862        let files2 = build_items(&[(5, 10)]); // Touching at 5
863        find_overlapping_items(
864            &mut SortedRun::from(files1),
865            &mut SortedRun::from(files2),
866            &mut result,
867        );
868        assert_eq!(result.len(), 2); // Should overlap since ranges are inclusive
869
870        // Test completely contained ranges
871        let files1 = build_items(&[(1, 10)]);
872        let files2 = build_items(&[(3, 7)]);
873        find_overlapping_items(
874            &mut SortedRun::from(files1),
875            &mut SortedRun::from(files2),
876            &mut result,
877        );
878        assert_eq!(result.len(), 2);
879
880        // Test identical ranges
881        let files1 = build_items(&[(1, 5)]);
882        let files2 = build_items(&[(1, 5)]);
883        find_overlapping_items(
884            &mut SortedRun::from(files1),
885            &mut SortedRun::from(files2),
886            &mut result,
887        );
888        assert_eq!(result.len(), 2);
889
890        // Test unsorted input handling
891        let files1 = build_items(&[(5, 10), (1, 3)]); // Unsorted
892        let files2 = build_items(&[(2, 7), (8, 12)]); // Unsorted
893        find_overlapping_items(
894            &mut SortedRun::from(files1),
895            &mut SortedRun::from(files2),
896            &mut result,
897        );
898        assert_eq!(result.len(), 4); // Should find both overlaps
899    }
900
901    #[test]
902    fn test_file_group_overlap_time_overlap_pk_disjoint() {
903        let lhs =
904            FileGroup::new_with_file(new_file_handle_with_size_sequence_and_primary_key_range(
905                FileId::random(),
906                0,
907                100,
908                0,
909                1,
910                10,
911                pk_range(b"a", b"f"),
912            ));
913        let rhs =
914            FileGroup::new_with_file(new_file_handle_with_size_sequence_and_primary_key_range(
915                FileId::random(),
916                50,
917                150,
918                0,
919                2,
920                10,
921                pk_range(b"x", b"z"),
922            ));
923
924        assert!(!lhs.overlap(&rhs));
925    }
926
927    #[test]
928    fn test_find_sorted_runs_collapses_pk_disjoint_files_into_one_run() {
929        let mut files = vec![
930            FileGroup::new_with_file(new_file_handle_with_size_sequence_and_primary_key_range(
931                FileId::random(),
932                0,
933                100,
934                0,
935                1,
936                10,
937                pk_range(b"a", b"f"),
938            )),
939            FileGroup::new_with_file(new_file_handle_with_size_sequence_and_primary_key_range(
940                FileId::random(),
941                50,
942                150,
943                0,
944                2,
945                10,
946                pk_range(b"x", b"z"),
947            )),
948        ];
949
950        let runs = find_sorted_runs(&mut files);
951
952        assert_eq!(1, runs.len());
953        assert_eq!(2, runs[0].items().len());
954    }
955
956    #[test]
957    fn test_find_sorted_runs_handles_2d_transitivity_break() {
958        let mut files = vec![
959            FileGroup::new_with_file(new_file_handle_with_size_sequence_and_primary_key_range(
960                FileId::random(),
961                0,
962                100,
963                0,
964                1,
965                10,
966                pk_range(b"a", b"f"),
967            )),
968            FileGroup::new_with_file(new_file_handle_with_size_sequence_and_primary_key_range(
969                FileId::random(),
970                50,
971                150,
972                0,
973                2,
974                10,
975                pk_range(b"x", b"z"),
976            )),
977            FileGroup::new_with_file(new_file_handle_with_size_sequence_and_primary_key_range(
978                FileId::random(),
979                50,
980                150,
981                0,
982                3,
983                10,
984                pk_range(b"a", b"f"),
985            )),
986        ];
987
988        let runs = find_sorted_runs(&mut files);
989
990        assert_eq!(2, runs.len());
991        assert_eq!(2, runs[0].items().len());
992        assert_eq!(1, runs[1].items().len());
993    }
994
995    #[test]
996    fn test_find_overlapping_items_skips_pk_disjoint_pairs() {
997        let mut left = SortedRun::from(vec![FileGroup::new_with_file(
998            new_file_handle_with_size_sequence_and_primary_key_range(
999                FileId::random(),
1000                0,
1001                100,
1002                0,
1003                1,
1004                10,
1005                pk_range(b"a", b"f"),
1006            ),
1007        )]);
1008        let mut right = SortedRun::from(vec![FileGroup::new_with_file(
1009            new_file_handle_with_size_sequence_and_primary_key_range(
1010                FileId::random(),
1011                50,
1012                150,
1013                0,
1014                2,
1015                10,
1016                pk_range(b"x", b"z"),
1017            ),
1018        )]);
1019        let mut result = Vec::new();
1020
1021        find_overlapping_items(&mut left, &mut right, &mut result);
1022
1023        assert!(result.is_empty());
1024    }
1025
1026    #[test]
1027    fn test_file_group_touching_time_boundary_with_same_pk_is_not_overlap() {
1028        let lhs =
1029            FileGroup::new_with_file(new_file_handle_with_size_sequence_and_primary_key_range(
1030                FileId::random(),
1031                0,
1032                100,
1033                0,
1034                1,
1035                10,
1036                pk_range(b"a", b"f"),
1037            ));
1038        let rhs =
1039            FileGroup::new_with_file(new_file_handle_with_size_sequence_and_primary_key_range(
1040                FileId::random(),
1041                100,
1042                150,
1043                0,
1044                2,
1045                10,
1046                pk_range(b"a", b"f"),
1047            ));
1048
1049        assert!(!lhs.overlap(&rhs));
1050    }
1051
1052    #[test]
1053    fn test_merge_seq_files() {
1054        // Test empty input
1055        let files = Vec::<MockFile>::new();
1056        assert_eq!(merge_seq_files(&files, None), Vec::<MockFile>::new());
1057
1058        // Test single file input (should return empty vec as no merge needed)
1059        let files = build_items(&[(1, 5)]);
1060        assert_eq!(merge_seq_files(&files, None), Vec::<MockFile>::new());
1061
1062        // Test the example case: [10, 1, 1, 1] - should merge the last three files
1063        let files = build_items_with_size(&[(1, 2, 10), (3, 4, 1), (5, 6, 1), (7, 8, 1)]);
1064        let result = merge_seq_files(&files, None);
1065        assert_eq!(result.len(), 3);
1066        assert_eq!(result[0].size, 1);
1067        assert_eq!(result[1].size, 1);
1068        assert_eq!(result[2].size, 1);
1069
1070        // Test with files of equal size - should merge as many as possible
1071        let files = build_items_with_size(&[(1, 2, 5), (3, 4, 5), (5, 6, 5), (7, 8, 5)]);
1072        let result = merge_seq_files(&files, Some(20));
1073        assert_eq!(result.len(), 4); // Should merge all 4 files as total size is 20
1074
1075        // Test with max_file_size constraint
1076        let files = build_items_with_size(&[(1, 2, 5), (3, 4, 5), (5, 6, 5), (7, 8, 5)]);
1077        let result = merge_seq_files(&files, Some(10));
1078        assert_eq!(result.len(), 2); // Should merge only 2 files as max size is 10
1079
1080        // Test with uneven file sizes - should prioritize reducing file count
1081        let files = build_items_with_size(&[(1, 2, 2), (3, 4, 3), (5, 6, 4), (7, 8, 10)]);
1082        let result = merge_seq_files(&files, Some(10));
1083        assert_eq!(result.len(), 3); // Should merge the first 3 files (total size 9)
1084
1085        // Test amplification factor prioritization
1086        // Two possible merges: [5, 5] (amp factor 0.5) vs [10, 1, 1] (amp factor 0.83)
1087        let files =
1088            build_items_with_size(&[(1, 2, 5), (3, 4, 5), (5, 6, 10), (7, 8, 1), (9, 10, 1)]);
1089        let result = merge_seq_files(&files, Some(12));
1090        assert_eq!(result.len(), 2);
1091        assert_eq!(result[0].size, 5);
1092        assert_eq!(result[1].size, 5);
1093
1094        // Test with large file preventing merges
1095        let files = build_items_with_size(&[(1, 2, 100), (3, 4, 1), (5, 6, 1), (7, 8, 1)]);
1096        let result = merge_seq_files(&files, Some(10));
1097        assert_eq!(result.len(), 3); // Should merge the last 3 small files
1098        assert_eq!(result[0].size, 1);
1099        assert_eq!(result[1].size, 1);
1100        assert_eq!(result[2].size, 1);
1101
1102        let files = build_items_with_size(&[(1, 2, 100), (3, 4, 20), (5, 6, 20), (7, 8, 20)]);
1103        let result = merge_seq_files(&files, Some(200));
1104        assert_eq!(result.len(), 4);
1105
1106        let files = build_items_with_size(&[(1, 2, 160), (3, 4, 20), (5, 6, 20), (7, 8, 20)]);
1107        let result = merge_seq_files(&files, None);
1108        assert_eq!(result.len(), 3);
1109        assert_eq!(result[0].size, 20);
1110        assert_eq!(result[1].size, 20);
1111        assert_eq!(result[2].size, 20);
1112
1113        let files = build_items_with_size(&[(1, 2, 100), (3, 4, 1)]);
1114        let result = merge_seq_files(&files, Some(200));
1115        assert_eq!(result.len(), 2);
1116        assert_eq!(result[0].size, 100);
1117        assert_eq!(result[1].size, 1);
1118
1119        let files = build_items_with_size(&[(1, 2, 20), (3, 4, 20), (5, 6, 20), (7, 8, 20)]);
1120        let result = merge_seq_files(&files, Some(40));
1121        assert_eq!(result.len(), 2);
1122        assert_eq!(result[0].start, 1);
1123        assert_eq!(result[1].start, 3);
1124    }
1125}