1use bytes::{Buf, Bytes};
19use common_base::BitVec;
20use common_base::readable_size::ReadableSize;
21use common_time::Timestamp;
22use smallvec::{SmallVec, smallvec};
23
24use crate::sst::file::{FileHandle, RegionFileId};
25
/// Upper bound, in bytes, applied to the automatically derived target output
/// size in [`merge_seq_files`] when the caller passes no `max_file_size`.
/// The value is whatever `ReadableSize::gb(2)` reports as bytes.
const DEFAULT_MAX_OUTPUT_SIZE: u64 = ReadableSize::gb(2).as_bytes();
28
/// Something that covers a `(start, end)` span over an ordered, copyable bound type.
pub trait Ranged {
    /// Type of both bounds of the span.
    type BoundType: Ord + Copy;

    /// Returns the `(start, end)` bounds of this value.
    fn range(&self) -> (Self::BoundType, Self::BoundType);

    /// Returns `true` when the two spans intersect in more than a single point,
    /// i.e. spans that merely touch at a boundary do NOT overlap.
    fn overlap(&self, other: &Self) -> bool {
        let (self_start, self_end) = self.range();
        let (other_start, other_end) = other.range();

        // The intersection is `[latest_start, earliest_end]`; it must be non-degenerate.
        let latest_start = std::cmp::max(self_start, other_start);
        let earliest_end = std::cmp::min(self_end, other_end);
        latest_start < earliest_end
    }

    /// Returns `true` when the two spans intersect, counting a shared boundary
    /// point as an intersection.
    fn overlap_inclusive(&self, other: &Self) -> bool {
        let (self_start, self_end) = self.range();
        let (other_start, other_end) = other.range();

        let latest_start = std::cmp::max(self_start, other_start);
        let earliest_end = std::cmp::min(self_end, other_end);
        latest_start <= earliest_end
    }
}
52
53pub(crate) fn primary_key_ranges_overlap(lhs: &(Bytes, Bytes), rhs: &(Bytes, Bytes)) -> bool {
54 lhs.0.chunk().max(rhs.0.chunk()) <= lhs.1.chunk().min(rhs.1.chunk())
55}
56
57pub(crate) fn merge_primary_key_ranges(
58 lhs: Option<(Bytes, Bytes)>,
59 rhs: Option<(Bytes, Bytes)>,
60) -> Option<(Bytes, Bytes)> {
61 match (lhs, rhs) {
62 (Some((lhs_min, lhs_max)), Some((rhs_min, rhs_max))) => {
63 Some((lhs_min.min(rhs_min), lhs_max.max(rhs_max)))
64 }
65 _ => None,
66 }
67}
68
69pub fn find_overlapping_items<T: Item + Clone>(
70 l: &mut SortedRun<T>,
71 r: &mut SortedRun<T>,
72 result: &mut Vec<T>,
73) {
74 if l.items.is_empty() || r.items.is_empty() {
75 return;
76 }
77
78 result.clear();
79 result.reserve(l.items.len() + r.items.len());
80
81 if !l.sorted {
83 sort_ranged_items(&mut l.items);
84 l.sorted = true;
85 }
86 if !r.sorted {
87 sort_ranged_items(&mut r.items);
88 r.sorted = true;
89 }
90
91 let mut r_idx = 0;
92
93 let mut selected = BitVec::repeat(false, r.items().len() + l.items.len());
94
95 for (lhs_idx, lhs) in l.items.iter().enumerate() {
96 let (lhs_start, lhs_end) = lhs.range();
97
98 while r_idx < r.items.len() {
100 let (_, rhs_end) = r.items[r_idx].range();
101 if rhs_end < lhs_start {
102 r_idx += 1;
103 } else {
104 break;
105 }
106 }
107
108 let mut j = r_idx;
110 while j < r.items.len() {
111 let (rhs_start, _rhs_end) = r.items[j].range();
112
113 if rhs_start > lhs_end {
115 break;
116 }
117
118 if lhs.overlap_inclusive(&r.items[j]) {
120 if !selected[lhs_idx] {
121 result.push(lhs.clone());
122 selected.set(lhs_idx, true);
123 }
124
125 let rhs_selected_idx = l.items.len() + j;
126 if !selected[rhs_selected_idx] {
127 result.push(r.items[j].clone());
128 selected.set(rhs_selected_idx, true);
129 }
130 }
131
132 j += 1;
133 }
134 }
135}
136
137fn sort_ranged_items<T: Ranged>(values: &mut [T]) {
139 values.sort_unstable_by(|l, r| {
140 let (l_start, l_end) = l.range();
141 let (r_start, r_end) = r.range();
142 l_start.cmp(&r_start).then(r_end.cmp(&l_end))
143 });
144}
145
/// A cloneable [`Ranged`] element that also reports a size, which the functions
/// in this module use as the cost of including the element in a merge.
pub trait Item: Ranged + Clone {
    /// Returns the size of this item.
    fn size(&self) -> usize;
}
151
/// A set of files handled as one [`Item`], together with statistics aggregated
/// over all member files.
#[derive(Debug, Clone)]
pub struct FileGroup {
    /// Member files of the group.
    files: SmallVec<[FileHandle; 2]>,
    /// Sum of the sizes of all member files.
    size: usize,
    /// Sum of the row counts of all member files.
    num_rows: usize,
    /// Smallest start timestamp among the member files.
    min_timestamp: Timestamp,
    /// Largest end timestamp among the member files.
    max_timestamp: Timestamp,
    /// `(min, max)` primary-key range covering all member files, or `None` when
    /// at least one member file does not report a range.
    primary_key_range: Option<(Bytes, Bytes)>,
}
162
163impl FileGroup {
164 pub(crate) fn new_with_file(file: FileHandle) -> Self {
165 let size = file.size() as usize;
166 let (min_timestamp, max_timestamp) = file.time_range();
167 let num_rows = file.num_rows();
168 let primary_key_range = file.primary_key_range();
169 Self {
170 files: smallvec![file],
171 size,
172 num_rows,
173 min_timestamp,
174 max_timestamp,
175 primary_key_range,
176 }
177 }
178
179 pub(crate) fn num_rows(&self) -> usize {
180 self.num_rows
181 }
182
183 pub(crate) fn add_file(&mut self, file: FileHandle) {
184 self.size += file.size() as usize;
185 self.num_rows += file.num_rows();
186 let (min_timestamp, max_timestamp) = file.time_range();
187 self.min_timestamp = self.min_timestamp.min(min_timestamp);
188 self.max_timestamp = self.max_timestamp.max(max_timestamp);
189 self.primary_key_range =
190 merge_primary_key_ranges(self.primary_key_range.take(), file.primary_key_range());
191 self.files.push(file);
192 }
193
194 pub(crate) fn num_files(&self) -> usize {
195 self.files.len()
196 }
197
198 #[cfg(test)]
199 pub(crate) fn files(&self) -> &[FileHandle] {
200 &self.files[..]
201 }
202
203 pub(crate) fn file_ids(&self) -> SmallVec<[RegionFileId; 2]> {
204 SmallVec::from_iter(self.files.iter().map(|f| f.file_id()))
205 }
206
207 pub(crate) fn into_files(self) -> impl Iterator<Item = FileHandle> {
208 self.files.into_iter()
209 }
210}
211
212impl Ranged for FileGroup {
213 type BoundType = Timestamp;
214
215 fn range(&self) -> (Self::BoundType, Self::BoundType) {
216 (self.min_timestamp, self.max_timestamp)
217 }
218
219 fn overlap(&self, other: &Self) -> bool {
220 let (lhs_start, lhs_end) = self.range();
221 let (rhs_start, rhs_end) = other.range();
222 if lhs_start.max(rhs_start) >= lhs_end.min(rhs_end) {
223 return false;
224 }
225
226 match (&self.primary_key_range, &other.primary_key_range) {
227 (Some(lhs), Some(rhs)) => primary_key_ranges_overlap(lhs, rhs),
228 _ => true,
229 }
230 }
231
232 fn overlap_inclusive(&self, other: &Self) -> bool {
233 let (lhs_start, lhs_end) = self.range();
234 let (rhs_start, rhs_end) = other.range();
235 if lhs_start.max(rhs_start) > lhs_end.min(rhs_end) {
236 return false;
237 }
238
239 match (&self.primary_key_range, &other.primary_key_range) {
240 (Some(lhs), Some(rhs)) => primary_key_ranges_overlap(lhs, rhs),
241 _ => true,
242 }
243 }
244}
245
impl Item for FileGroup {
    /// Returns the accumulated size of all files in the group.
    fn size(&self) -> usize {
        self.size
    }
}
251
/// A collection of items together with aggregates (total size and overall
/// range) that are maintained as items are pushed.
#[derive(Debug, Clone)]
pub struct SortedRun<T: Item> {
    /// Items that belong to this run.
    items: Vec<T>,
    /// Sum of the sizes of all items in the run.
    size: usize,
    /// Smallest range start among the items, `None` while the run is empty.
    start: Option<T::BoundType>,
    /// Largest range end among the items, `None` while the run is empty.
    end: Option<T::BoundType>,
    /// Set once `items` has been sorted by [`find_overlapping_items`], so the
    /// sort is not repeated on later calls.
    sorted: bool,
}
266
267impl<T: Item> From<Vec<T>> for SortedRun<T> {
268 fn from(items: Vec<T>) -> Self {
269 let mut r = Self {
270 items: Vec::with_capacity(items.len()),
271 size: 0,
272 start: None,
273 end: None,
274 sorted: false,
275 };
276 for item in items {
277 r.push_item(item);
278 }
279
280 r
281 }
282}
283
284impl<T> Default for SortedRun<T>
285where
286 T: Item,
287{
288 fn default() -> Self {
289 Self {
290 items: vec![],
291 size: 0,
292 start: None,
293 end: None,
294 sorted: false,
295 }
296 }
297}
298
299impl<T> SortedRun<T>
300where
301 T: Item,
302{
303 pub fn items(&self) -> &[T] {
304 &self.items
305 }
306
307 fn push_item(&mut self, t: T) {
308 let (file_start, file_end) = t.range();
309 self.size += t.size();
310 self.items.push(t);
311 self.start = Some(self.start.map_or(file_start, |v| v.min(file_start)));
312 self.end = Some(self.end.map_or(file_end, |v| v.max(file_end)));
313 }
314}
315
316pub fn find_sorted_runs<T>(items: &mut [T]) -> Vec<SortedRun<T>>
318where
319 T: Item,
320{
321 if items.is_empty() {
322 return vec![];
323 }
324 sort_ranged_items(items);
326
327 let mut current_run = SortedRun::default();
328 let mut runs = vec![];
329 let mut active_run_item_indices = Vec::new();
330
331 let mut selection = BitVec::repeat(false, items.len());
332 while !selection.all() {
333 let mut last_pruned_start = None;
335 for (item, mut selected) in items.iter().zip(selection.iter_mut()) {
336 if *selected {
337 continue;
339 }
340 if current_run.items.is_empty() {
341 selected.set(true);
343 current_run.push_item(item.clone());
344 active_run_item_indices.push(current_run.items.len() - 1);
345 } else {
346 let (item_start, _) = item.range();
350 if last_pruned_start != Some(item_start) {
351 active_run_item_indices.retain(|idx| {
352 let (_, run_item_end) = current_run.items[*idx].range();
353 run_item_end > item_start
354 });
355 last_pruned_start = Some(item_start);
356 }
357
358 let mut overlaps_any = false;
359 for idx in &active_run_item_indices {
360 let run_item = ¤t_run.items[*idx];
361 if run_item.overlap(item) {
362 overlaps_any = true;
363 break;
364 }
365 }
366 if !overlaps_any {
367 selected.set(true);
369 let item_idx = current_run.items.len();
370 current_run.push_item(item.clone());
371 active_run_item_indices.push(item_idx);
372 }
373 }
374 }
375 runs.push(std::mem::take(&mut current_run));
377 active_run_item_indices.clear();
378 }
379 runs
380}
381
/// Straightforward reference implementation of [`find_sorted_runs`]: every
/// candidate is checked against ALL items already in the current run, with no
/// pruning. Only compiled for tests / test features.
#[cfg(any(test, feature = "test", feature = "testing"))]
pub fn find_sorted_runs_original<T>(items: &mut [T]) -> Vec<SortedRun<T>>
where
    T: Item,
{
    if items.is_empty() {
        return vec![];
    }
    sort_ranged_items(items);

    let mut runs = vec![];
    let mut current_run = SortedRun::default();
    let mut selection = BitVec::repeat(false, items.len());

    while !selection.all() {
        for (item, mut selected) in items.iter().zip(selection.iter_mut()) {
            if *selected {
                continue;
            }

            // Vacuously true for an empty run, so the first unassigned item
            // always opens the run.
            let fits_run = current_run
                .items
                .iter()
                .all(|existing| !existing.overlap(item));
            if fits_run {
                selected.set(true);
                current_run.push_item(item.clone());
            }
        }
        runs.push(std::mem::take(&mut current_run));
    }
    runs
}
425
426pub fn reduce_runs<T: Item>(mut runs: Vec<SortedRun<T>>) -> Vec<T> {
429 assert!(runs.len() > 1);
430 runs.sort_unstable_by_key(|a| a.size);
432 let probe_end = runs.len().min(100);
434 let mut min_penalty = usize::MAX;
435 let mut files = vec![];
436 let mut temp_files = vec![];
437 for i in 0..probe_end {
438 for j in i + 1..probe_end {
439 let (a, b) = runs.split_at_mut(j);
440 find_overlapping_items(&mut a[i], &mut b[0], &mut temp_files);
441 let penalty = temp_files.iter().map(|e| e.size()).sum();
442 if penalty < min_penalty {
443 min_penalty = penalty;
444 files.clear();
445 files.extend_from_slice(&temp_files);
446 }
447 }
448 }
449 files
450}
451
452pub fn merge_seq_files<T: Item>(input_files: &[T], max_file_size: Option<u64>) -> Vec<T> {
473 if input_files.is_empty() || input_files.len() == 1 {
474 return vec![];
475 }
476
477 let files_to_process = if input_files.len() > 100 {
479 &input_files[0..100]
480 } else {
481 input_files
482 };
483
484 let target_size = match max_file_size {
486 Some(size) => size as usize,
487 None => {
488 let total_size: usize = files_to_process.iter().map(|f| f.size()).sum();
490 ((((total_size as f64) / (files_to_process.len() as f64)) * 1.5) as usize)
491 .min(DEFAULT_MAX_OUTPUT_SIZE as usize)
492 }
493 };
494
495 let mut best_group = Vec::new();
497 let mut best_score = f64::NEG_INFINITY;
498
499 for start_idx in (0..files_to_process.len()).rev() {
501 for end_idx in (start_idx + 1..files_to_process.len()).rev() {
503 let group = &files_to_process[start_idx..=end_idx];
504 let total_size: usize = group.iter().map(|f| f.size()).sum();
505
506 if total_size > target_size {
508 continue; }
510
511 let largest_file_size = group.iter().map(|f| f.size()).max().unwrap_or(0);
513 let amplification_factor = largest_file_size as f64 / total_size as f64;
514
515 let file_reduction = group.len() - 1;
517
518 let file_reduction_score = file_reduction as f64 / files_to_process.len() as f64;
523 let amp_factor_score = (1.0 - amplification_factor) * 1.5; let size_efficiency = (total_size as f64 / target_size as f64).min(1.0); let score = file_reduction_score + amp_factor_score + size_efficiency;
527
528 if score >= best_score {
531 best_score = score;
532 best_group = group.to_vec();
533 }
534 }
535 }
536
537 best_group
538}
539
/// Unit tests for run detection, run reduction, overlap search and sequential
/// file merging, using a lightweight mock item plus real `FileGroup`s.
#[cfg(test)]
mod tests {
    use std::collections::HashSet;

    use bytes::Bytes;
    use store_api::storage::FileId;

    use super::*;
    use crate::compaction::test_util::new_file_handle_with_size_sequence_and_primary_key_range;

    /// Minimal [`Item`] with an integer range and an explicit size.
    #[derive(Clone, Debug, PartialEq)]
    struct MockFile {
        start: i64,
        end: i64,
        size: usize,
    }

    impl Ranged for MockFile {
        type BoundType = i64;

        fn range(&self) -> (Self::BoundType, Self::BoundType) {
            (self.start, self.end)
        }
    }

    impl Item for MockFile {
        fn size(&self) -> usize {
            self.size
        }
    }

    /// Builds mock files from `(start, end)` pairs; the size of each file is the
    /// length of its range (`end - start`).
    fn build_items(ranges: &[(i64, i64)]) -> Vec<MockFile> {
        ranges
            .iter()
            .map(|(start, end)| MockFile {
                start: *start,
                end: *end,
                size: (*end - *start) as usize,
            })
            .collect()
    }

    /// Builds mock files from `(start, end, size)` triples.
    fn build_items_with_size(items: &[(i64, i64, usize)]) -> Vec<MockFile> {
        items
            .iter()
            .map(|(start, end, size)| MockFile {
                start: *start,
                end: *end,
                size: *size,
            })
            .collect()
    }

    /// Builds a `Some((min, max))` primary-key range from static byte strings.
    fn pk_range(min: &'static [u8], max: &'static [u8]) -> Option<(Bytes, Bytes)> {
        Some((Bytes::from_static(min), Bytes::from_static(max)))
    }

    /// Runs `find_sorted_runs` on files built from `ranges`, asserts that the
    /// ranges of the resulting runs equal `expected_runs`, and returns the runs.
    fn check_sorted_runs(
        ranges: &[(i64, i64)],
        expected_runs: &[Vec<(i64, i64)>],
    ) -> Vec<SortedRun<MockFile>> {
        let mut files = build_items(ranges);
        let runs = find_sorted_runs(&mut files);

        let result_file_ranges: Vec<Vec<_>> = runs
            .iter()
            .map(|r| r.items.iter().map(|f| f.range()).collect())
            .collect();
        assert_eq!(&expected_runs, &result_file_ranges);
        runs
    }

    /// Flattens each run into `[start0, end0, start1, end1, ..]` so that runs of
    /// any item type can be compared by their bounds only.
    fn sorted_run_ranges<T: Item>(runs: &[SortedRun<T>]) -> Vec<Vec<T::BoundType>> {
        runs.iter()
            .map(|r| {
                r.items
                    .iter()
                    .flat_map(|f| {
                        let (start, end) = f.range();
                        [start, end]
                    })
                    .collect()
            })
            .collect()
    }

    /// Asserts that `find_sorted_runs` and `find_sorted_runs_original` produce
    /// the same runs for `ranges`.
    fn check_find_sorted_runs_consistency(ranges: &[(i64, i64)]) {
        let mut files = build_items(ranges);
        let mut files_for_original = files.clone();

        let runs = find_sorted_runs(&mut files);
        let original_runs = find_sorted_runs_original(&mut files_for_original);

        assert_eq!(sorted_run_ranges(&original_runs), sorted_run_ranges(&runs));
    }

    #[test]
    fn test_find_sorted_runs() {
        // No input, no runs.
        check_sorted_runs(&[], &[]);
        // Zero-length ranges that do not touch share one run.
        check_sorted_runs(&[(1, 1), (2, 2)], &[vec![(1, 1), (2, 2)]]);
        check_sorted_runs(&[(1, 2)], &[vec![(1, 2)]]);
        // Ranges that only touch at a boundary are not overlapping.
        check_sorted_runs(&[(1, 2), (2, 3)], &[vec![(1, 2), (2, 3)]]);
        check_sorted_runs(&[(1, 2), (3, 4)], &[vec![(1, 2), (3, 4)]]);
        // Input order does not matter: items are sorted before runs are built.
        check_sorted_runs(&[(2, 4), (1, 3)], &[vec![(1, 3)], vec![(2, 4)]]);
        check_sorted_runs(
            &[(1, 3), (2, 4), (4, 5)],
            &[vec![(1, 3), (4, 5)], vec![(2, 4)]],
        );

        // For equal starts the wider range is placed first.
        check_sorted_runs(
            &[(1, 2), (3, 4), (3, 5)],
            &[vec![(1, 2), (3, 5)], vec![(3, 4)]],
        );

        check_sorted_runs(
            &[(1, 3), (2, 4), (5, 6)],
            &[vec![(1, 3), (5, 6)], vec![(2, 4)]],
        );

        check_sorted_runs(
            &[(1, 2), (3, 5), (4, 6)],
            &[vec![(1, 2), (3, 5)], vec![(4, 6)]],
        );

        check_sorted_runs(
            &[(1, 2), (3, 4), (4, 6), (7, 8)],
            &[vec![(1, 2), (3, 4), (4, 6), (7, 8)]],
        );
        check_sorted_runs(
            &[(1, 2), (3, 4), (5, 6), (3, 6), (7, 8), (8, 9)],
            &[vec![(1, 2), (3, 6), (7, 8), (8, 9)], vec![(3, 4), (5, 6)]],
        );

        check_sorted_runs(
            &[(10, 19), (20, 21), (20, 29), (30, 39)],
            &[vec![(10, 19), (20, 29), (30, 39)], vec![(20, 21)]],
        );

        check_sorted_runs(
            &[(10, 19), (20, 29), (21, 22), (30, 39), (31, 32), (32, 42)],
            &[
                vec![(10, 19), (20, 29), (30, 39)],
                vec![(21, 22), (31, 32), (32, 42)],
            ],
        );
    }

    #[test]
    fn test_find_sorted_runs_matches_original_impl() {
        // The pruning implementation must agree with the reference one on every
        // case, including a shuffled input (last entry).
        for ranges in [
            &[][..],
            &[(1, 1), (2, 2)],
            &[(1, 2), (2, 3)],
            &[(2, 4), (1, 3)],
            &[(1, 3), (2, 4), (4, 5)],
            &[(1, 2), (3, 4), (3, 5)],
            &[(1, 3), (2, 4), (5, 6)],
            &[(1, 2), (3, 5), (4, 6)],
            &[(1, 2), (3, 4), (4, 6), (7, 8)],
            &[(1, 2), (3, 4), (5, 6), (3, 6), (7, 8), (8, 9)],
            &[(10, 19), (20, 21), (20, 29), (30, 39)],
            &[(10, 19), (20, 29), (21, 22), (30, 39), (31, 32), (32, 42)],
            &[(32, 42), (10, 19), (31, 32), (20, 29), (21, 22), (30, 39)],
        ] {
            check_find_sorted_runs_consistency(ranges);
        }
    }

    /// Checks the runs found for `files` against `expected_runs`, then checks
    /// that `reduce_runs` selects exactly the files in `expected` (compared as a
    /// set). With a single run (or none) there is nothing to reduce, so
    /// `expected` must be empty.
    fn check_reduce_runs(
        files: &[(i64, i64)],
        expected_runs: &[Vec<(i64, i64)>],
        expected: &[(i64, i64)],
    ) {
        let runs = check_sorted_runs(files, expected_runs);
        if runs.len() <= 1 {
            assert!(expected.is_empty());
            return;
        }
        let files_to_merge = reduce_runs(runs);
        let file_to_merge_timestamps = files_to_merge
            .into_iter()
            .map(|f| (f.start, f.end))
            .collect::<HashSet<_>>();

        let expected = expected.iter().cloned().collect::<HashSet<_>>();
        assert_eq!(&expected, &file_to_merge_timestamps);
    }

    #[test]
    fn test_reduce_runs() {
        // Two runs; only (1, 3) and (2, 4) overlap across them.
        check_reduce_runs(
            &[(1, 3), (2, 4), (5, 6)],
            &[vec![(1, 3), (5, 6)], vec![(2, 4)]],
            &[(1, 3), (2, 4)],
        );

        check_reduce_runs(
            &[(1, 2), (3, 5), (4, 6)],
            &[vec![(1, 2), (3, 5)], vec![(4, 6)]],
            &[(3, 5), (4, 6)],
        );

        // A single run: nothing to reduce.
        check_reduce_runs(
            &[(1, 2), (3, 4), (4, 6), (7, 8)],
            &[vec![(1, 2), (3, 4), (4, 6), (7, 8)]],
            &[],
        );

        check_reduce_runs(
            &[(1, 2), (3, 4), (5, 6), (3, 6), (7, 8), (8, 9)],
            &[vec![(1, 2), (3, 6), (7, 8), (8, 9)], vec![(3, 4), (5, 6)]],
            &[(5, 6), (3, 4), (3, 6)],
        );

        // Same case as above; the expected files are compared as a set, so
        // their order is irrelevant.
        check_reduce_runs(
            &[(1, 2), (3, 4), (5, 6), (3, 6), (7, 8), (8, 9)],
            &[vec![(1, 2), (3, 6), (7, 8), (8, 9)], vec![(3, 4), (5, 6)]],
            &[(3, 4), (3, 6), (5, 6)],
        );

        // Overlap search is inclusive, so (80, 90) also selects (50, 80), which
        // it only touches at 80.
        check_reduce_runs(
            &[
                (10, 20),
                (30, 40),
                (50, 60),
                (50, 80),
                (80, 90),
                (80, 100),
                (100, 110),
            ],
            &[
                vec![(10, 20), (30, 40), (50, 80), (80, 100), (100, 110)],
                vec![(50, 60), (80, 90)],
            ],
            &[(50, 80), (80, 100), (50, 60), (80, 90)],
        );

        // Four single-item runs that all overlap each other: the pair with the
        // two smallest sizes is the cheapest to merge.
        check_reduce_runs(
            &[(0, 10), (0, 11), (0, 12), (0, 13)],
            &[vec![(0, 13)], vec![(0, 12)], vec![(0, 11)], vec![(0, 10)]],
            &[(0, 10), (0, 11)],
        );
    }

    #[test]
    fn test_find_overlapping_items() {
        let mut result = Vec::new();

        // Both runs empty.
        find_overlapping_items(
            &mut SortedRun::from(Vec::<MockFile>::new()),
            &mut SortedRun::from(Vec::<MockFile>::new()),
            &mut result,
        );
        assert_eq!(result, Vec::<MockFile>::new());

        // Right run empty.
        let files1 = build_items(&[(1, 3)]);
        find_overlapping_items(
            &mut SortedRun::from(files1.clone()),
            &mut SortedRun::from(Vec::<MockFile>::new()),
            &mut result,
        );
        assert_eq!(result, Vec::<MockFile>::new());

        // Left run empty.
        find_overlapping_items(
            &mut SortedRun::from(Vec::<MockFile>::new()),
            &mut SortedRun::from(files1.clone()),
            &mut result,
        );
        assert_eq!(result, Vec::<MockFile>::new());

        // Disjoint runs.
        let files1 = build_items(&[(1, 3), (5, 7)]);
        let files2 = build_items(&[(10, 12), (15, 20)]);
        find_overlapping_items(
            &mut SortedRun::from(files1),
            &mut SortedRun::from(files2),
            &mut result,
        );
        assert_eq!(result, Vec::<MockFile>::new());

        // A single overlapping pair; the left item is reported first.
        let files1 = build_items(&[(1, 5)]);
        let files2 = build_items(&[(3, 7)]);
        find_overlapping_items(
            &mut SortedRun::from(files1),
            &mut SortedRun::from(files2),
            &mut result,
        );
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].range(), (1, 5));
        assert_eq!(result[1].range(), (3, 7));

        // Every item overlaps something in the other run.
        let files1 = build_items(&[(1, 5), (8, 12), (15, 20)]);
        let files2 = build_items(&[(3, 6), (7, 10), (18, 25)]);
        find_overlapping_items(
            &mut SortedRun::from(files1),
            &mut SortedRun::from(files2),
            &mut result,
        );
        assert_eq!(result.len(), 6);

        // Ranges touching at a boundary count as overlapping here.
        let files1 = build_items(&[(1, 5)]);
        let files2 = build_items(&[(5, 10)]);
        find_overlapping_items(
            &mut SortedRun::from(files1),
            &mut SortedRun::from(files2),
            &mut result,
        );
        assert_eq!(result.len(), 2);

        // One range fully contains the other.
        let files1 = build_items(&[(1, 10)]);
        let files2 = build_items(&[(3, 7)]);
        find_overlapping_items(
            &mut SortedRun::from(files1),
            &mut SortedRun::from(files2),
            &mut result,
        );
        assert_eq!(result.len(), 2);

        // Identical ranges.
        let files1 = build_items(&[(1, 5)]);
        let files2 = build_items(&[(1, 5)]);
        find_overlapping_items(
            &mut SortedRun::from(files1),
            &mut SortedRun::from(files2),
            &mut result,
        );
        assert_eq!(result.len(), 2);

        // Unsorted left run: it is sorted internally before the search.
        let files1 = build_items(&[(5, 10), (1, 3)]);
        let files2 = build_items(&[(2, 7), (8, 12)]);
        find_overlapping_items(
            &mut SortedRun::from(files1),
            &mut SortedRun::from(files2),
            &mut result,
        );
        assert_eq!(result.len(), 4);
    }

    // NOTE(review): in the tests below the 2nd and 3rd arguments of
    // `new_file_handle_with_size_sequence_and_primary_key_range` are presumably
    // the start/end timestamps of the file; confirm against the helper.

    #[test]
    fn test_file_group_overlap_time_overlap_pk_disjoint() {
        let lhs =
            FileGroup::new_with_file(new_file_handle_with_size_sequence_and_primary_key_range(
                FileId::random(),
                0,
                100,
                0,
                1,
                10,
                pk_range(b"a", b"f"),
            ));
        let rhs =
            FileGroup::new_with_file(new_file_handle_with_size_sequence_and_primary_key_range(
                FileId::random(),
                50,
                150,
                0,
                2,
                10,
                pk_range(b"x", b"z"),
            ));

        // Disjoint primary-key ranges prevent an overlap.
        assert!(!lhs.overlap(&rhs));
    }

    #[test]
    fn test_find_sorted_runs_collapses_pk_disjoint_files_into_one_run() {
        let mut files = vec![
            FileGroup::new_with_file(new_file_handle_with_size_sequence_and_primary_key_range(
                FileId::random(),
                0,
                100,
                0,
                1,
                10,
                pk_range(b"a", b"f"),
            )),
            FileGroup::new_with_file(new_file_handle_with_size_sequence_and_primary_key_range(
                FileId::random(),
                50,
                150,
                0,
                2,
                10,
                pk_range(b"x", b"z"),
            )),
        ];

        let runs = find_sorted_runs(&mut files);

        // Both files fit into a single run because they do not overlap.
        assert_eq!(1, runs.len());
        assert_eq!(2, runs[0].items().len());
    }

    #[test]
    fn test_find_sorted_runs_handles_2d_transitivity_break() {
        // The 1st and 2nd files do not overlap (disjoint keys), but the 3rd one
        // has the same key range as the 1st, so it needs a second run.
        let mut files = vec![
            FileGroup::new_with_file(new_file_handle_with_size_sequence_and_primary_key_range(
                FileId::random(),
                0,
                100,
                0,
                1,
                10,
                pk_range(b"a", b"f"),
            )),
            FileGroup::new_with_file(new_file_handle_with_size_sequence_and_primary_key_range(
                FileId::random(),
                50,
                150,
                0,
                2,
                10,
                pk_range(b"x", b"z"),
            )),
            FileGroup::new_with_file(new_file_handle_with_size_sequence_and_primary_key_range(
                FileId::random(),
                50,
                150,
                0,
                3,
                10,
                pk_range(b"a", b"f"),
            )),
        ];

        let runs = find_sorted_runs(&mut files);

        assert_eq!(2, runs.len());
        assert_eq!(2, runs[0].items().len());
        assert_eq!(1, runs[1].items().len());
    }

    #[test]
    fn test_find_overlapping_items_skips_pk_disjoint_pairs() {
        let mut left = SortedRun::from(vec![FileGroup::new_with_file(
            new_file_handle_with_size_sequence_and_primary_key_range(
                FileId::random(),
                0,
                100,
                0,
                1,
                10,
                pk_range(b"a", b"f"),
            ),
        )]);
        let mut right = SortedRun::from(vec![FileGroup::new_with_file(
            new_file_handle_with_size_sequence_and_primary_key_range(
                FileId::random(),
                50,
                150,
                0,
                2,
                10,
                pk_range(b"x", b"z"),
            ),
        )]);
        let mut result = Vec::new();

        find_overlapping_items(&mut left, &mut right, &mut result);

        // Disjoint primary-key ranges: nothing is selected.
        assert!(result.is_empty());
    }

    #[test]
    fn test_file_group_touching_time_boundary_with_same_pk_is_not_overlap() {
        let lhs =
            FileGroup::new_with_file(new_file_handle_with_size_sequence_and_primary_key_range(
                FileId::random(),
                0,
                100,
                0,
                1,
                10,
                pk_range(b"a", b"f"),
            ));
        let rhs =
            FileGroup::new_with_file(new_file_handle_with_size_sequence_and_primary_key_range(
                FileId::random(),
                100,
                150,
                0,
                2,
                10,
                pk_range(b"a", b"f"),
            ));

        // `overlap` is exclusive on the time axis: touching at 100 is not enough.
        assert!(!lhs.overlap(&rhs));
    }

    #[test]
    fn test_merge_seq_files() {
        // Empty input.
        let files = Vec::<MockFile>::new();
        assert_eq!(merge_seq_files(&files, None), Vec::<MockFile>::new());

        // A single file cannot be merged with anything.
        let files = build_items(&[(1, 5)]);
        assert_eq!(merge_seq_files(&files, None), Vec::<MockFile>::new());

        // Derived target size: the large file is left out, the small ones merge.
        let files = build_items_with_size(&[(1, 2, 10), (3, 4, 1), (5, 6, 1), (7, 8, 1)]);
        let result = merge_seq_files(&files, None);
        assert_eq!(result.len(), 3);
        assert_eq!(result[0].size, 1);
        assert_eq!(result[1].size, 1);
        assert_eq!(result[2].size, 1);

        // All files fit exactly into the explicit limit.
        let files = build_items_with_size(&[(1, 2, 5), (3, 4, 5), (5, 6, 5), (7, 8, 5)]);
        let result = merge_seq_files(&files, Some(20));
        assert_eq!(result.len(), 4);

        // The limit only allows two files at a time.
        let files = build_items_with_size(&[(1, 2, 5), (3, 4, 5), (5, 6, 5), (7, 8, 5)]);
        let result = merge_seq_files(&files, Some(10));
        assert_eq!(result.len(), 2);

        // The first three files add up to 9, which fits into the limit of 10.
        let files = build_items_with_size(&[(1, 2, 2), (3, 4, 3), (5, 6, 4), (7, 8, 10)]);
        let result = merge_seq_files(&files, Some(10));
        assert_eq!(result.len(), 3);

        // Two equally sized files beat windows dominated by one large file.
        let files =
            build_items_with_size(&[(1, 2, 5), (3, 4, 5), (5, 6, 10), (7, 8, 1), (9, 10, 1)]);
        let result = merge_seq_files(&files, Some(12));
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].size, 5);
        assert_eq!(result[1].size, 5);

        // A file larger than the limit is never part of the selection.
        let files = build_items_with_size(&[(1, 2, 100), (3, 4, 1), (5, 6, 1), (7, 8, 1)]);
        let result = merge_seq_files(&files, Some(10));
        assert_eq!(result.len(), 3);
        assert_eq!(result[0].size, 1);
        assert_eq!(result[1].size, 1);
        assert_eq!(result[2].size, 1);

        let files = build_items_with_size(&[(1, 2, 100), (3, 4, 20), (5, 6, 20), (7, 8, 20)]);
        let result = merge_seq_files(&files, Some(200));
        assert_eq!(result.len(), 4);

        let files = build_items_with_size(&[(1, 2, 160), (3, 4, 20), (5, 6, 20), (7, 8, 20)]);
        let result = merge_seq_files(&files, None);
        assert_eq!(result.len(), 3);
        assert_eq!(result[0].size, 20);
        assert_eq!(result[1].size, 20);
        assert_eq!(result[2].size, 20);

        let files = build_items_with_size(&[(1, 2, 100), (3, 4, 1)]);
        let result = merge_seq_files(&files, Some(200));
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].size, 100);
        assert_eq!(result[1].size, 1);

        // Equally scored windows: the earliest one is selected.
        let files = build_items_with_size(&[(1, 2, 20), (3, 4, 20), (5, 6, 20), (7, 8, 20)]);
        let result = merge_seq_files(&files, Some(40));
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].start, 1);
        assert_eq!(result[1].start, 3);
    }
}