Skip to main content

mito2/memtable/
bulk.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Memtable implementation for bulk load
16
17pub(crate) mod chunk_reader;
18#[allow(unused)]
19pub mod context;
20#[allow(unused)]
21pub mod part;
22pub mod part_reader;
23mod row_group_reader;
24
25use std::collections::{BTreeMap, HashSet};
26use std::sync::atomic::{AtomicI64, AtomicU64, AtomicUsize, Ordering};
27use std::sync::{Arc, LazyLock, Mutex, RwLock};
28use std::time::Instant;
29
/// Looks up the environment variable `name` and interprets it as a `usize`.
///
/// Returns `default` when the variable cannot be read (unset or not valid
/// Unicode) or when its value does not parse as a `usize`.
fn env_usize(name: &str, default: usize) -> usize {
    match std::env::var(name) {
        // The target type of `parse` is inferred from `default`.
        Ok(raw) => raw.parse().unwrap_or(default),
        Err(_) => default,
    }
}
37
38use common_time::Timestamp;
39use datatypes::arrow::datatypes::SchemaRef;
40use mito_codec::key_values::KeyValue;
41use rayon::prelude::*;
42use store_api::metadata::RegionMetadataRef;
43use store_api::storage::{ColumnId, FileId, RegionId, SequenceRange};
44use tokio::sync::Semaphore;
45
46use crate::error::{Result, UnsupportedOperationSnafu};
47use crate::flush::WriteBufferManagerRef;
48use crate::memtable::bulk::context::BulkIterContext;
49use crate::memtable::bulk::part::{
50    BulkPart, BulkPartEncodeMetrics, BulkPartEncoder, MultiBulkPart, UnorderedPart,
51};
52use crate::memtable::bulk::part_reader::BulkPartBatchIter;
53use crate::memtable::stats::WriteMetrics;
54use crate::memtable::{
55    AllocTracker, BoxedBatchIterator, BoxedRecordBatchIterator, EncodedBulkPart, EncodedRange,
56    IterBuilder, KeyValues, MemScanMetrics, Memtable, MemtableBuilder, MemtableId, MemtableRange,
57    MemtableRangeContext, MemtableRanges, MemtableRef, MemtableStats, RangesOptions,
58};
59use crate::read::flat_dedup::{FlatDedupIterator, FlatLastNonNull, FlatLastRow};
60use crate::read::flat_merge::FlatMergeIterator;
61use crate::region::options::MergeMode;
62use crate::sst::parquet::flat_format::field_column_start;
63use crate::sst::parquet::{DEFAULT_READ_BATCH_SIZE, DEFAULT_ROW_GROUP_SIZE};
64use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema};
65
/// Default number of not-yet-merging parts of one kind (raw or encoded) that
/// triggers a merge.
const DEFAULT_MERGE_THRESHOLD: usize = 16;

/// Threshold for triggering merge of parts. Configurable via `GREPTIME_BULK_MERGE_THRESHOLD`.
///
/// Initialized on first access. An unset or unparsable value falls back to
/// [`DEFAULT_MERGE_THRESHOLD`].
static MERGE_THRESHOLD: LazyLock<usize> =
    LazyLock::new(|| env_usize("GREPTIME_BULK_MERGE_THRESHOLD", DEFAULT_MERGE_THRESHOLD));

/// Default maximum number of groups for parallel merging.
const DEFAULT_MAX_MERGE_GROUPS: usize = 32;

/// Maximum merge groups. Configurable via `GREPTIME_BULK_MAX_MERGE_GROUPS`.
///
/// Initialized on first access. An unset or unparsable value falls back to
/// [`DEFAULT_MAX_MERGE_GROUPS`].
static MAX_MERGE_GROUPS: LazyLock<usize> =
    LazyLock::new(|| env_usize("GREPTIME_BULK_MAX_MERGE_GROUPS", DEFAULT_MAX_MERGE_GROUPS));

/// Row threshold for encoding parts. Configurable via `GREPTIME_BULK_ENCODE_ROW_THRESHOLD`.
/// When estimated rows exceed this threshold, parts are encoded as EncodedBulkPart.
///
/// Defaults to ten times `DEFAULT_ROW_GROUP_SIZE`.
pub(crate) static ENCODE_ROW_THRESHOLD: LazyLock<usize> = LazyLock::new(|| {
    env_usize(
        "GREPTIME_BULK_ENCODE_ROW_THRESHOLD",
        10 * DEFAULT_ROW_GROUP_SIZE,
    )
});

/// Default bytes threshold for encoding (64 MiB).
const DEFAULT_ENCODE_BYTES_THRESHOLD: usize = 64 * 1024 * 1024;

/// Bytes threshold for encoding parts. Configurable via `GREPTIME_BULK_ENCODE_BYTES_THRESHOLD`.
/// When estimated bytes exceed this threshold, parts are encoded as EncodedBulkPart.
///
/// Initialized on first access. An unset or unparsable value falls back to
/// [`DEFAULT_ENCODE_BYTES_THRESHOLD`].
static ENCODE_BYTES_THRESHOLD: LazyLock<usize> = LazyLock::new(|| {
    env_usize(
        "GREPTIME_BULK_ENCODE_BYTES_THRESHOLD",
        DEFAULT_ENCODE_BYTES_THRESHOLD,
    )
});
100
/// Configuration for bulk memtable.
///
/// The `Default` implementation fills every field from the environment-driven
/// statics declared in this module.
#[derive(Debug, Clone)]
pub struct BulkMemtableConfig {
    /// Threshold for triggering merge of parts. Also used as the maximum
    /// number of parts placed in one merge group.
    pub merge_threshold: usize,
    /// Row threshold for encoding parts.
    pub encode_row_threshold: usize,
    /// Bytes threshold for encoding parts.
    pub encode_bytes_threshold: usize,
    /// Maximum number of groups for parallel merging.
    pub max_merge_groups: usize,
}
113
114impl Default for BulkMemtableConfig {
115    fn default() -> Self {
116        Self {
117            merge_threshold: *MERGE_THRESHOLD,
118            encode_row_threshold: *ENCODE_ROW_THRESHOLD,
119            encode_bytes_threshold: *ENCODE_BYTES_THRESHOLD,
120            max_merge_groups: *MAX_MERGE_GROUPS,
121        }
122    }
123}
124
/// Result of merging parts - either a MultiBulkPart or an EncodedBulkPart
///
/// NOTE(review): the variant docs below name `DEFAULT_ROW_GROUP_SIZE` as the
/// cut-off, while the configuration exposes `encode_row_threshold` and
/// `encode_bytes_threshold`; confirm against the merge code which one applies.
enum MergedPart {
    /// Merged part stored as MultiBulkPart (when rows < DEFAULT_ROW_GROUP_SIZE)
    Multi(MultiBulkPart),
    /// Merged part stored as EncodedBulkPart (when rows >= DEFAULT_ROW_GROUP_SIZE)
    Encoded(EncodedBulkPart),
}
132
/// Result of collecting parts to merge
struct CollectedParts {
    /// Groups of parts ready for merging. Each group holds at most
    /// `merge_threshold` parts, and every part in a group has already been
    /// flagged as `merging`.
    groups: Vec<Vec<PartToMerge>>,
}
138
/// All parts in a bulk memtable.
#[derive(Default)]
struct BulkParts {
    /// Unordered small parts (< 1024 rows).
    /// NOTE(review): the row limit is owned by `UnorderedPart` and is not
    /// visible here; confirm the figure above.
    unordered_part: UnorderedPart,
    /// All parts (raw, multi and encoded), each with its `merging` flag.
    parts: Vec<BulkPartWrapper>,
}
147
148impl BulkParts {
149    /// Total number of parts (including unordered).
150    fn num_parts(&self) -> usize {
151        let unordered_count = if self.unordered_part.is_empty() { 0 } else { 1 };
152        self.parts.len() + unordered_count
153    }
154
155    /// Returns true if there is no part.
156    fn is_empty(&self) -> bool {
157        self.unordered_part.is_empty() && self.parts.is_empty()
158    }
159
160    /// Returns true if bulk parts or encoded parts should be merged.
161    /// Uses short-circuit counting to stop early once threshold is reached.
162    fn should_merge_parts(&self, merge_threshold: usize) -> bool {
163        let mut bulk_count = 0;
164        let mut encoded_count = 0;
165
166        for wrapper in &self.parts {
167            if wrapper.merging {
168                continue;
169            }
170
171            if wrapper.part.is_encoded() {
172                encoded_count += 1;
173            } else {
174                bulk_count += 1;
175            }
176
177            // Short-circuit: stop counting if either threshold is reached
178            if bulk_count >= merge_threshold || encoded_count >= merge_threshold {
179                return true;
180            }
181        }
182
183        false
184    }
185
    /// Returns true if the unordered_part should be compacted into a BulkPart.
    ///
    /// The decision is delegated entirely to `UnorderedPart::should_compact`.
    fn should_compact_unordered_part(&self) -> bool {
        self.unordered_part.should_compact()
    }
190
191    /// Collects unmerged parts and marks them as being merged.
192    /// Only collects parts of types that meet the threshold.
193    /// Parts are pre-grouped into chunks for parallel processing.
194    fn collect_parts_to_merge(
195        &mut self,
196        merge_threshold: usize,
197        max_merge_groups: usize,
198    ) -> CollectedParts {
199        // First pass: collect indices and row counts for each type
200        let mut bulk_indices: Vec<(usize, usize)> = Vec::new();
201        let mut encoded_indices: Vec<(usize, usize)> = Vec::new();
202
203        for (idx, wrapper) in self.parts.iter().enumerate() {
204            if wrapper.merging {
205                continue;
206            }
207            let num_rows = wrapper.part.num_rows();
208            if wrapper.part.is_encoded() {
209                encoded_indices.push((idx, num_rows));
210            } else {
211                bulk_indices.push((idx, num_rows));
212            }
213        }
214
215        let mut groups = Vec::new();
216
217        // Process bulk parts if threshold met
218        if bulk_indices.len() >= merge_threshold {
219            groups.extend(self.collect_and_group_parts(
220                bulk_indices,
221                merge_threshold,
222                max_merge_groups,
223            ));
224        }
225
226        // Process encoded parts if threshold met
227        if encoded_indices.len() >= merge_threshold {
228            groups.extend(self.collect_and_group_parts(
229                encoded_indices,
230                merge_threshold,
231                max_merge_groups,
232            ));
233        }
234
235        CollectedParts { groups }
236    }
237
238    /// Sorts indices by row count, groups into chunks, marks as merging, and returns groups.
239    fn collect_and_group_parts(
240        &mut self,
241        mut indices: Vec<(usize, usize)>,
242        merge_threshold: usize,
243        max_merge_groups: usize,
244    ) -> Vec<Vec<PartToMerge>> {
245        if indices.is_empty() {
246            return Vec::new();
247        }
248
249        // Sort by row count for better grouping
250        indices.sort_unstable_by_key(|(_, num_rows)| *num_rows);
251
252        // Group into chunks of merge_threshold size, limit to max_merge_groups
253        indices
254            .chunks(merge_threshold)
255            .take(max_merge_groups)
256            .map(|chunk| {
257                chunk
258                    .iter()
259                    .map(|(idx, _)| {
260                        let wrapper = &mut self.parts[*idx];
261                        wrapper.merging = true;
262                        wrapper.part.clone()
263                    })
264                    .collect()
265            })
266            .collect()
267    }
268
269    /// Installs merged parts and removes the original parts by file ids.
270    /// Returns the total number of rows in the merged parts.
271    fn install_merged_parts<I>(
272        &mut self,
273        merged_parts: I,
274        merged_file_ids: &HashSet<FileId>,
275    ) -> usize
276    where
277        I: IntoIterator<Item = MergedPart>,
278    {
279        let mut total_output_rows = 0;
280
281        for merged_part in merged_parts {
282            match merged_part {
283                MergedPart::Encoded(encoded_part) => {
284                    total_output_rows += encoded_part.metadata().num_rows;
285                    self.parts.push(BulkPartWrapper {
286                        part: PartToMerge::Encoded {
287                            part: encoded_part,
288                            file_id: FileId::random(),
289                        },
290                        merging: false,
291                    });
292                }
293                MergedPart::Multi(multi_part) => {
294                    total_output_rows += multi_part.num_rows();
295                    self.parts.push(BulkPartWrapper {
296                        part: PartToMerge::Multi {
297                            part: multi_part,
298                            file_id: FileId::random(),
299                        },
300                        merging: false,
301                    });
302                }
303            }
304        }
305
306        self.parts
307            .retain(|wrapper| !merged_file_ids.contains(&wrapper.file_id()));
308
309        total_output_rows
310    }
311
312    /// Resets merging flag for parts with the given file ids.
313    /// Used when merging fails or is cancelled.
314    fn reset_merging_flags(&mut self, file_ids: &HashSet<FileId>) {
315        for wrapper in &mut self.parts {
316            if file_ids.contains(&wrapper.file_id()) {
317                wrapper.merging = false;
318            }
319        }
320    }
321}
322
/// RAII guard for managing merging flags.
/// Automatically resets merging flags when dropped if the merge operation wasn't successful.
struct MergingFlagsGuard<'a> {
    /// Parts whose `merging` flags are reset on an unsuccessful drop.
    bulk_parts: &'a RwLock<BulkParts>,
    /// File ids of the parts taking part in the merge.
    file_ids: &'a HashSet<FileId>,
    /// Set by `mark_success`; when true, dropping the guard does nothing.
    success: bool,
}
330
331impl<'a> MergingFlagsGuard<'a> {
332    /// Creates a new guard for the given file ids.
333    fn new(bulk_parts: &'a RwLock<BulkParts>, file_ids: &'a HashSet<FileId>) -> Self {
334        Self {
335            bulk_parts,
336            file_ids,
337            success: false,
338        }
339    }
340
341    /// Marks the merge operation as successful.
342    /// When this is called, the guard will not reset the flags on drop.
343    fn mark_success(&mut self) {
344        self.success = true;
345    }
346}
347
348impl<'a> Drop for MergingFlagsGuard<'a> {
349    fn drop(&mut self) {
350        if !self.success
351            && let Ok(mut parts) = self.bulk_parts.write()
352        {
353            parts.reset_merging_flags(self.file_ids);
354        }
355    }
356}
357
/// Memtable that ingests and scans parts directly.
pub struct BulkMemtable {
    /// Id of this memtable.
    id: MemtableId,
    /// Configuration for the bulk memtable.
    config: BulkMemtableConfig,
    /// All parts written so far, shared with compaction tasks.
    parts: Arc<RwLock<BulkParts>>,
    /// Metadata of the region this memtable belongs to.
    metadata: RegionMetadataRef,
    /// Tracks the bytes allocated by writes.
    alloc_tracker: AllocTracker,
    /// Largest timestamp written; starts at `i64::MIN`.
    max_timestamp: AtomicI64,
    /// Smallest timestamp written; starts at `i64::MAX`.
    min_timestamp: AtomicI64,
    /// Largest sequence written; starts at 0.
    max_sequence: AtomicU64,
    /// Total number of rows written.
    num_rows: AtomicUsize,
    /// Cached flat SST arrow schema for memtable compaction.
    flat_arrow_schema: SchemaRef,
    /// Compactor for merging bulk parts
    compactor: Arc<Mutex<MemtableCompactor>>,
    /// Dispatcher for scheduling compaction tasks. When `None`, compaction
    /// runs synchronously on the writer.
    compact_dispatcher: Option<Arc<CompactDispatcher>>,
    /// Whether the append mode is enabled
    append_mode: bool,
    /// Mode to handle duplicate rows while merging
    merge_mode: MergeMode,
}
381
382impl std::fmt::Debug for BulkMemtable {
383    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
384        f.debug_struct("BulkMemtable")
385            .field("id", &self.id)
386            .field("num_rows", &self.num_rows.load(Ordering::Relaxed))
387            .field("min_timestamp", &self.min_timestamp.load(Ordering::Relaxed))
388            .field("max_timestamp", &self.max_timestamp.load(Ordering::Relaxed))
389            .field("max_sequence", &self.max_sequence.load(Ordering::Relaxed))
390            .finish()
391    }
392}
393
394impl Memtable for BulkMemtable {
    /// Returns the id this memtable was created with.
    fn id(&self) -> MemtableId {
        self.id
    }
398
    /// Always fails with an unsupported-operation error: the bulk memtable
    /// only accepts data through `write_bulk`.
    fn write(&self, _kvs: &KeyValues) -> Result<()> {
        UnsupportedOperationSnafu {
            err_msg: "write() is not supported for bulk memtable",
        }
        .fail()
    }
405
    /// Always fails with an unsupported-operation error: the bulk memtable
    /// only accepts data through `write_bulk`.
    fn write_one(&self, _key_value: KeyValue) -> Result<()> {
        UnsupportedOperationSnafu {
            err_msg: "write_one() is not supported for bulk memtable",
        }
        .fail()
    }
412
413    fn write_bulk(&self, fragment: BulkPart) -> Result<()> {
414        let local_metrics = WriteMetrics {
415            key_bytes: 0,
416            value_bytes: fragment.estimated_size(),
417            min_ts: fragment.min_timestamp,
418            max_ts: fragment.max_timestamp,
419            num_rows: fragment.num_rows(),
420            max_sequence: fragment.sequence,
421        };
422
423        {
424            let mut bulk_parts = self.parts.write().unwrap();
425
426            // Routes small parts to unordered_part based on threshold
427            if bulk_parts.unordered_part.should_accept(fragment.num_rows()) {
428                bulk_parts.unordered_part.push(fragment);
429
430                // Compacts unordered_part if threshold is reached
431                if bulk_parts.should_compact_unordered_part()
432                    && let Some(bulk_part) = bulk_parts.unordered_part.to_bulk_part()?
433                {
434                    bulk_parts.parts.push(BulkPartWrapper {
435                        part: PartToMerge::Bulk {
436                            part: bulk_part,
437                            file_id: FileId::random(),
438                        },
439                        merging: false,
440                    });
441                    bulk_parts.unordered_part.clear();
442                }
443            } else {
444                bulk_parts.parts.push(BulkPartWrapper {
445                    part: PartToMerge::Bulk {
446                        part: fragment,
447                        file_id: FileId::random(),
448                    },
449                    merging: false,
450                });
451            }
452
453            // Since this operation should be fast, we do it in parts lock scope.
454            // This ensure the statistics in `ranges()` are correct. What's more,
455            // it guarantees no rows are out of the time range so we don't need to
456            // prune rows by time range again in the iterator of the MemtableRange.
457            self.update_stats(local_metrics);
458        }
459
460        if self.should_compact() {
461            self.schedule_compact();
462        }
463
464        Ok(())
465    }
466
467    fn ranges(
468        &self,
469        projection: Option<&[ColumnId]>,
470        options: RangesOptions,
471    ) -> Result<MemtableRanges> {
472        let predicate = options.predicate;
473        let sequence = options.sequence;
474        let mut ranges = BTreeMap::new();
475        let mut range_id = 0;
476
477        // TODO(yingwen): Filter ranges by sequence.
478        let context = Arc::new(BulkIterContext::new_with_pre_filter_mode(
479            self.metadata.clone(),
480            projection,
481            predicate.predicate().cloned(),
482            options.for_flush,
483            options.pre_filter_mode,
484        )?);
485
486        // Adds ranges for regular parts and encoded parts
487        {
488            let bulk_parts = self.parts.read().unwrap();
489
490            // Adds range for unordered part if not empty
491            if !bulk_parts.unordered_part.is_empty()
492                && let Some(unordered_bulk_part) = bulk_parts.unordered_part.to_bulk_part()?
493            {
494                let part_stats = unordered_bulk_part.to_memtable_stats(&self.metadata);
495                let range = MemtableRange::new(
496                    Arc::new(MemtableRangeContext::new(
497                        self.id,
498                        Box::new(BulkRangeIterBuilder {
499                            part: unordered_bulk_part,
500                            context: context.clone(),
501                            sequence,
502                        }),
503                        predicate.clone(),
504                    )),
505                    part_stats,
506                );
507                ranges.insert(range_id, range);
508                range_id += 1;
509            }
510
511            // Adds ranges for all parts
512            for part_wrapper in bulk_parts.parts.iter() {
513                // Skips empty parts
514                if part_wrapper.part.num_rows() == 0 {
515                    continue;
516                }
517
518                let part_stats = part_wrapper.part.to_memtable_stats(&self.metadata);
519                let iter_builder: Box<dyn IterBuilder> = match &part_wrapper.part {
520                    PartToMerge::Bulk { part, .. } => Box::new(BulkRangeIterBuilder {
521                        part: part.clone(),
522                        context: context.clone(),
523                        sequence,
524                    }),
525                    PartToMerge::Multi { part, .. } => Box::new(MultiBulkRangeIterBuilder {
526                        part: part.clone(),
527                        context: context.clone(),
528                        sequence,
529                    }),
530                    PartToMerge::Encoded { part, file_id } => {
531                        Box::new(EncodedBulkRangeIterBuilder {
532                            file_id: *file_id,
533                            part: part.clone(),
534                            context: context.clone(),
535                            sequence,
536                        })
537                    }
538                };
539
540                let range = MemtableRange::new(
541                    Arc::new(MemtableRangeContext::new(
542                        self.id,
543                        iter_builder,
544                        predicate.clone(),
545                    )),
546                    part_stats,
547                );
548                ranges.insert(range_id, range);
549                range_id += 1;
550            }
551        }
552
553        Ok(MemtableRanges { ranges })
554    }
555
556    fn is_empty(&self) -> bool {
557        let bulk_parts = self.parts.read().unwrap();
558        bulk_parts.is_empty()
559    }
560
    /// Freezes the memtable by calling `done_allocating()` on the allocation
    /// tracker. The parts themselves are not touched here. Always returns `Ok`.
    fn freeze(&self) -> Result<()> {
        self.alloc_tracker.done_allocating();
        Ok(())
    }
565
566    fn stats(&self) -> MemtableStats {
567        let estimated_bytes = self.alloc_tracker.bytes_allocated();
568
569        if estimated_bytes == 0 || self.num_rows.load(Ordering::Relaxed) == 0 {
570            return MemtableStats {
571                estimated_bytes,
572                time_range: None,
573                num_rows: 0,
574                num_ranges: 0,
575                max_sequence: 0,
576                series_count: 0,
577            };
578        }
579
580        let ts_type = self
581            .metadata
582            .time_index_column()
583            .column_schema
584            .data_type
585            .clone()
586            .as_timestamp()
587            .expect("Timestamp column must have timestamp type");
588        let max_timestamp = ts_type.create_timestamp(self.max_timestamp.load(Ordering::Relaxed));
589        let min_timestamp = ts_type.create_timestamp(self.min_timestamp.load(Ordering::Relaxed));
590
591        let num_ranges = self.parts.read().unwrap().num_parts();
592
593        MemtableStats {
594            estimated_bytes,
595            time_range: Some((min_timestamp, max_timestamp)),
596            num_rows: self.num_rows.load(Ordering::Relaxed),
597            num_ranges,
598            max_sequence: self.max_sequence.load(Ordering::Relaxed),
599            series_count: self.estimated_series_count(),
600        }
601    }
602
603    fn fork(&self, id: MemtableId, metadata: &RegionMetadataRef) -> MemtableRef {
604        // Computes the new flat schema based on the new metadata.
605        let flat_arrow_schema = to_flat_sst_arrow_schema(
606            metadata,
607            &FlatSchemaOptions::from_encoding(metadata.primary_key_encoding),
608        );
609
610        Arc::new(Self {
611            id,
612            config: self.config.clone(),
613            parts: Arc::new(RwLock::new(BulkParts::default())),
614            metadata: metadata.clone(),
615            alloc_tracker: AllocTracker::new(self.alloc_tracker.write_buffer_manager()),
616            max_timestamp: AtomicI64::new(i64::MIN),
617            min_timestamp: AtomicI64::new(i64::MAX),
618            max_sequence: AtomicU64::new(0),
619            num_rows: AtomicUsize::new(0),
620            flat_arrow_schema,
621            compactor: Arc::new(Mutex::new(MemtableCompactor::new(
622                metadata.region_id,
623                id,
624                self.config.clone(),
625            ))),
626            compact_dispatcher: self.compact_dispatcher.clone(),
627            append_mode: self.append_mode,
628            merge_mode: self.merge_mode,
629        })
630    }
631
632    fn compact(&self, for_flush: bool) -> Result<()> {
633        let mut compactor = self.compactor.lock().unwrap();
634
635        if for_flush {
636            return Ok(());
637        }
638
639        // Unified merge for all parts
640        let should_merge = self
641            .parts
642            .read()
643            .unwrap()
644            .should_merge_parts(self.config.merge_threshold);
645        if should_merge {
646            compactor.merge_parts(
647                &self.flat_arrow_schema,
648                &self.parts,
649                &self.metadata,
650                !self.append_mode,
651                self.merge_mode,
652            )?;
653        }
654
655        Ok(())
656    }
657}
658
659impl BulkMemtable {
660    /// Creates a new BulkMemtable
661    pub fn new(
662        id: MemtableId,
663        config: BulkMemtableConfig,
664        metadata: RegionMetadataRef,
665        write_buffer_manager: Option<WriteBufferManagerRef>,
666        compact_dispatcher: Option<Arc<CompactDispatcher>>,
667        append_mode: bool,
668        merge_mode: MergeMode,
669    ) -> Self {
670        let flat_arrow_schema = to_flat_sst_arrow_schema(
671            &metadata,
672            &FlatSchemaOptions::from_encoding(metadata.primary_key_encoding),
673        );
674
675        let region_id = metadata.region_id;
676        Self {
677            id,
678            config: config.clone(),
679            parts: Arc::new(RwLock::new(BulkParts::default())),
680            metadata,
681            alloc_tracker: AllocTracker::new(write_buffer_manager),
682            max_timestamp: AtomicI64::new(i64::MIN),
683            min_timestamp: AtomicI64::new(i64::MAX),
684            max_sequence: AtomicU64::new(0),
685            num_rows: AtomicUsize::new(0),
686            flat_arrow_schema,
687            compactor: Arc::new(Mutex::new(MemtableCompactor::new(region_id, id, config))),
688            compact_dispatcher,
689            append_mode,
690            merge_mode,
691        }
692    }
693
694    /// Sets the unordered part threshold (for testing).
695    #[cfg(test)]
696    pub fn set_unordered_part_threshold(&self, threshold: usize) {
697        self.parts
698            .write()
699            .unwrap()
700            .unordered_part
701            .set_threshold(threshold);
702    }
703
704    /// Sets the unordered part compact threshold (for testing).
705    #[cfg(test)]
706    pub fn set_unordered_part_compact_threshold(&self, compact_threshold: usize) {
707        self.parts
708            .write()
709            .unwrap()
710            .unordered_part
711            .set_compact_threshold(compact_threshold);
712    }
713
714    /// Updates memtable stats.
715    ///
716    /// Please update this inside the write lock scope.
717    fn update_stats(&self, stats: WriteMetrics) {
718        self.alloc_tracker
719            .on_allocation(stats.key_bytes + stats.value_bytes);
720
721        self.max_timestamp
722            .fetch_max(stats.max_ts, Ordering::Relaxed);
723        self.min_timestamp
724            .fetch_min(stats.min_ts, Ordering::Relaxed);
725        self.max_sequence
726            .fetch_max(stats.max_sequence, Ordering::Relaxed);
727        self.num_rows.fetch_add(stats.num_rows, Ordering::Relaxed);
728    }
729
730    /// Returns the estimated time series count.
731    fn estimated_series_count(&self) -> usize {
732        let bulk_parts = self.parts.read().unwrap();
733        bulk_parts
734            .parts
735            .iter()
736            .map(|part_wrapper| part_wrapper.part.series_count())
737            .sum()
738    }
739
740    /// Returns whether the memtable should be compacted.
741    fn should_compact(&self) -> bool {
742        let parts = self.parts.read().unwrap();
743        parts.should_merge_parts(self.config.merge_threshold)
744    }
745
746    /// Schedules a compaction task using the CompactDispatcher.
747    fn schedule_compact(&self) {
748        if let Some(dispatcher) = &self.compact_dispatcher {
749            let task = MemCompactTask {
750                metadata: self.metadata.clone(),
751                parts: self.parts.clone(),
752                config: self.config.clone(),
753                flat_arrow_schema: self.flat_arrow_schema.clone(),
754                compactor: self.compactor.clone(),
755                append_mode: self.append_mode,
756                merge_mode: self.merge_mode,
757            };
758
759            dispatcher.dispatch_compact(task);
760        } else {
761            // Uses synchronous compaction if no dispatcher is available.
762            if let Err(e) = self.compact(false) {
763                common_telemetry::error!(e; "Failed to compact table");
764            }
765        }
766    }
767}
768
/// Iterator builder for bulk range
pub struct BulkRangeIterBuilder {
    /// The raw bulk part to scan.
    pub part: BulkPart,
    /// Iteration context shared by the ranges of one scan.
    pub context: Arc<BulkIterContext>,
    /// Optional sequence range passed through to the part iterator.
    pub sequence: Option<SequenceRange>,
}
775
/// Iterator builder for multi bulk range
struct MultiBulkRangeIterBuilder {
    /// The multi bulk part to scan.
    part: MultiBulkPart,
    /// Iteration context shared by the ranges of one scan.
    context: Arc<BulkIterContext>,
    /// Optional sequence range passed through to `MultiBulkPart::read`.
    sequence: Option<SequenceRange>,
}
782
783impl IterBuilder for BulkRangeIterBuilder {
784    fn build(&self, _metrics: Option<MemScanMetrics>) -> Result<BoxedBatchIterator> {
785        UnsupportedOperationSnafu {
786            err_msg: "BatchIterator is not supported for bulk memtable",
787        }
788        .fail()
789    }
790
791    fn is_record_batch(&self) -> bool {
792        true
793    }
794
795    fn build_record_batch(
796        &self,
797        _time_range: Option<(Timestamp, Timestamp)>,
798        metrics: Option<MemScanMetrics>,
799    ) -> Result<BoxedRecordBatchIterator> {
800        let series_count = self.part.estimated_series_count();
801        let iter = BulkPartBatchIter::from_single(
802            self.part.batch.clone(),
803            self.context.clone(),
804            self.sequence,
805            series_count,
806            metrics,
807        );
808
809        Ok(Box::new(iter))
810    }
811
812    fn encoded_range(&self) -> Option<EncodedRange> {
813        None
814    }
815}
816
817impl IterBuilder for MultiBulkRangeIterBuilder {
818    fn build(&self, _metrics: Option<MemScanMetrics>) -> Result<BoxedBatchIterator> {
819        UnsupportedOperationSnafu {
820            err_msg: "BatchIterator is not supported for multi bulk memtable",
821        }
822        .fail()
823    }
824
825    fn is_record_batch(&self) -> bool {
826        true
827    }
828
829    fn build_record_batch(
830        &self,
831        _time_range: Option<(Timestamp, Timestamp)>,
832        metrics: Option<MemScanMetrics>,
833    ) -> Result<BoxedRecordBatchIterator> {
834        self.part
835            .read(self.context.clone(), self.sequence, metrics)?
836            .ok_or_else(|| {
837                UnsupportedOperationSnafu {
838                    err_msg: "Failed to create iterator for multi bulk part",
839                }
840                .build()
841            })
842    }
843
844    fn encoded_range(&self) -> Option<EncodedRange> {
845        None
846    }
847}
848
/// Iterator builder for encoded bulk range
struct EncodedBulkRangeIterBuilder {
    /// File id of the part; used when building the SST info in `encoded_range`.
    file_id: FileId,
    /// The encoded part to scan.
    part: EncodedBulkPart,
    /// Iteration context shared by the ranges of one scan.
    context: Arc<BulkIterContext>,
    /// Optional sequence range passed through to `EncodedBulkPart::read`.
    sequence: Option<SequenceRange>,
}
856
857impl IterBuilder for EncodedBulkRangeIterBuilder {
858    fn build(&self, _metrics: Option<MemScanMetrics>) -> Result<BoxedBatchIterator> {
859        UnsupportedOperationSnafu {
860            err_msg: "BatchIterator is not supported for encoded bulk memtable",
861        }
862        .fail()
863    }
864
865    fn is_record_batch(&self) -> bool {
866        true
867    }
868
869    fn build_record_batch(
870        &self,
871        _time_range: Option<(Timestamp, Timestamp)>,
872        metrics: Option<MemScanMetrics>,
873    ) -> Result<BoxedRecordBatchIterator> {
874        if let Some(iter) = self
875            .part
876            .read(self.context.clone(), self.sequence, metrics)?
877        {
878            Ok(iter)
879        } else {
880            // Return an empty iterator if no data to read
881            Ok(Box::new(std::iter::empty()))
882        }
883    }
884
885    fn encoded_range(&self) -> Option<EncodedRange> {
886        Some(EncodedRange {
887            data: self.part.data().clone(),
888            sst_info: self.part.to_sst_info(self.file_id),
889        })
890    }
891}
892
/// A stored part together with a flag telling whether it is being merged.
struct BulkPartWrapper {
    /// The part to store. It already contains the file id.
    part: PartToMerge,
    /// Whether this part is currently being merged.
    merging: bool,
}
899
900impl BulkPartWrapper {
901    /// Returns the file id of this part.
902    fn file_id(&self) -> FileId {
903        self.part.file_id()
904    }
905}
906
/// Enum to wrap different types of parts for unified merging.
///
/// Every variant stores the part together with the file id assigned to it.
#[derive(Clone)]
enum PartToMerge {
    /// Raw bulk part.
    Bulk { part: BulkPart, file_id: FileId },
    /// Multiple bulk parts.
    Multi {
        part: MultiBulkPart,
        file_id: FileId,
    },
    /// Encoded bulk part.
    Encoded {
        part: EncodedBulkPart,
        file_id: FileId,
    },
}
923
924impl PartToMerge {
925    /// Gets the file ID of this part.
926    fn file_id(&self) -> FileId {
927        match self {
928            PartToMerge::Bulk { file_id, .. } => *file_id,
929            PartToMerge::Multi { file_id, .. } => *file_id,
930            PartToMerge::Encoded { file_id, .. } => *file_id,
931        }
932    }
933
934    /// Gets the minimum timestamp of this part.
935    fn min_timestamp(&self) -> i64 {
936        match self {
937            PartToMerge::Bulk { part, .. } => part.min_timestamp,
938            PartToMerge::Multi { part, .. } => part.min_timestamp(),
939            PartToMerge::Encoded { part, .. } => part.metadata().min_timestamp,
940        }
941    }
942
943    /// Gets the maximum timestamp of this part.
944    fn max_timestamp(&self) -> i64 {
945        match self {
946            PartToMerge::Bulk { part, .. } => part.max_timestamp,
947            PartToMerge::Multi { part, .. } => part.max_timestamp(),
948            PartToMerge::Encoded { part, .. } => part.metadata().max_timestamp,
949        }
950    }
951
952    /// Gets the number of rows in this part.
953    fn num_rows(&self) -> usize {
954        match self {
955            PartToMerge::Bulk { part, .. } => part.num_rows(),
956            PartToMerge::Multi { part, .. } => part.num_rows(),
957            PartToMerge::Encoded { part, .. } => part.metadata().num_rows,
958        }
959    }
960
961    /// Gets the maximum sequence number of this part.
962    fn max_sequence(&self) -> u64 {
963        match self {
964            PartToMerge::Bulk { part, .. } => part.sequence,
965            PartToMerge::Multi { part, .. } => part.max_sequence(),
966            PartToMerge::Encoded { part, .. } => part.metadata().max_sequence,
967        }
968    }
969
970    /// Gets the estimated series count in this part.
971    fn series_count(&self) -> usize {
972        match self {
973            PartToMerge::Bulk { part, .. } => part.estimated_series_count(),
974            PartToMerge::Multi { part, .. } => part.series_count(),
975            PartToMerge::Encoded { part, .. } => part.metadata().num_series as usize,
976        }
977    }
978
979    /// Returns true if this is an encoded part.
980    fn is_encoded(&self) -> bool {
981        matches!(self, PartToMerge::Encoded { .. })
982    }
983
984    /// Gets the estimated size in bytes of this part.
985    fn estimated_size(&self) -> usize {
986        match self {
987            PartToMerge::Bulk { part, .. } => part.estimated_size(),
988            PartToMerge::Multi { part, .. } => part.estimated_size(),
989            PartToMerge::Encoded { part, .. } => part.size_bytes(),
990        }
991    }
992
993    /// Converts this part to `MemtableStats`.
994    fn to_memtable_stats(&self, region_metadata: &RegionMetadataRef) -> MemtableStats {
995        match self {
996            PartToMerge::Bulk { part, .. } => part.to_memtable_stats(region_metadata),
997            PartToMerge::Multi { part, .. } => part.to_memtable_stats(region_metadata),
998            PartToMerge::Encoded { part, .. } => part.to_memtable_stats(),
999        }
1000    }
1001
1002    /// Creates a record batch iterator for this part.
1003    fn create_iterator(
1004        self,
1005        context: Arc<BulkIterContext>,
1006    ) -> Result<Option<BoxedRecordBatchIterator>> {
1007        match self {
1008            PartToMerge::Bulk { part, .. } => {
1009                let series_count = part.estimated_series_count();
1010                let iter = BulkPartBatchIter::from_single(
1011                    part.batch,
1012                    context,
1013                    None, // No sequence filter for merging
1014                    series_count,
1015                    None, // No metrics for merging
1016                );
1017                Ok(Some(Box::new(iter) as BoxedRecordBatchIterator))
1018            }
1019            PartToMerge::Multi { part, .. } => part.read(context, None, None),
1020            PartToMerge::Encoded { part, .. } => part.read(context, None, None),
1021        }
1022    }
1023}
1024
/// Compactor that merges the parts of a bulk memtable.
struct MemtableCompactor {
    /// Id of the region, reported in the compaction log line.
    region_id: RegionId,
    /// Id of the memtable, reported in the compaction log line.
    memtable_id: MemtableId,
    /// Configuration for the bulk memtable.
    config: BulkMemtableConfig,
}
1031
1032impl MemtableCompactor {
1033    /// Creates a new MemtableCompactor.
1034    fn new(region_id: RegionId, memtable_id: MemtableId, config: BulkMemtableConfig) -> Self {
1035        Self {
1036            region_id,
1037            memtable_id,
1038            config,
1039        }
1040    }
1041
1042    /// Merges parts (bulk and encoded) and then encodes the result.
1043    fn merge_parts(
1044        &mut self,
1045        arrow_schema: &SchemaRef,
1046        bulk_parts: &RwLock<BulkParts>,
1047        metadata: &RegionMetadataRef,
1048        dedup: bool,
1049        merge_mode: MergeMode,
1050    ) -> Result<()> {
1051        let start = Instant::now();
1052
1053        // Collect pre-grouped parts
1054        let collected = bulk_parts
1055            .write()
1056            .unwrap()
1057            .collect_parts_to_merge(self.config.merge_threshold, self.config.max_merge_groups);
1058
1059        if collected.groups.is_empty() {
1060            return Ok(());
1061        }
1062
1063        // Collect all file IDs for tracking
1064        let merged_file_ids: HashSet<FileId> = collected
1065            .groups
1066            .iter()
1067            .flatten()
1068            .map(|part| part.file_id())
1069            .collect();
1070        let mut guard = MergingFlagsGuard::new(bulk_parts, &merged_file_ids);
1071
1072        let num_groups = collected.groups.len();
1073        let num_parts: usize = collected.groups.iter().map(|g| g.len()).sum();
1074
1075        let encode_row_threshold = self.config.encode_row_threshold;
1076        let encode_bytes_threshold = self.config.encode_bytes_threshold;
1077
1078        // Merge all groups in parallel
1079        let merged_parts = collected
1080            .groups
1081            .into_par_iter()
1082            .map(|group| {
1083                Self::merge_parts_group(
1084                    group,
1085                    arrow_schema,
1086                    metadata,
1087                    dedup,
1088                    merge_mode,
1089                    encode_row_threshold,
1090                    encode_bytes_threshold,
1091                )
1092            })
1093            .collect::<Result<Vec<Option<MergedPart>>>>()?;
1094
1095        // Install all merged parts
1096        let total_output_rows = {
1097            let mut parts = bulk_parts.write().unwrap();
1098            parts.install_merged_parts(merged_parts.into_iter().flatten(), &merged_file_ids)
1099        };
1100
1101        guard.mark_success();
1102
1103        common_telemetry::debug!(
1104            "BulkMemtable {} {} concurrent compact {} groups, {} parts, {} rows, cost: {:?}",
1105            self.region_id,
1106            self.memtable_id,
1107            num_groups,
1108            num_parts,
1109            total_output_rows,
1110            start.elapsed()
1111        );
1112
1113        Ok(())
1114    }
1115
1116    /// Merges a group of parts into a single part (either MultiBulkPart or EncodedBulkPart).
1117    fn merge_parts_group(
1118        parts_to_merge: Vec<PartToMerge>,
1119        arrow_schema: &SchemaRef,
1120        metadata: &RegionMetadataRef,
1121        dedup: bool,
1122        merge_mode: MergeMode,
1123        encode_row_threshold: usize,
1124        encode_bytes_threshold: usize,
1125    ) -> Result<Option<MergedPart>> {
1126        if parts_to_merge.is_empty() {
1127            return Ok(None);
1128        }
1129
1130        // Calculates timestamp bounds and statistics for merged data
1131        let min_timestamp = parts_to_merge
1132            .iter()
1133            .map(|p| p.min_timestamp())
1134            .min()
1135            .unwrap_or(i64::MAX);
1136        let max_timestamp = parts_to_merge
1137            .iter()
1138            .map(|p| p.max_timestamp())
1139            .max()
1140            .unwrap_or(i64::MIN);
1141        let max_sequence = parts_to_merge
1142            .iter()
1143            .map(|p| p.max_sequence())
1144            .max()
1145            .unwrap_or(0);
1146
1147        // Collects statistics from parts before creating iterators
1148        let estimated_total_rows: usize = parts_to_merge.iter().map(|p| p.num_rows()).sum();
1149        let estimated_total_bytes: usize = parts_to_merge.iter().map(|p| p.estimated_size()).sum();
1150        let estimated_series_count = parts_to_merge
1151            .iter()
1152            .map(|p| p.series_count())
1153            .max()
1154            .unwrap_or(0);
1155
1156        let context = Arc::new(BulkIterContext::new(
1157            metadata.clone(),
1158            None, // No column projection for merging
1159            None, // No predicate for merging
1160            true,
1161        )?);
1162
1163        // Creates iterators for all parts to merge.
1164        let iterators: Vec<BoxedRecordBatchIterator> = parts_to_merge
1165            .into_iter()
1166            .filter_map(|part| part.create_iterator(context.clone()).ok().flatten())
1167            .collect();
1168
1169        if iterators.is_empty() {
1170            return Ok(None);
1171        }
1172
1173        let merged_iter =
1174            FlatMergeIterator::new(arrow_schema.clone(), iterators, DEFAULT_READ_BATCH_SIZE)?;
1175
1176        let boxed_iter: BoxedRecordBatchIterator = if dedup {
1177            // Applies deduplication based on merge mode
1178            match merge_mode {
1179                MergeMode::LastRow => {
1180                    let dedup_iter = FlatDedupIterator::new(merged_iter, FlatLastRow::new(false));
1181                    Box::new(dedup_iter)
1182                }
1183                MergeMode::LastNonNull => {
1184                    let field_column_start =
1185                        field_column_start(metadata, arrow_schema.fields().len());
1186
1187                    let dedup_iter = FlatDedupIterator::new(
1188                        merged_iter,
1189                        FlatLastNonNull::new(field_column_start, false),
1190                    );
1191                    Box::new(dedup_iter)
1192                }
1193            }
1194        } else {
1195            Box::new(merged_iter)
1196        };
1197
1198        // Encode as EncodedBulkPart if rows exceed row threshold OR bytes exceed bytes threshold
1199        if estimated_total_rows > encode_row_threshold
1200            || estimated_total_bytes > encode_bytes_threshold
1201        {
1202            let encoder = BulkPartEncoder::new(metadata.clone(), DEFAULT_ROW_GROUP_SIZE)?;
1203            let mut metrics = BulkPartEncodeMetrics::default();
1204            let encoded_part = encoder.encode_record_batch_iter(
1205                boxed_iter,
1206                arrow_schema.clone(),
1207                min_timestamp,
1208                max_timestamp,
1209                max_sequence,
1210                &mut metrics,
1211            )?;
1212
1213            common_telemetry::trace!("merge_parts_group metrics: {:?}", metrics);
1214
1215            Ok(encoded_part.map(MergedPart::Encoded))
1216        } else {
1217            // Otherwise, collect into MultiBulkPart
1218            let mut batches = Vec::new();
1219            let mut actual_total_rows = 0;
1220
1221            for batch_result in boxed_iter {
1222                let batch = batch_result?;
1223                actual_total_rows += batch.num_rows();
1224                batches.push(batch);
1225            }
1226
1227            if actual_total_rows == 0 {
1228                return Ok(None);
1229            }
1230
1231            let multi_part = MultiBulkPart::new(
1232                batches,
1233                min_timestamp,
1234                max_timestamp,
1235                max_sequence,
1236                estimated_series_count,
1237            );
1238
1239            common_telemetry::trace!(
1240                "merge_parts_group created MultiBulkPart: rows={}, batches={}",
1241                actual_total_rows,
1242                multi_part.num_batches()
1243            );
1244
1245            Ok(Some(MergedPart::Multi(multi_part)))
1246        }
1247    }
1248}
1249
/// A memtable compact task to run in background.
struct MemCompactTask {
    /// Region metadata, passed to the compactor when merging.
    metadata: RegionMetadataRef,
    /// Parts that are checked against the merge threshold and merged.
    parts: Arc<RwLock<BulkParts>>,
    /// Configuration for the bulk memtable.
    config: BulkMemtableConfig,
    /// Cached flat SST arrow schema
    flat_arrow_schema: SchemaRef,
    /// Compactor for merging bulk parts
    compactor: Arc<Mutex<MemtableCompactor>>,
    /// Whether the append mode is enabled
    append_mode: bool,
    /// Mode to handle duplicate rows while merging
    merge_mode: MergeMode,
}
1265
1266impl MemCompactTask {
1267    fn compact(&self) -> Result<()> {
1268        let mut compactor = self.compactor.lock().unwrap();
1269
1270        let should_merge = self
1271            .parts
1272            .read()
1273            .unwrap()
1274            .should_merge_parts(self.config.merge_threshold);
1275        if should_merge {
1276            compactor.merge_parts(
1277                &self.flat_arrow_schema,
1278                &self.parts,
1279                &self.metadata,
1280                !self.append_mode,
1281                self.merge_mode,
1282            )?;
1283        }
1284
1285        Ok(())
1286    }
1287}
1288
/// Scheduler to run compact tasks in background.
#[derive(Debug)]
pub struct CompactDispatcher {
    /// Semaphore from which every dispatched task acquires a permit.
    semaphore: Arc<Semaphore>,
}
1294
1295impl CompactDispatcher {
1296    /// Creates a new dispatcher with the given number of max concurrent tasks.
1297    pub fn new(permits: usize) -> Self {
1298        Self {
1299            semaphore: Arc::new(Semaphore::new(permits)),
1300        }
1301    }
1302
1303    /// Dispatches a compact task to run in background.
1304    fn dispatch_compact(&self, task: MemCompactTask) {
1305        let semaphore = self.semaphore.clone();
1306        common_runtime::spawn_global(async move {
1307            let Ok(_permit) = semaphore.acquire().await else {
1308                return;
1309            };
1310
1311            common_runtime::spawn_blocking_global(move || {
1312                if let Err(e) = task.compact() {
1313                    common_telemetry::error!(e; "Failed to compact memtable, region: {}", task.metadata.region_id);
1314                }
1315            });
1316        });
1317    }
1318}
1319
/// Builder to build a [BulkMemtable].
#[derive(Debug, Default)]
pub struct BulkMemtableBuilder {
    /// Configuration for the bulk memtable.
    config: BulkMemtableConfig,
    /// Optional write buffer manager passed to every memtable built.
    write_buffer_manager: Option<WriteBufferManagerRef>,
    /// Optional compact dispatcher passed to every memtable built.
    compact_dispatcher: Option<Arc<CompactDispatcher>>,
    /// Append mode flag passed to every memtable built.
    append_mode: bool,
    /// Merge mode passed to every memtable built.
    merge_mode: MergeMode,
}
1330
1331impl BulkMemtableBuilder {
1332    /// Creates a new builder with specific `write_buffer_manager`.
1333    pub fn new(
1334        write_buffer_manager: Option<WriteBufferManagerRef>,
1335        append_mode: bool,
1336        merge_mode: MergeMode,
1337    ) -> Self {
1338        Self {
1339            config: BulkMemtableConfig::default(),
1340            write_buffer_manager,
1341            compact_dispatcher: None,
1342            append_mode,
1343            merge_mode,
1344        }
1345    }
1346
1347    /// Sets the compact dispatcher.
1348    pub fn with_compact_dispatcher(mut self, compact_dispatcher: Arc<CompactDispatcher>) -> Self {
1349        self.compact_dispatcher = Some(compact_dispatcher);
1350        self
1351    }
1352}
1353
1354impl MemtableBuilder for BulkMemtableBuilder {
1355    fn build(&self, id: MemtableId, metadata: &RegionMetadataRef) -> MemtableRef {
1356        Arc::new(BulkMemtable::new(
1357            id,
1358            self.config.clone(),
1359            metadata.clone(),
1360            self.write_buffer_manager.clone(),
1361            self.compact_dispatcher.clone(),
1362            self.append_mode,
1363            self.merge_mode,
1364        ))
1365    }
1366
1367    fn use_bulk_insert(&self, _metadata: &RegionMetadataRef) -> bool {
1368        true
1369    }
1370}
1371
1372#[cfg(test)]
1373mod tests {
1374    use mito_codec::row_converter::build_primary_key_codec;
1375
1376    use super::*;
1377    use crate::memtable::bulk::part::BulkPartConverter;
1378    use crate::read::scan_region::PredicateGroup;
1379    use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema};
1380    use crate::test_util::memtable_util::{build_key_values_with_ts_seq_values, metadata_for_test};
1381
1382    fn create_bulk_part_with_converter(
1383        k0: &str,
1384        k1: u32,
1385        timestamps: Vec<i64>,
1386        values: Vec<Option<f64>>,
1387        sequence: u64,
1388    ) -> Result<BulkPart> {
1389        let metadata = metadata_for_test();
1390        let capacity = 100;
1391        let primary_key_codec = build_primary_key_codec(&metadata);
1392        let schema = to_flat_sst_arrow_schema(
1393            &metadata,
1394            &FlatSchemaOptions::from_encoding(metadata.primary_key_encoding),
1395        );
1396
1397        let mut converter =
1398            BulkPartConverter::new(&metadata, schema, capacity, primary_key_codec, true);
1399
1400        let key_values = build_key_values_with_ts_seq_values(
1401            &metadata,
1402            k0.to_string(),
1403            k1,
1404            timestamps.into_iter(),
1405            values.into_iter(),
1406            sequence,
1407        );
1408
1409        converter.append_key_values(&key_values)?;
1410        converter.convert()
1411    }
1412
1413    #[test]
1414    fn test_bulk_memtable_write_read() {
1415        let metadata = metadata_for_test();
1416        let memtable = BulkMemtable::new(
1417            999,
1418            BulkMemtableConfig::default(),
1419            metadata.clone(),
1420            None,
1421            None,
1422            false,
1423            MergeMode::LastRow,
1424        );
1425        // Disable unordered_part for this test
1426        memtable.set_unordered_part_threshold(0);
1427
1428        let test_data = [
1429            (
1430                "key_a",
1431                1u32,
1432                vec![1000i64, 2000i64],
1433                vec![Some(10.5), Some(20.5)],
1434                100u64,
1435            ),
1436            (
1437                "key_b",
1438                2u32,
1439                vec![1500i64, 2500i64],
1440                vec![Some(15.5), Some(25.5)],
1441                200u64,
1442            ),
1443            ("key_c", 3u32, vec![3000i64], vec![Some(30.5)], 300u64),
1444        ];
1445
1446        for (k0, k1, timestamps, values, seq) in test_data.iter() {
1447            let part =
1448                create_bulk_part_with_converter(k0, *k1, timestamps.clone(), values.clone(), *seq)
1449                    .unwrap();
1450            memtable.write_bulk(part).unwrap();
1451        }
1452
1453        let stats = memtable.stats();
1454        assert_eq!(5, stats.num_rows);
1455        assert_eq!(3, stats.num_ranges);
1456        assert_eq!(300, stats.max_sequence);
1457
1458        let (min_ts, max_ts) = stats.time_range.unwrap();
1459        assert_eq!(1000, min_ts.value());
1460        assert_eq!(3000, max_ts.value());
1461
1462        let predicate_group = PredicateGroup::new(&metadata, &[]).unwrap();
1463        let ranges = memtable
1464            .ranges(
1465                None,
1466                RangesOptions::default().with_predicate(predicate_group),
1467            )
1468            .unwrap();
1469
1470        assert_eq!(3, ranges.ranges.len());
1471        let total_rows: usize = ranges.ranges.values().map(|r| r.stats().num_rows()).sum();
1472        assert_eq!(5, total_rows);
1473
1474        for (_range_id, range) in ranges.ranges.iter() {
1475            assert!(range.num_rows() > 0);
1476            assert!(range.is_record_batch());
1477
1478            let record_batch_iter = range.build_record_batch_iter(None, None).unwrap();
1479
1480            let mut total_rows = 0;
1481            for batch_result in record_batch_iter {
1482                let batch = batch_result.unwrap();
1483                total_rows += batch.num_rows();
1484                assert!(batch.num_rows() > 0);
1485                assert_eq!(8, batch.num_columns());
1486            }
1487            assert_eq!(total_rows, range.num_rows());
1488        }
1489    }
1490
1491    #[test]
1492    fn test_bulk_memtable_ranges_with_projection() {
1493        let metadata = metadata_for_test();
1494        let memtable = BulkMemtable::new(
1495            111,
1496            BulkMemtableConfig::default(),
1497            metadata.clone(),
1498            None,
1499            None,
1500            false,
1501            MergeMode::LastRow,
1502        );
1503
1504        let bulk_part = create_bulk_part_with_converter(
1505            "projection_test",
1506            5,
1507            vec![5000, 6000, 7000],
1508            vec![Some(50.0), Some(60.0), Some(70.0)],
1509            500,
1510        )
1511        .unwrap();
1512
1513        memtable.write_bulk(bulk_part).unwrap();
1514
1515        let projection = vec![4u32];
1516        let predicate_group = PredicateGroup::new(&metadata, &[]).unwrap();
1517        let ranges = memtable
1518            .ranges(
1519                Some(&projection),
1520                RangesOptions::default().with_predicate(predicate_group),
1521            )
1522            .unwrap();
1523
1524        assert_eq!(1, ranges.ranges.len());
1525        let range = ranges.ranges.get(&0).unwrap();
1526
1527        assert!(range.is_record_batch());
1528        let record_batch_iter = range.build_record_batch_iter(None, None).unwrap();
1529
1530        let mut total_rows = 0;
1531        for batch_result in record_batch_iter {
1532            let batch = batch_result.unwrap();
1533            assert!(batch.num_rows() > 0);
1534            assert_eq!(5, batch.num_columns());
1535            total_rows += batch.num_rows();
1536        }
1537        assert_eq!(3, total_rows);
1538    }
1539
1540    #[test]
1541    fn test_bulk_memtable_unsupported_operations() {
1542        let metadata = metadata_for_test();
1543        let memtable = BulkMemtable::new(
1544            111,
1545            BulkMemtableConfig::default(),
1546            metadata.clone(),
1547            None,
1548            None,
1549            false,
1550            MergeMode::LastRow,
1551        );
1552
1553        let key_values = build_key_values_with_ts_seq_values(
1554            &metadata,
1555            "test".to_string(),
1556            1,
1557            vec![1000].into_iter(),
1558            vec![Some(1.0)].into_iter(),
1559            1,
1560        );
1561
1562        let err = memtable.write(&key_values).unwrap_err();
1563        assert!(err.to_string().contains("not supported"));
1564
1565        let kv = key_values.iter().next().unwrap();
1566        let err = memtable.write_one(kv).unwrap_err();
1567        assert!(err.to_string().contains("not supported"));
1568    }
1569
1570    #[test]
1571    fn test_bulk_memtable_freeze() {
1572        let metadata = metadata_for_test();
1573        let memtable = BulkMemtable::new(
1574            222,
1575            BulkMemtableConfig::default(),
1576            metadata.clone(),
1577            None,
1578            None,
1579            false,
1580            MergeMode::LastRow,
1581        );
1582
1583        let bulk_part = create_bulk_part_with_converter(
1584            "freeze_test",
1585            10,
1586            vec![10000],
1587            vec![Some(100.0)],
1588            1000,
1589        )
1590        .unwrap();
1591
1592        memtable.write_bulk(bulk_part).unwrap();
1593        memtable.freeze().unwrap();
1594
1595        let stats_after_freeze = memtable.stats();
1596        assert_eq!(1, stats_after_freeze.num_rows);
1597    }
1598
1599    #[test]
1600    fn test_bulk_memtable_fork() {
1601        let metadata = metadata_for_test();
1602        let original_memtable = BulkMemtable::new(
1603            333,
1604            BulkMemtableConfig::default(),
1605            metadata.clone(),
1606            None,
1607            None,
1608            false,
1609            MergeMode::LastRow,
1610        );
1611
1612        let bulk_part =
1613            create_bulk_part_with_converter("fork_test", 15, vec![15000], vec![Some(150.0)], 1500)
1614                .unwrap();
1615
1616        original_memtable.write_bulk(bulk_part).unwrap();
1617
1618        let forked_memtable = original_memtable.fork(444, &metadata);
1619
1620        assert_eq!(forked_memtable.id(), 444);
1621        assert!(forked_memtable.is_empty());
1622        assert_eq!(0, forked_memtable.stats().num_rows);
1623
1624        assert!(!original_memtable.is_empty());
1625        assert_eq!(1, original_memtable.stats().num_rows);
1626    }
1627
1628    #[test]
1629    fn test_bulk_memtable_ranges_multiple_parts() {
1630        let metadata = metadata_for_test();
1631        let memtable = BulkMemtable::new(
1632            777,
1633            BulkMemtableConfig::default(),
1634            metadata.clone(),
1635            None,
1636            None,
1637            false,
1638            MergeMode::LastRow,
1639        );
1640        // Disable unordered_part for this test
1641        memtable.set_unordered_part_threshold(0);
1642
1643        let parts_data = vec![
1644            (
1645                "part1",
1646                1u32,
1647                vec![1000i64, 1100i64],
1648                vec![Some(10.0), Some(11.0)],
1649                100u64,
1650            ),
1651            (
1652                "part2",
1653                2u32,
1654                vec![2000i64, 2100i64],
1655                vec![Some(20.0), Some(21.0)],
1656                200u64,
1657            ),
1658            ("part3", 3u32, vec![3000i64], vec![Some(30.0)], 300u64),
1659        ];
1660
1661        for (k0, k1, timestamps, values, seq) in parts_data {
1662            let part = create_bulk_part_with_converter(k0, k1, timestamps, values, seq).unwrap();
1663            memtable.write_bulk(part).unwrap();
1664        }
1665
1666        let predicate_group = PredicateGroup::new(&metadata, &[]).unwrap();
1667        let ranges = memtable
1668            .ranges(
1669                None,
1670                RangesOptions::default().with_predicate(predicate_group),
1671            )
1672            .unwrap();
1673
1674        assert_eq!(3, ranges.ranges.len());
1675        let total_rows: usize = ranges.ranges.values().map(|r| r.stats().num_rows()).sum();
1676        assert_eq!(5, total_rows);
1677        assert_eq!(3, ranges.ranges.len());
1678
1679        for (range_id, range) in ranges.ranges.iter() {
1680            assert!(*range_id < 3);
1681            assert!(range.num_rows() > 0);
1682            assert!(range.is_record_batch());
1683        }
1684    }
1685
1686    #[test]
1687    fn test_bulk_memtable_ranges_with_sequence_filter() {
1688        let metadata = metadata_for_test();
1689        let memtable = BulkMemtable::new(
1690            888,
1691            BulkMemtableConfig::default(),
1692            metadata.clone(),
1693            None,
1694            None,
1695            false,
1696            MergeMode::LastRow,
1697        );
1698
1699        let part = create_bulk_part_with_converter(
1700            "seq_test",
1701            1,
1702            vec![1000, 2000, 3000],
1703            vec![Some(10.0), Some(20.0), Some(30.0)],
1704            500,
1705        )
1706        .unwrap();
1707
1708        memtable.write_bulk(part).unwrap();
1709
1710        let predicate_group = PredicateGroup::new(&metadata, &[]).unwrap();
1711        let sequence_filter = Some(SequenceRange::LtEq { max: 400 }); // Filters out rows with sequence > 400
1712        let ranges = memtable
1713            .ranges(
1714                None,
1715                RangesOptions::default()
1716                    .with_predicate(predicate_group)
1717                    .with_sequence(sequence_filter),
1718            )
1719            .unwrap();
1720
1721        assert_eq!(1, ranges.ranges.len());
1722        let range = ranges.ranges.get(&0).unwrap();
1723
1724        let mut record_batch_iter = range.build_record_batch_iter(None, None).unwrap();
1725        assert!(record_batch_iter.next().is_none());
1726    }
1727
1728    #[test]
1729    fn test_bulk_memtable_ranges_with_encoded_parts() {
1730        let metadata = metadata_for_test();
1731        let config = BulkMemtableConfig {
1732            merge_threshold: 8,
1733            ..Default::default()
1734        };
1735        let memtable = BulkMemtable::new(
1736            999,
1737            config,
1738            metadata.clone(),
1739            None,
1740            None,
1741            false,
1742            MergeMode::LastRow,
1743        );
1744        // Disable unordered_part for this test
1745        memtable.set_unordered_part_threshold(0);
1746
1747        // Adds enough bulk parts to trigger encoding
1748        for i in 0..10 {
1749            let part = create_bulk_part_with_converter(
1750                &format!("key_{}", i),
1751                i,
1752                vec![1000 + i as i64 * 100],
1753                vec![Some(i as f64 * 10.0)],
1754                100 + i as u64,
1755            )
1756            .unwrap();
1757            memtable.write_bulk(part).unwrap();
1758        }
1759
1760        memtable.compact(false).unwrap();
1761
1762        let predicate_group = PredicateGroup::new(&metadata, &[]).unwrap();
1763        let ranges = memtable
1764            .ranges(
1765                None,
1766                RangesOptions::default().with_predicate(predicate_group),
1767            )
1768            .unwrap();
1769
1770        // Should have ranges for both bulk parts and encoded parts
1771        assert_eq!(3, ranges.ranges.len());
1772        let total_rows: usize = ranges.ranges.values().map(|r| r.stats().num_rows()).sum();
1773        assert_eq!(10, total_rows);
1774
1775        for (_range_id, range) in ranges.ranges.iter() {
1776            assert!(range.num_rows() > 0);
1777            assert!(range.is_record_batch());
1778
1779            let record_batch_iter = range.build_record_batch_iter(None, None).unwrap();
1780            let mut total_rows = 0;
1781            for batch_result in record_batch_iter {
1782                let batch = batch_result.unwrap();
1783                total_rows += batch.num_rows();
1784                assert!(batch.num_rows() > 0);
1785            }
1786            assert_eq!(total_rows, range.num_rows());
1787        }
1788    }
1789
1790    #[test]
1791    fn test_bulk_memtable_unordered_part() {
1792        let metadata = metadata_for_test();
1793        let memtable = BulkMemtable::new(
1794            1001,
1795            BulkMemtableConfig::default(),
1796            metadata.clone(),
1797            None,
1798            None,
1799            false,
1800            MergeMode::LastRow,
1801        );
1802
1803        // Set smaller thresholds for testing with smaller inputs
1804        // Accept parts with < 5 rows into unordered_part
1805        memtable.set_unordered_part_threshold(5);
1806        // Compact when total rows >= 10
1807        memtable.set_unordered_part_compact_threshold(10);
1808
1809        // Write 3 small parts (each has 2 rows), should be collected in unordered_part
1810        for i in 0..3 {
1811            let part = create_bulk_part_with_converter(
1812                &format!("key_{}", i),
1813                i,
1814                vec![1000 + i as i64 * 100, 1100 + i as i64 * 100],
1815                vec![Some(i as f64 * 10.0), Some(i as f64 * 10.0 + 1.0)],
1816                100 + i as u64,
1817            )
1818            .unwrap();
1819            assert_eq!(2, part.num_rows());
1820            memtable.write_bulk(part).unwrap();
1821        }
1822
1823        // Total rows = 6, not yet reaching compact threshold
1824        let stats = memtable.stats();
1825        assert_eq!(6, stats.num_rows);
1826
1827        // Write 2 more small parts (each has 2 rows)
1828        // This should trigger compaction when total >= 10
1829        for i in 3..5 {
1830            let part = create_bulk_part_with_converter(
1831                &format!("key_{}", i),
1832                i,
1833                vec![1000 + i as i64 * 100, 1100 + i as i64 * 100],
1834                vec![Some(i as f64 * 10.0), Some(i as f64 * 10.0 + 1.0)],
1835                100 + i as u64,
1836            )
1837            .unwrap();
1838            memtable.write_bulk(part).unwrap();
1839        }
1840
1841        // Total rows = 10, should have compacted unordered_part into a regular part
1842        let stats = memtable.stats();
1843        assert_eq!(10, stats.num_rows);
1844
1845        // Verify we can read all data correctly
1846        let predicate_group = PredicateGroup::new(&metadata, &[]).unwrap();
1847        let ranges = memtable
1848            .ranges(
1849                None,
1850                RangesOptions::default().with_predicate(predicate_group),
1851            )
1852            .unwrap();
1853
1854        // Should have at least 1 range (the compacted part)
1855        assert!(!ranges.ranges.is_empty());
1856        let total_rows: usize = ranges.ranges.values().map(|r| r.stats().num_rows()).sum();
1857        assert_eq!(10, total_rows);
1858
1859        // Read all data and verify
1860        let mut total_rows_read = 0;
1861        for (_range_id, range) in ranges.ranges.iter() {
1862            assert!(range.is_record_batch());
1863            let record_batch_iter = range.build_record_batch_iter(None, None).unwrap();
1864
1865            for batch_result in record_batch_iter {
1866                let batch = batch_result.unwrap();
1867                total_rows_read += batch.num_rows();
1868            }
1869        }
1870        assert_eq!(10, total_rows_read);
1871    }
1872
1873    #[test]
1874    fn test_bulk_memtable_unordered_part_mixed_sizes() {
1875        let metadata = metadata_for_test();
1876        let memtable = BulkMemtable::new(
1877            1002,
1878            BulkMemtableConfig::default(),
1879            metadata.clone(),
1880            None,
1881            None,
1882            false,
1883            MergeMode::LastRow,
1884        );
1885
1886        // Set threshold to 4 rows - parts with < 4 rows go to unordered_part
1887        memtable.set_unordered_part_threshold(4);
1888        memtable.set_unordered_part_compact_threshold(8);
1889
1890        // Write small parts (3 rows each) - should go to unordered_part
1891        for i in 0..2 {
1892            let part = create_bulk_part_with_converter(
1893                &format!("small_{}", i),
1894                i,
1895                vec![1000 + i as i64, 2000 + i as i64, 3000 + i as i64],
1896                vec![Some(i as f64), Some(i as f64 + 1.0), Some(i as f64 + 2.0)],
1897                10 + i as u64,
1898            )
1899            .unwrap();
1900            assert_eq!(3, part.num_rows());
1901            memtable.write_bulk(part).unwrap();
1902        }
1903
1904        // Write a large part (5 rows) - should go directly to regular parts
1905        let large_part = create_bulk_part_with_converter(
1906            "large_key",
1907            100,
1908            vec![5000, 6000, 7000, 8000, 9000],
1909            vec![
1910                Some(100.0),
1911                Some(101.0),
1912                Some(102.0),
1913                Some(103.0),
1914                Some(104.0),
1915            ],
1916            50,
1917        )
1918        .unwrap();
1919        assert_eq!(5, large_part.num_rows());
1920        memtable.write_bulk(large_part).unwrap();
1921
1922        // Write another small part (2 rows) - should trigger compaction of unordered_part
1923        let part = create_bulk_part_with_converter(
1924            "small_2",
1925            2,
1926            vec![4000, 4100],
1927            vec![Some(20.0), Some(21.0)],
1928            30,
1929        )
1930        .unwrap();
1931        memtable.write_bulk(part).unwrap();
1932
1933        let stats = memtable.stats();
1934        assert_eq!(13, stats.num_rows); // 3 + 3 + 5 + 2 = 13
1935
1936        // Verify all data can be read
1937        let predicate_group = PredicateGroup::new(&metadata, &[]).unwrap();
1938        let ranges = memtable
1939            .ranges(
1940                None,
1941                RangesOptions::default().with_predicate(predicate_group),
1942            )
1943            .unwrap();
1944
1945        let total_rows: usize = ranges.ranges.values().map(|r| r.stats().num_rows()).sum();
1946        assert_eq!(13, total_rows);
1947
1948        let mut total_rows_read = 0;
1949        for (_range_id, range) in ranges.ranges.iter() {
1950            let record_batch_iter = range.build_record_batch_iter(None, None).unwrap();
1951            for batch_result in record_batch_iter {
1952                let batch = batch_result.unwrap();
1953                total_rows_read += batch.num_rows();
1954            }
1955        }
1956        assert_eq!(13, total_rows_read);
1957    }
1958
1959    #[test]
1960    fn test_bulk_memtable_unordered_part_with_ranges() {
1961        let metadata = metadata_for_test();
1962        let memtable = BulkMemtable::new(
1963            1003,
1964            BulkMemtableConfig::default(),
1965            metadata.clone(),
1966            None,
1967            None,
1968            false,
1969            MergeMode::LastRow,
1970        );
1971
1972        // Set small thresholds
1973        memtable.set_unordered_part_threshold(3);
1974        memtable.set_unordered_part_compact_threshold(100); // High threshold to prevent auto-compaction
1975
1976        // Write several small parts that stay in unordered_part
1977        for i in 0..3 {
1978            let part = create_bulk_part_with_converter(
1979                &format!("key_{}", i),
1980                i,
1981                vec![1000 + i as i64 * 100],
1982                vec![Some(i as f64 * 10.0)],
1983                100 + i as u64,
1984            )
1985            .unwrap();
1986            assert_eq!(1, part.num_rows());
1987            memtable.write_bulk(part).unwrap();
1988        }
1989
1990        let stats = memtable.stats();
1991        assert_eq!(3, stats.num_rows);
1992
1993        // Test that ranges() can correctly read from unordered_part
1994        let predicate_group = PredicateGroup::new(&metadata, &[]).unwrap();
1995        let ranges = memtable
1996            .ranges(
1997                None,
1998                RangesOptions::default().with_predicate(predicate_group),
1999            )
2000            .unwrap();
2001
2002        // Should have 1 range for the unordered_part
2003        assert_eq!(1, ranges.ranges.len());
2004        let total_rows: usize = ranges.ranges.values().map(|r| r.stats().num_rows()).sum();
2005        assert_eq!(3, total_rows);
2006
2007        // Verify data is sorted correctly in the range
2008        let range = ranges.ranges.get(&0).unwrap();
2009        let record_batch_iter = range.build_record_batch_iter(None, None).unwrap();
2010
2011        let mut total_rows = 0;
2012        for batch_result in record_batch_iter {
2013            let batch = batch_result.unwrap();
2014            total_rows += batch.num_rows();
2015            // Verify data is properly sorted by primary key
2016            assert!(batch.num_rows() > 0);
2017        }
2018        assert_eq!(3, total_rows);
2019    }
2020
2021    /// Helper to create a BulkPartWrapper from a BulkPart.
2022    fn create_bulk_part_wrapper(part: BulkPart) -> BulkPartWrapper {
2023        BulkPartWrapper {
2024            part: PartToMerge::Bulk {
2025                part,
2026                file_id: FileId::random(),
2027            },
2028            merging: false,
2029        }
2030    }
2031
2032    #[test]
2033    fn test_should_merge_parts_below_threshold() {
2034        let mut bulk_parts = BulkParts::default();
2035
2036        // Add 7 bulk parts (below DEFAULT_MERGE_THRESHOLD of 8)
2037        for i in 0..DEFAULT_MERGE_THRESHOLD - 1 {
2038            let part = create_bulk_part_with_converter(
2039                &format!("key_{}", i),
2040                i as u32,
2041                vec![1000 + i as i64 * 100],
2042                vec![Some(i as f64 * 10.0)],
2043                100 + i as u64,
2044            )
2045            .unwrap();
2046            bulk_parts.parts.push(create_bulk_part_wrapper(part));
2047        }
2048
2049        // Should not trigger merge since we have only 7 parts
2050        assert!(!bulk_parts.should_merge_parts(DEFAULT_MERGE_THRESHOLD));
2051    }
2052
2053    #[test]
2054    fn test_should_merge_parts_at_threshold() {
2055        let mut bulk_parts = BulkParts::default();
2056        let merge_threshold = 8;
2057
2058        // Add 8 bulk parts (at merge_threshold)
2059        for i in 0..merge_threshold {
2060            let part = create_bulk_part_with_converter(
2061                &format!("key_{}", i),
2062                i as u32,
2063                vec![1000 + i as i64 * 100],
2064                vec![Some(i as f64 * 10.0)],
2065                100 + i as u64,
2066            )
2067            .unwrap();
2068            bulk_parts.parts.push(create_bulk_part_wrapper(part));
2069        }
2070
2071        // Should trigger merge since we have 8 parts
2072        assert!(bulk_parts.should_merge_parts(merge_threshold));
2073    }
2074
2075    #[test]
2076    fn test_should_merge_parts_with_merging_flag() {
2077        let mut bulk_parts = BulkParts::default();
2078        let merge_threshold = 8;
2079
2080        // Add 10 bulk parts
2081        for i in 0..10 {
2082            let part = create_bulk_part_with_converter(
2083                &format!("key_{}", i),
2084                i as u32,
2085                vec![1000 + i as i64 * 100],
2086                vec![Some(i as f64 * 10.0)],
2087                100 + i as u64,
2088            )
2089            .unwrap();
2090            bulk_parts.parts.push(create_bulk_part_wrapper(part));
2091        }
2092
2093        // Should trigger merge since we have 10 parts
2094        assert!(bulk_parts.should_merge_parts(merge_threshold));
2095
2096        // Mark first 3 parts as merging
2097        for wrapper in bulk_parts.parts.iter_mut().take(3) {
2098            wrapper.merging = true;
2099        }
2100
2101        // Now only 7 parts are available for merging, should not trigger
2102        assert!(!bulk_parts.should_merge_parts(merge_threshold));
2103    }
2104
2105    #[test]
2106    fn test_collect_parts_to_merge_grouping() {
2107        let mut bulk_parts = BulkParts::default();
2108
2109        // Add 16 bulk parts with different row counts
2110        for i in 0..16 {
2111            let num_rows = (i % 4) + 1; // 1 to 4 rows
2112            let timestamps: Vec<i64> = (0..num_rows)
2113                .map(|j| 1000 + i as i64 * 100 + j as i64)
2114                .collect();
2115            let values: Vec<Option<f64>> =
2116                (0..num_rows).map(|j| Some((i * 10 + j) as f64)).collect();
2117            let part = create_bulk_part_with_converter(
2118                &format!("key_{}", i),
2119                i as u32,
2120                timestamps,
2121                values,
2122                100 + i as u64,
2123            )
2124            .unwrap();
2125            bulk_parts.parts.push(create_bulk_part_wrapper(part));
2126        }
2127
2128        // Should trigger merge since we have 16 parts
2129        assert!(bulk_parts.should_merge_parts(DEFAULT_MERGE_THRESHOLD));
2130
2131        // Collect parts to merge
2132        let collected =
2133            bulk_parts.collect_parts_to_merge(DEFAULT_MERGE_THRESHOLD, DEFAULT_MAX_MERGE_GROUPS);
2134
2135        // Should have groups
2136        assert!(!collected.groups.is_empty());
2137
2138        // All groups should have parts
2139        for group in &collected.groups {
2140            assert!(!group.is_empty());
2141        }
2142
2143        // Total parts collected should be 16
2144        let total_parts: usize = collected.groups.iter().map(|g| g.len()).sum();
2145        assert_eq!(16, total_parts);
2146    }
2147
2148    #[test]
2149    fn test_bulk_memtable_ranges_with_multi_bulk_part() {
2150        let metadata = metadata_for_test();
2151        let merge_threshold = 8;
2152        let config = BulkMemtableConfig {
2153            merge_threshold,
2154            ..Default::default()
2155        };
2156        let memtable = BulkMemtable::new(
2157            2005,
2158            config,
2159            metadata.clone(),
2160            None,
2161            None,
2162            false,
2163            MergeMode::LastRow,
2164        );
2165        // Disable unordered_part for this test
2166        memtable.set_unordered_part_threshold(0);
2167
2168        // Write enough bulk parts to trigger merge (merge_threshold = 8)
2169        // Each part has small number of rows so total < DEFAULT_ROW_GROUP_SIZE
2170        // This will result in MultiBulkPart after compaction
2171        for i in 0..merge_threshold {
2172            let part = create_bulk_part_with_converter(
2173                &format!("key_{}", i),
2174                i as u32,
2175                vec![1000 + i as i64 * 100, 2000 + i as i64 * 100],
2176                vec![Some(i as f64 * 10.0), Some(i as f64 * 10.0 + 1.0)],
2177                100 + i as u64,
2178            )
2179            .unwrap();
2180            memtable.write_bulk(part).unwrap();
2181        }
2182
2183        // Compact to trigger MultiBulkPart creation (since total rows < DEFAULT_ROW_GROUP_SIZE)
2184        memtable.compact(false).unwrap();
2185
2186        // Verify we can read from the memtable
2187        let predicate_group = PredicateGroup::new(&metadata, &[]).unwrap();
2188        let ranges = memtable
2189            .ranges(
2190                None,
2191                RangesOptions::default().with_predicate(predicate_group),
2192            )
2193            .unwrap();
2194
2195        assert_eq!(1, ranges.ranges.len());
2196        let expected_rows = merge_threshold * 2; // Each part has 2 rows
2197        let total_rows: usize = ranges.ranges.values().map(|r| r.stats().num_rows()).sum();
2198        assert_eq!(expected_rows, total_rows);
2199
2200        // Read all data
2201        let mut total_rows_read = 0;
2202        for (_range_id, range) in ranges.ranges.iter() {
2203            assert!(range.is_record_batch());
2204            let record_batch_iter = range.build_record_batch_iter(None, None).unwrap();
2205
2206            for batch_result in record_batch_iter {
2207                let batch = batch_result.unwrap();
2208                total_rows_read += batch.num_rows();
2209            }
2210        }
2211        assert_eq!(expected_rows, total_rows_read);
2212    }
2213}