Skip to main content

mito2/memtable/bulk/
part_reader.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::collections::VecDeque;
16use std::time::Instant;
17
18use datatypes::arrow::array::BooleanArray;
19use datatypes::arrow::record_batch::RecordBatch;
20use mito_codec::row_converter::PrimaryKeyFilter;
21use parquet::arrow::ProjectionMask;
22use parquet::arrow::arrow_reader::ParquetRecordBatchReader;
23use snafu::ResultExt;
24use store_api::storage::SequenceRange;
25
26use crate::error::{self, ComputeArrowSnafu, DecodeArrowRowGroupSnafu};
27use crate::memtable::bulk::context::{BulkIterContext, BulkIterContextRef};
28use crate::memtable::bulk::part::EncodedBulkPart;
29use crate::memtable::bulk::row_group_reader::MemtableRowGroupReaderBuilder;
30use crate::memtable::{MemScanMetrics, MemScanMetricsData};
31use crate::metrics::{READ_ROWS_TOTAL, READ_STAGE_ELAPSED};
32use crate::sst::parquet::file_range::TagDecodeState;
33use crate::sst::parquet::flat_format::{primary_key_column_index, sequence_column_index};
34use crate::sst::parquet::prefilter::{CachedPrimaryKeyFilter, prefilter_flat_batch_by_primary_key};
35
36/// Iterator for reading data inside a bulk part.
37pub struct EncodedBulkPartIter {
38    context: BulkIterContextRef,
39    row_groups_to_read: VecDeque<usize>,
40    current_reader: Option<ParquetRecordBatchReader>,
41    builder: MemtableRowGroupReaderBuilder,
42    /// Sequence number filter.
43    sequence: Option<SequenceRange>,
44    /// Cached skip_fields for current row group.
45    current_skip_fields: bool,
46    /// Primary key filter for prefiltering before convert_batch.
47    pk_filter: Option<CachedPrimaryKeyFilter>,
48    /// Metrics for this iterator.
49    metrics: MemScanMetricsData,
50    /// Optional memory scan metrics to report to.
51    mem_scan_metrics: Option<MemScanMetrics>,
52}
53
54impl EncodedBulkPartIter {
55    /// Creates a new [BulkPartIter].
56    pub fn try_new(
57        encoded_part: &EncodedBulkPart,
58        context: BulkIterContextRef,
59        mut row_groups_to_read: VecDeque<usize>,
60        sequence: Option<SequenceRange>,
61        mem_scan_metrics: Option<MemScanMetrics>,
62    ) -> error::Result<Self> {
63        let parquet_meta = encoded_part.metadata().parquet_metadata.clone();
64        let data = encoded_part.data().clone();
65        let series_count = encoded_part.metadata().num_series as usize;
66
67        let projection_mask = ProjectionMask::roots(
68            parquet_meta.file_metadata().schema_descr(),
69            context.read_format().projection_indices().iter().copied(),
70        );
71        let builder =
72            MemtableRowGroupReaderBuilder::try_new(&context, projection_mask, parquet_meta, data)?;
73
74        // Build PK filter if applicable (flat format with dictionary-encoded PKs).
75        let pk_filter = context.build_pk_filter();
76
77        let (init_reader, current_skip_fields) = match row_groups_to_read.pop_front() {
78            Some(first_row_group) => {
79                let skip_fields = context.pre_filter_mode().skip_fields();
80                let reader = builder.build_row_group_reader(first_row_group, None)?;
81                (Some(reader), skip_fields)
82            }
83            None => (None, false),
84        };
85
86        Ok(Self {
87            context,
88            row_groups_to_read,
89            current_reader: init_reader,
90            builder,
91            sequence,
92            current_skip_fields,
93            pk_filter,
94            metrics: MemScanMetricsData {
95                total_series: series_count,
96                ..Default::default()
97            },
98            mem_scan_metrics,
99        })
100    }
101
102    fn report_mem_scan_metrics(&mut self) {
103        if let Some(mem_scan_metrics) = self.mem_scan_metrics.take() {
104            mem_scan_metrics.merge_inner(&self.metrics);
105        }
106    }
107
108    /// Fetches next non-empty record batch.
109    pub(crate) fn next_record_batch(&mut self) -> error::Result<Option<RecordBatch>> {
110        let start = Instant::now();
111
112        let Some(current) = &mut self.current_reader else {
113            // All row group exhausted.
114            self.metrics.scan_cost += start.elapsed();
115            return Ok(None);
116        };
117
118        for batch in current {
119            let batch = batch.context(DecodeArrowRowGroupSnafu)?;
120            if let Some(batch) = apply_combined_filters(
121                &self.context,
122                &self.sequence,
123                batch,
124                self.current_skip_fields,
125                self.pk_filter
126                    .as_mut()
127                    .map(|f| f as &mut dyn PrimaryKeyFilter),
128                &mut self.metrics,
129            )? {
130                // Update metrics
131                self.metrics.num_batches += 1;
132                self.metrics.num_rows += batch.num_rows();
133                self.metrics.scan_cost += start.elapsed();
134                return Ok(Some(batch));
135            }
136        }
137
138        // Previous row group exhausted, read next row group
139        while let Some(next_row_group) = self.row_groups_to_read.pop_front() {
140            // Compute skip_fields for this row group
141            self.current_skip_fields = self.context.pre_filter_mode().skip_fields();
142
143            let next_reader = self.builder.build_row_group_reader(next_row_group, None)?;
144            let current = self.current_reader.insert(next_reader);
145
146            for batch in current {
147                let batch = batch.context(DecodeArrowRowGroupSnafu)?;
148                if let Some(batch) = apply_combined_filters(
149                    &self.context,
150                    &self.sequence,
151                    batch,
152                    self.current_skip_fields,
153                    self.pk_filter
154                        .as_mut()
155                        .map(|f| f as &mut dyn PrimaryKeyFilter),
156                    &mut self.metrics,
157                )? {
158                    // Update metrics
159                    self.metrics.num_batches += 1;
160                    self.metrics.num_rows += batch.num_rows();
161                    self.metrics.scan_cost += start.elapsed();
162                    return Ok(Some(batch));
163                }
164            }
165        }
166
167        self.metrics.scan_cost += start.elapsed();
168        Ok(None)
169    }
170}
171
172impl Iterator for EncodedBulkPartIter {
173    type Item = error::Result<RecordBatch>;
174
175    fn next(&mut self) -> Option<Self::Item> {
176        let result = self.next_record_batch().transpose();
177
178        // Report metrics when iteration is complete
179        if result.is_none() {
180            self.report_mem_scan_metrics();
181        }
182
183        result
184    }
185}
186
187impl Drop for EncodedBulkPartIter {
188    fn drop(&mut self) {
189        common_telemetry::debug!(
190            "EncodedBulkPartIter region: {}, metrics: total_series={}, num_rows={}, num_batches={}, scan_cost={:?}, prefilter_cost={:?}, prefilter_rows_filtered={}",
191            self.context.region_id(),
192            self.metrics.total_series,
193            self.metrics.num_rows,
194            self.metrics.num_batches,
195            self.metrics.scan_cost,
196            self.metrics.prefilter_cost,
197            self.metrics.prefilter_rows_filtered
198        );
199
200        // Report MemScanMetrics if not already reported
201        self.report_mem_scan_metrics();
202
203        READ_ROWS_TOTAL
204            .with_label_values(&["bulk_memtable"])
205            .inc_by(self.metrics.num_rows as u64);
206        READ_STAGE_ELAPSED
207            .with_label_values(&["scan_memtable"])
208            .observe(self.metrics.scan_cost.as_secs_f64());
209    }
210}
211
212/// Iterator for reading record batches from a bulk part.
213///
214/// Iterates through one or more RecordBatches, applying filters and projections.
215pub struct BulkPartBatchIter {
216    /// Queue of RecordBatches to process.
217    batches: VecDeque<RecordBatch>,
218    /// Iterator context for filtering and projection.
219    context: BulkIterContextRef,
220    /// Sequence number filter.
221    sequence: Option<SequenceRange>,
222    /// Primary key filter for prefiltering before convert_batch.
223    pk_filter: Option<CachedPrimaryKeyFilter>,
224    /// Metrics for this iterator.
225    metrics: MemScanMetricsData,
226    /// Optional memory scan metrics to report to.
227    mem_scan_metrics: Option<MemScanMetrics>,
228}
229
230impl BulkPartBatchIter {
231    /// Creates a new [BulkPartBatchIter] from multiple RecordBatches.
232    pub fn new(
233        batches: Vec<RecordBatch>,
234        context: BulkIterContextRef,
235        sequence: Option<SequenceRange>,
236        series_count: usize,
237        mem_scan_metrics: Option<MemScanMetrics>,
238    ) -> Self {
239        let pk_filter = context.build_pk_filter();
240
241        Self {
242            batches: VecDeque::from(batches),
243            context,
244            sequence,
245            pk_filter,
246            metrics: MemScanMetricsData {
247                total_series: series_count,
248                ..Default::default()
249            },
250            mem_scan_metrics,
251        }
252    }
253
254    /// Creates a new [BulkPartBatchIter] from a single RecordBatch.
255    pub fn from_single(
256        record_batch: RecordBatch,
257        context: BulkIterContextRef,
258        sequence: Option<SequenceRange>,
259        series_count: usize,
260        mem_scan_metrics: Option<MemScanMetrics>,
261    ) -> Self {
262        Self::new(
263            vec![record_batch],
264            context,
265            sequence,
266            series_count,
267            mem_scan_metrics,
268        )
269    }
270
271    fn report_mem_scan_metrics(&mut self) {
272        if let Some(mem_scan_metrics) = self.mem_scan_metrics.take() {
273            mem_scan_metrics.merge_inner(&self.metrics);
274        }
275    }
276
277    /// Applies projection to the RecordBatch if needed.
278    fn apply_projection(&self, record_batch: RecordBatch) -> error::Result<RecordBatch> {
279        let projection_indices = self.context.read_format().projection_indices();
280        if projection_indices.len() == record_batch.num_columns() {
281            return Ok(record_batch);
282        }
283
284        record_batch
285            .project(projection_indices)
286            .context(ComputeArrowSnafu)
287    }
288
289    fn process_batch(&mut self, record_batch: RecordBatch) -> error::Result<Option<RecordBatch>> {
290        let start = Instant::now();
291
292        // Apply projection first.
293        let projected_batch = self.apply_projection(record_batch)?;
294
295        // Apply combined filtering (both predicate and sequence filters)
296        let skip_fields = self.context.pre_filter_mode().skip_fields();
297
298        let Some(filtered_batch) = apply_combined_filters(
299            &self.context,
300            &self.sequence,
301            projected_batch,
302            skip_fields,
303            self.pk_filter
304                .as_mut()
305                .map(|f| f as &mut dyn PrimaryKeyFilter),
306            &mut self.metrics,
307        )?
308        else {
309            self.metrics.scan_cost += start.elapsed();
310            return Ok(None);
311        };
312
313        // Update metrics
314        self.metrics.num_batches += 1;
315        self.metrics.num_rows += filtered_batch.num_rows();
316        self.metrics.scan_cost += start.elapsed();
317
318        Ok(Some(filtered_batch))
319    }
320}
321
322impl Iterator for BulkPartBatchIter {
323    type Item = error::Result<RecordBatch>;
324
325    fn next(&mut self) -> Option<Self::Item> {
326        // Process batches until we find a non-empty one or run out
327        while let Some(batch) = self.batches.pop_front() {
328            match self.process_batch(batch) {
329                Ok(Some(result)) => return Some(Ok(result)),
330                Ok(None) => continue, // This batch was filtered out, try next
331                Err(e) => {
332                    self.report_mem_scan_metrics();
333                    return Some(Err(e));
334                }
335            }
336        }
337
338        // No more batches
339        self.report_mem_scan_metrics();
340        None
341    }
342}
343
344impl Drop for BulkPartBatchIter {
345    fn drop(&mut self) {
346        common_telemetry::debug!(
347            "BulkPartBatchIter region: {}, metrics: total_series={}, num_rows={}, num_batches={}, scan_cost={:?}, prefilter_cost={:?}, prefilter_rows_filtered={}",
348            self.context.region_id(),
349            self.metrics.total_series,
350            self.metrics.num_rows,
351            self.metrics.num_batches,
352            self.metrics.scan_cost,
353            self.metrics.prefilter_cost,
354            self.metrics.prefilter_rows_filtered
355        );
356
357        // Report MemScanMetrics if not already reported
358        self.report_mem_scan_metrics();
359
360        READ_ROWS_TOTAL
361            .with_label_values(&["bulk_memtable"])
362            .inc_by(self.metrics.num_rows as u64);
363        READ_STAGE_ELAPSED
364            .with_label_values(&["scan_memtable"])
365            .observe(self.metrics.scan_cost.as_secs_f64());
366    }
367}
368
369/// Applies both predicate filtering and sequence filtering in a single pass.
370/// Returns None if the filtered batch is empty.
371///
372/// # Panics
373/// Panics if the format is not flat.
374fn apply_combined_filters(
375    context: &BulkIterContext,
376    sequence: &Option<SequenceRange>,
377    record_batch: RecordBatch,
378    skip_fields: bool,
379    pk_filter: Option<&mut dyn PrimaryKeyFilter>,
380    metrics: &mut MemScanMetricsData,
381) -> error::Result<Option<RecordBatch>> {
382    // Apply PK prefilter on raw batch before convert_batch to reduce conversion overhead.
383    let record_batch = if let Some(pk_filter) = pk_filter {
384        let rows_before = record_batch.num_rows();
385        let prefilter_start = Instant::now();
386        let pk_col_idx = primary_key_column_index(record_batch.num_columns());
387        match prefilter_flat_batch_by_primary_key(record_batch, pk_col_idx, pk_filter)? {
388            Some(batch) => {
389                metrics.prefilter_cost += prefilter_start.elapsed();
390                metrics.prefilter_rows_filtered += rows_before - batch.num_rows();
391                batch
392            }
393            None => {
394                metrics.prefilter_cost += prefilter_start.elapsed();
395                metrics.prefilter_rows_filtered += rows_before;
396                return Ok(None);
397            }
398        }
399    } else {
400        record_batch
401    };
402
403    // Converts the format to the flat format.
404    let record_batch = context.read_format().convert_batch(record_batch, None)?;
405
406    let num_rows = record_batch.num_rows();
407    let mut combined_filter = None;
408    let mut tag_decode_state = TagDecodeState::new();
409
410    // Apply predicate filters using the shared method.
411    if !context.base.filters.is_empty() {
412        let predicate_mask = context.base.compute_filter_mask_flat(
413            &record_batch,
414            skip_fields,
415            &mut tag_decode_state,
416        )?;
417        // If predicate filters out the entire batch, return None early
418        let Some(mask) = predicate_mask else {
419            return Ok(None);
420        };
421        combined_filter = Some(BooleanArray::from(mask));
422    }
423
424    // Filters rows by the given `sequence`. Only preserves rows with sequence less than or equal to `sequence`.
425    if let Some(sequence) = sequence {
426        let sequence_column =
427            record_batch.column(sequence_column_index(record_batch.num_columns()));
428        let sequence_filter = sequence
429            .filter(&sequence_column)
430            .context(ComputeArrowSnafu)?;
431        // Combine with existing filter using AND operation
432        combined_filter = match combined_filter {
433            None => Some(sequence_filter),
434            Some(existing_filter) => {
435                let and_result = datatypes::arrow::compute::and(&existing_filter, &sequence_filter)
436                    .context(ComputeArrowSnafu)?;
437                Some(and_result)
438            }
439        };
440    }
441
442    // Apply the combined filter if any filters were applied
443    let Some(filter_array) = combined_filter else {
444        // No filters applied, return original batch
445        return Ok(Some(record_batch));
446    };
447    let select_count = filter_array.true_count();
448    if select_count == 0 {
449        return Ok(None);
450    }
451    if select_count == num_rows {
452        return Ok(Some(record_batch));
453    }
454    let filtered_batch =
455        datatypes::arrow::compute::filter_record_batch(&record_batch, &filter_array)
456            .context(ComputeArrowSnafu)?;
457
458    Ok(Some(filtered_batch))
459}
460
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use api::v1::SemanticType;
    use datafusion_expr::{col, lit};
    use datatypes::arrow::array::{
        ArrayRef, BinaryArray, DictionaryArray, Int64Array, StringArray,
        TimestampMillisecondArray, UInt8Array, UInt32Array, UInt64Array,
    };
    use datatypes::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
    use datatypes::data_type::ConcreteDataType;
    use datatypes::schema::ColumnSchema;
    use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder};
    use store_api::storage::RegionId;
    use table::predicate::Predicate;

    use super::*;
    use crate::memtable::bulk::context::BulkIterContext;
    use crate::test_util::sst_util::new_primary_key;

    /// Arrow schema shared by the tests: one tag, one field, the timestamp and
    /// the three internal columns.
    fn new_test_schema() -> Arc<Schema> {
        Arc::new(Schema::new(vec![
            Field::new("key1", DataType::Utf8, false),
            Field::new("field1", DataType::Int64, false),
            Field::new(
                "timestamp",
                DataType::Timestamp(TimeUnit::Millisecond, None),
                false,
            ),
            Field::new(
                "__primary_key",
                DataType::Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Binary)),
                false,
            ),
            Field::new("__sequence", DataType::UInt64, false),
            Field::new("__op_type", DataType::UInt8, false),
        ]))
    }

    /// Builds a RecordBatch with one row per entry of `keys`.
    ///
    /// The primary key dictionary holds one properly encoded key per row and
    /// every row is a PUT operation. All slices must have the same length.
    fn new_test_batch(
        keys: &[&str],
        fields: &[i64],
        timestamps: &[i64],
        sequences: &[u64],
    ) -> RecordBatch {
        let key1 = Arc::new(StringArray::from_iter_values(keys.iter().copied()));
        let field1 = Arc::new(Int64Array::from(fields.to_vec()));
        let timestamp = Arc::new(TimestampMillisecondArray::from(timestamps.to_vec()));

        // Create primary key dictionary array with properly encoded PKs
        let encoded_pks: Vec<_> = keys.iter().map(|key| new_primary_key(&[*key])).collect();
        let values = Arc::new(BinaryArray::from_iter_values(
            encoded_pks.iter().map(|pk| pk.as_slice()),
        ));
        let dict_keys = UInt32Array::from_iter_values(0..keys.len() as u32);
        let primary_key = Arc::new(DictionaryArray::new(dict_keys, values));

        let sequence = Arc::new(UInt64Array::from(sequences.to_vec()));
        let op_type = Arc::new(UInt8Array::from(vec![1u8; keys.len()])); // PUT operations

        RecordBatch::try_new(
            new_test_schema(),
            vec![key1, field1, timestamp, primary_key, sequence, op_type],
        )
        .unwrap()
    }

    /// Minimal region metadata matching `new_test_schema`.
    fn new_test_region_metadata() -> Arc<RegionMetadata> {
        let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
        builder
            .push_column_metadata(ColumnMetadata {
                column_schema: ColumnSchema::new(
                    "key1",
                    ConcreteDataType::string_datatype(),
                    false,
                ),
                semantic_type: SemanticType::Tag,
                column_id: 0,
            })
            .push_column_metadata(ColumnMetadata {
                column_schema: ColumnSchema::new(
                    "field1",
                    ConcreteDataType::int64_datatype(),
                    false,
                ),
                semantic_type: SemanticType::Field,
                column_id: 1,
            })
            .push_column_metadata(ColumnMetadata {
                column_schema: ColumnSchema::new(
                    "timestamp",
                    ConcreteDataType::timestamp_millisecond_datatype(),
                    false,
                ),
                semantic_type: SemanticType::Timestamp,
                column_id: 2,
            })
            .primary_key(vec![0]);

        Arc::new(builder.build().unwrap())
    }

    #[test]
    fn test_bulk_part_batch_iter() {
        let record_batch = new_test_batch(
            &["key1", "key2", "key3"],
            &[11, 12, 13],
            &[1000, 2000, 3000],
            &[1, 2, 3],
        );
        let region_metadata = new_test_region_metadata();

        // Create context
        let context = Arc::new(
            BulkIterContext::new(
                region_metadata.clone(),
                None, // No projection
                None, // No predicate
                false,
            )
            .unwrap(),
        );
        // Iterates all rows.
        let iter =
            BulkPartBatchIter::from_single(record_batch.clone(), context.clone(), None, 0, None);
        let result: Vec<_> = iter.map(|rb| rb.unwrap()).collect();
        assert_eq!(1, result.len());
        assert_eq!(3, result[0].num_rows());
        assert_eq!(6, result[0].num_columns());

        // Creates iter with sequence filter (only include sequences <= 2)
        let iter = BulkPartBatchIter::from_single(
            record_batch.clone(),
            context,
            Some(SequenceRange::LtEq { max: 2 }),
            0,
            None,
        );
        let result: Vec<_> = iter.map(|rb| rb.unwrap()).collect();
        assert_eq!(1, result.len());
        let expect_sequence = Arc::new(UInt64Array::from(vec![1, 2])) as ArrayRef;
        assert_eq!(
            &expect_sequence,
            result[0].column(result[0].num_columns() - 2)
        );
        assert_eq!(6, result[0].num_columns());

        let context = Arc::new(
            BulkIterContext::new(
                region_metadata,
                Some(&[0, 2]),
                Some(Predicate::new(vec![col("key1").eq(lit("key2"))])),
                false,
            )
            .unwrap(),
        );
        // Creates iter with projection and predicate.
        let iter =
            BulkPartBatchIter::from_single(record_batch.clone(), context.clone(), None, 0, None);
        let result: Vec<_> = iter.map(|rb| rb.unwrap()).collect();
        assert_eq!(1, result.len());
        assert_eq!(1, result[0].num_rows());
        assert_eq!(5, result[0].num_columns());
        let expect_sequence = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
        assert_eq!(
            &expect_sequence,
            result[0].column(result[0].num_columns() - 2)
        );
    }

    #[test]
    fn test_bulk_part_batch_iter_multiple_batches() {
        // Create first batch with 2 rows
        let batch1 = new_test_batch(&["key1", "key2"], &[11, 12], &[1000, 2000], &[1, 2]);

        // Create second batch with 3 rows
        let batch2 = new_test_batch(
            &["key3", "key4", "key5"],
            &[13, 14, 15],
            &[3000, 4000, 5000],
            &[3, 4, 5],
        );

        // Create context
        let context = Arc::new(
            BulkIterContext::new(
                new_test_region_metadata(),
                None, // No projection
                None, // No predicate
                false,
            )
            .unwrap(),
        );

        // Create iterator with multiple batches
        let expect_batches = vec![batch1, batch2];
        let iter = BulkPartBatchIter::new(expect_batches.clone(), context.clone(), None, 0, None);

        // Collect all results
        let result: Vec<_> = iter.map(|rb| rb.unwrap()).collect();
        assert_eq!(expect_batches, result);
    }
}
765}