1use std::collections::VecDeque;
16use std::time::Instant;
17
18use datatypes::arrow::array::BooleanArray;
19use datatypes::arrow::record_batch::RecordBatch;
20use mito_codec::row_converter::PrimaryKeyFilter;
21use parquet::arrow::ProjectionMask;
22use parquet::arrow::arrow_reader::ParquetRecordBatchReader;
23use snafu::ResultExt;
24use store_api::storage::SequenceRange;
25
26use crate::error::{self, ComputeArrowSnafu, DecodeArrowRowGroupSnafu};
27use crate::memtable::bulk::context::{BulkIterContext, BulkIterContextRef};
28use crate::memtable::bulk::part::EncodedBulkPart;
29use crate::memtable::bulk::row_group_reader::MemtableRowGroupReaderBuilder;
30use crate::memtable::{MemScanMetrics, MemScanMetricsData};
31use crate::metrics::{READ_ROWS_TOTAL, READ_STAGE_ELAPSED};
32use crate::sst::parquet::file_range::TagDecodeState;
33use crate::sst::parquet::flat_format::{primary_key_column_index, sequence_column_index};
34use crate::sst::parquet::prefilter::{CachedPrimaryKeyFilter, prefilter_flat_batch_by_primary_key};
35
36pub struct EncodedBulkPartIter {
38 context: BulkIterContextRef,
39 row_groups_to_read: VecDeque<usize>,
40 current_reader: Option<ParquetRecordBatchReader>,
41 builder: MemtableRowGroupReaderBuilder,
42 sequence: Option<SequenceRange>,
44 current_skip_fields: bool,
46 pk_filter: Option<CachedPrimaryKeyFilter>,
48 metrics: MemScanMetricsData,
50 mem_scan_metrics: Option<MemScanMetrics>,
52}
53
54impl EncodedBulkPartIter {
55 pub fn try_new(
57 encoded_part: &EncodedBulkPart,
58 context: BulkIterContextRef,
59 mut row_groups_to_read: VecDeque<usize>,
60 sequence: Option<SequenceRange>,
61 mem_scan_metrics: Option<MemScanMetrics>,
62 ) -> error::Result<Self> {
63 let parquet_meta = encoded_part.metadata().parquet_metadata.clone();
64 let data = encoded_part.data().clone();
65 let series_count = encoded_part.metadata().num_series as usize;
66
67 let projection_mask = ProjectionMask::roots(
68 parquet_meta.file_metadata().schema_descr(),
69 context.read_format().projection_indices().iter().copied(),
70 );
71 let builder =
72 MemtableRowGroupReaderBuilder::try_new(&context, projection_mask, parquet_meta, data)?;
73
74 let pk_filter = context.build_pk_filter();
76
77 let (init_reader, current_skip_fields) = match row_groups_to_read.pop_front() {
78 Some(first_row_group) => {
79 let skip_fields = context.pre_filter_mode().skip_fields();
80 let reader = builder.build_row_group_reader(first_row_group, None)?;
81 (Some(reader), skip_fields)
82 }
83 None => (None, false),
84 };
85
86 Ok(Self {
87 context,
88 row_groups_to_read,
89 current_reader: init_reader,
90 builder,
91 sequence,
92 current_skip_fields,
93 pk_filter,
94 metrics: MemScanMetricsData {
95 total_series: series_count,
96 ..Default::default()
97 },
98 mem_scan_metrics,
99 })
100 }
101
102 fn report_mem_scan_metrics(&mut self) {
103 if let Some(mem_scan_metrics) = self.mem_scan_metrics.take() {
104 mem_scan_metrics.merge_inner(&self.metrics);
105 }
106 }
107
108 pub(crate) fn next_record_batch(&mut self) -> error::Result<Option<RecordBatch>> {
110 let start = Instant::now();
111
112 let Some(current) = &mut self.current_reader else {
113 self.metrics.scan_cost += start.elapsed();
115 return Ok(None);
116 };
117
118 for batch in current {
119 let batch = batch.context(DecodeArrowRowGroupSnafu)?;
120 if let Some(batch) = apply_combined_filters(
121 &self.context,
122 &self.sequence,
123 batch,
124 self.current_skip_fields,
125 self.pk_filter
126 .as_mut()
127 .map(|f| f as &mut dyn PrimaryKeyFilter),
128 &mut self.metrics,
129 )? {
130 self.metrics.num_batches += 1;
132 self.metrics.num_rows += batch.num_rows();
133 self.metrics.scan_cost += start.elapsed();
134 return Ok(Some(batch));
135 }
136 }
137
138 while let Some(next_row_group) = self.row_groups_to_read.pop_front() {
140 self.current_skip_fields = self.context.pre_filter_mode().skip_fields();
142
143 let next_reader = self.builder.build_row_group_reader(next_row_group, None)?;
144 let current = self.current_reader.insert(next_reader);
145
146 for batch in current {
147 let batch = batch.context(DecodeArrowRowGroupSnafu)?;
148 if let Some(batch) = apply_combined_filters(
149 &self.context,
150 &self.sequence,
151 batch,
152 self.current_skip_fields,
153 self.pk_filter
154 .as_mut()
155 .map(|f| f as &mut dyn PrimaryKeyFilter),
156 &mut self.metrics,
157 )? {
158 self.metrics.num_batches += 1;
160 self.metrics.num_rows += batch.num_rows();
161 self.metrics.scan_cost += start.elapsed();
162 return Ok(Some(batch));
163 }
164 }
165 }
166
167 self.metrics.scan_cost += start.elapsed();
168 Ok(None)
169 }
170}
171
172impl Iterator for EncodedBulkPartIter {
173 type Item = error::Result<RecordBatch>;
174
175 fn next(&mut self) -> Option<Self::Item> {
176 let result = self.next_record_batch().transpose();
177
178 if result.is_none() {
180 self.report_mem_scan_metrics();
181 }
182
183 result
184 }
185}
186
187impl Drop for EncodedBulkPartIter {
188 fn drop(&mut self) {
189 common_telemetry::debug!(
190 "EncodedBulkPartIter region: {}, metrics: total_series={}, num_rows={}, num_batches={}, scan_cost={:?}, prefilter_cost={:?}, prefilter_rows_filtered={}",
191 self.context.region_id(),
192 self.metrics.total_series,
193 self.metrics.num_rows,
194 self.metrics.num_batches,
195 self.metrics.scan_cost,
196 self.metrics.prefilter_cost,
197 self.metrics.prefilter_rows_filtered
198 );
199
200 self.report_mem_scan_metrics();
202
203 READ_ROWS_TOTAL
204 .with_label_values(&["bulk_memtable"])
205 .inc_by(self.metrics.num_rows as u64);
206 READ_STAGE_ELAPSED
207 .with_label_values(&["scan_memtable"])
208 .observe(self.metrics.scan_cost.as_secs_f64());
209 }
210}
211
212pub struct BulkPartBatchIter {
216 batches: VecDeque<RecordBatch>,
218 context: BulkIterContextRef,
220 sequence: Option<SequenceRange>,
222 pk_filter: Option<CachedPrimaryKeyFilter>,
224 metrics: MemScanMetricsData,
226 mem_scan_metrics: Option<MemScanMetrics>,
228}
229
230impl BulkPartBatchIter {
231 pub fn new(
233 batches: Vec<RecordBatch>,
234 context: BulkIterContextRef,
235 sequence: Option<SequenceRange>,
236 series_count: usize,
237 mem_scan_metrics: Option<MemScanMetrics>,
238 ) -> Self {
239 let pk_filter = context.build_pk_filter();
240
241 Self {
242 batches: VecDeque::from(batches),
243 context,
244 sequence,
245 pk_filter,
246 metrics: MemScanMetricsData {
247 total_series: series_count,
248 ..Default::default()
249 },
250 mem_scan_metrics,
251 }
252 }
253
254 pub fn from_single(
256 record_batch: RecordBatch,
257 context: BulkIterContextRef,
258 sequence: Option<SequenceRange>,
259 series_count: usize,
260 mem_scan_metrics: Option<MemScanMetrics>,
261 ) -> Self {
262 Self::new(
263 vec![record_batch],
264 context,
265 sequence,
266 series_count,
267 mem_scan_metrics,
268 )
269 }
270
271 fn report_mem_scan_metrics(&mut self) {
272 if let Some(mem_scan_metrics) = self.mem_scan_metrics.take() {
273 mem_scan_metrics.merge_inner(&self.metrics);
274 }
275 }
276
277 fn apply_projection(&self, record_batch: RecordBatch) -> error::Result<RecordBatch> {
279 let projection_indices = self.context.read_format().projection_indices();
280 if projection_indices.len() == record_batch.num_columns() {
281 return Ok(record_batch);
282 }
283
284 record_batch
285 .project(projection_indices)
286 .context(ComputeArrowSnafu)
287 }
288
289 fn process_batch(&mut self, record_batch: RecordBatch) -> error::Result<Option<RecordBatch>> {
290 let start = Instant::now();
291
292 let projected_batch = self.apply_projection(record_batch)?;
294
295 let skip_fields = self.context.pre_filter_mode().skip_fields();
297
298 let Some(filtered_batch) = apply_combined_filters(
299 &self.context,
300 &self.sequence,
301 projected_batch,
302 skip_fields,
303 self.pk_filter
304 .as_mut()
305 .map(|f| f as &mut dyn PrimaryKeyFilter),
306 &mut self.metrics,
307 )?
308 else {
309 self.metrics.scan_cost += start.elapsed();
310 return Ok(None);
311 };
312
313 self.metrics.num_batches += 1;
315 self.metrics.num_rows += filtered_batch.num_rows();
316 self.metrics.scan_cost += start.elapsed();
317
318 Ok(Some(filtered_batch))
319 }
320}
321
322impl Iterator for BulkPartBatchIter {
323 type Item = error::Result<RecordBatch>;
324
325 fn next(&mut self) -> Option<Self::Item> {
326 while let Some(batch) = self.batches.pop_front() {
328 match self.process_batch(batch) {
329 Ok(Some(result)) => return Some(Ok(result)),
330 Ok(None) => continue, Err(e) => {
332 self.report_mem_scan_metrics();
333 return Some(Err(e));
334 }
335 }
336 }
337
338 self.report_mem_scan_metrics();
340 None
341 }
342}
343
344impl Drop for BulkPartBatchIter {
345 fn drop(&mut self) {
346 common_telemetry::debug!(
347 "BulkPartBatchIter region: {}, metrics: total_series={}, num_rows={}, num_batches={}, scan_cost={:?}, prefilter_cost={:?}, prefilter_rows_filtered={}",
348 self.context.region_id(),
349 self.metrics.total_series,
350 self.metrics.num_rows,
351 self.metrics.num_batches,
352 self.metrics.scan_cost,
353 self.metrics.prefilter_cost,
354 self.metrics.prefilter_rows_filtered
355 );
356
357 self.report_mem_scan_metrics();
359
360 READ_ROWS_TOTAL
361 .with_label_values(&["bulk_memtable"])
362 .inc_by(self.metrics.num_rows as u64);
363 READ_STAGE_ELAPSED
364 .with_label_values(&["scan_memtable"])
365 .observe(self.metrics.scan_cost.as_secs_f64());
366 }
367}
368
369fn apply_combined_filters(
375 context: &BulkIterContext,
376 sequence: &Option<SequenceRange>,
377 record_batch: RecordBatch,
378 skip_fields: bool,
379 pk_filter: Option<&mut dyn PrimaryKeyFilter>,
380 metrics: &mut MemScanMetricsData,
381) -> error::Result<Option<RecordBatch>> {
382 let record_batch = if let Some(pk_filter) = pk_filter {
384 let rows_before = record_batch.num_rows();
385 let prefilter_start = Instant::now();
386 let pk_col_idx = primary_key_column_index(record_batch.num_columns());
387 match prefilter_flat_batch_by_primary_key(record_batch, pk_col_idx, pk_filter)? {
388 Some(batch) => {
389 metrics.prefilter_cost += prefilter_start.elapsed();
390 metrics.prefilter_rows_filtered += rows_before - batch.num_rows();
391 batch
392 }
393 None => {
394 metrics.prefilter_cost += prefilter_start.elapsed();
395 metrics.prefilter_rows_filtered += rows_before;
396 return Ok(None);
397 }
398 }
399 } else {
400 record_batch
401 };
402
403 let record_batch = context.read_format().convert_batch(record_batch, None)?;
405
406 let num_rows = record_batch.num_rows();
407 let mut combined_filter = None;
408 let mut tag_decode_state = TagDecodeState::new();
409
410 if !context.base.filters.is_empty() {
412 let predicate_mask = context.base.compute_filter_mask_flat(
413 &record_batch,
414 skip_fields,
415 &mut tag_decode_state,
416 )?;
417 let Some(mask) = predicate_mask else {
419 return Ok(None);
420 };
421 combined_filter = Some(BooleanArray::from(mask));
422 }
423
424 if let Some(sequence) = sequence {
426 let sequence_column =
427 record_batch.column(sequence_column_index(record_batch.num_columns()));
428 let sequence_filter = sequence
429 .filter(&sequence_column)
430 .context(ComputeArrowSnafu)?;
431 combined_filter = match combined_filter {
433 None => Some(sequence_filter),
434 Some(existing_filter) => {
435 let and_result = datatypes::arrow::compute::and(&existing_filter, &sequence_filter)
436 .context(ComputeArrowSnafu)?;
437 Some(and_result)
438 }
439 };
440 }
441
442 let Some(filter_array) = combined_filter else {
444 return Ok(Some(record_batch));
446 };
447 let select_count = filter_array.true_count();
448 if select_count == 0 {
449 return Ok(None);
450 }
451 if select_count == num_rows {
452 return Ok(Some(record_batch));
453 }
454 let filtered_batch =
455 datatypes::arrow::compute::filter_record_batch(&record_batch, &filter_array)
456 .context(ComputeArrowSnafu)?;
457
458 Ok(Some(filtered_batch))
459}
460
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use api::v1::SemanticType;
    use datafusion_expr::{col, lit};
    use datatypes::arrow::array::{
        ArrayRef, BinaryArray, DictionaryArray, Int64Array, StringArray,
        TimestampMillisecondArray, UInt8Array, UInt32Array, UInt64Array,
    };
    use datatypes::arrow::datatypes::{DataType, Field, Schema};
    use datatypes::data_type::ConcreteDataType;
    use datatypes::schema::ColumnSchema;
    use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder};
    use store_api::storage::RegionId;
    use table::predicate::Predicate;

    use super::*;
    use crate::memtable::bulk::context::BulkIterContext;
    use crate::test_util::sst_util::new_primary_key;

    /// Arrow schema shared by the tests: one tag, one field, the time index and the three
    /// internal columns.
    fn new_test_schema() -> Arc<Schema> {
        Arc::new(Schema::new(vec![
            Field::new("key1", DataType::Utf8, false),
            Field::new("field1", DataType::Int64, false),
            Field::new(
                "timestamp",
                DataType::Timestamp(datatypes::arrow::datatypes::TimeUnit::Millisecond, None),
                false,
            ),
            Field::new(
                "__primary_key",
                DataType::Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Binary)),
                false,
            ),
            Field::new("__sequence", DataType::UInt64, false),
            Field::new("__op_type", DataType::UInt8, false),
        ]))
    }

    /// Builds a record batch with one row per tag value.
    ///
    /// Each row gets its own dictionary entry holding the primary key encoded from its tag, and
    /// an op type of `1`. All slices must have the same length.
    fn new_test_batch(
        schema: Arc<Schema>,
        tags: &[&str],
        fields: &[i64],
        timestamps: &[i64],
        sequences: &[u64],
    ) -> RecordBatch {
        let primary_keys: Vec<_> = tags.iter().map(|tag| new_primary_key(&[*tag])).collect();
        let pk_values = Arc::new(BinaryArray::from_iter_values(
            primary_keys.iter().map(|pk| pk.as_slice()),
        ));
        let pk_keys = UInt32Array::from_iter_values(0..tags.len() as u32);
        let primary_key = Arc::new(DictionaryArray::new(pk_keys, pk_values));

        let columns = vec![
            Arc::new(StringArray::from_iter_values(tags.iter().copied())) as ArrayRef,
            Arc::new(Int64Array::from(fields.to_vec())) as ArrayRef,
            Arc::new(TimestampMillisecondArray::from(timestamps.to_vec())) as ArrayRef,
            primary_key as ArrayRef,
            Arc::new(UInt64Array::from(sequences.to_vec())) as ArrayRef,
            Arc::new(UInt8Array::from(vec![1u8; tags.len()])) as ArrayRef,
        ];

        RecordBatch::try_new(schema, columns).unwrap()
    }

    /// Region metadata matching `new_test_schema`: tag `key1` (the primary key), field `field1`
    /// and time index `timestamp`.
    fn new_test_region_metadata() -> RegionMetadata {
        let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
        builder
            .push_column_metadata(ColumnMetadata {
                column_schema: ColumnSchema::new(
                    "key1",
                    ConcreteDataType::string_datatype(),
                    false,
                ),
                semantic_type: SemanticType::Tag,
                column_id: 0,
            })
            .push_column_metadata(ColumnMetadata {
                column_schema: ColumnSchema::new(
                    "field1",
                    ConcreteDataType::int64_datatype(),
                    false,
                ),
                semantic_type: SemanticType::Field,
                column_id: 1,
            })
            .push_column_metadata(ColumnMetadata {
                column_schema: ColumnSchema::new(
                    "timestamp",
                    ConcreteDataType::timestamp_millisecond_datatype(),
                    false,
                ),
                semantic_type: SemanticType::Timestamp,
                column_id: 2,
            })
            .primary_key(vec![0]);

        builder.build().unwrap()
    }

    #[test]
    fn test_bulk_part_batch_iter() {
        let record_batch = new_test_batch(
            new_test_schema(),
            &["key1", "key2", "key3"],
            &[11, 12, 13],
            &[1000, 2000, 3000],
            &[1, 2, 3],
        );
        let region_metadata = new_test_region_metadata();

        // No projection, no predicate.
        let context = Arc::new(
            BulkIterContext::new(Arc::new(region_metadata.clone()), None, None, false).unwrap(),
        );

        // Without a sequence filter the whole batch comes back.
        let iter =
            BulkPartBatchIter::from_single(record_batch.clone(), context.clone(), None, 0, None);
        let result: Vec<_> = iter.map(|rb| rb.unwrap()).collect();
        assert_eq!(1, result.len());
        assert_eq!(3, result[0].num_rows());
        assert_eq!(6, result[0].num_columns());

        // The sequence filter keeps only rows with sequence <= 2.
        let iter = BulkPartBatchIter::from_single(
            record_batch.clone(),
            context,
            Some(SequenceRange::LtEq { max: 2 }),
            0,
            None,
        );
        let result: Vec<_> = iter.map(|rb| rb.unwrap()).collect();
        assert_eq!(1, result.len());
        let expect_sequence = Arc::new(UInt64Array::from(vec![1, 2])) as ArrayRef;
        assert_eq!(
            &expect_sequence,
            result[0].column(result[0].num_columns() - 2)
        );
        assert_eq!(6, result[0].num_columns());

        // Projection plus a predicate on the tag column.
        let context = Arc::new(
            BulkIterContext::new(
                Arc::new(region_metadata),
                Some(&[0, 2]),
                Some(Predicate::new(vec![col("key1").eq(lit("key2"))])),
                false,
            )
            .unwrap(),
        );
        let iter =
            BulkPartBatchIter::from_single(record_batch.clone(), context.clone(), None, 0, None);
        let result: Vec<_> = iter.map(|rb| rb.unwrap()).collect();
        assert_eq!(1, result.len());
        assert_eq!(1, result[0].num_rows());
        assert_eq!(5, result[0].num_columns());
        let expect_sequence = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
        assert_eq!(
            &expect_sequence,
            result[0].column(result[0].num_columns() - 2)
        );
    }

    #[test]
    fn test_bulk_part_batch_iter_multiple_batches() {
        let schema = new_test_schema();
        let batch1 = new_test_batch(
            schema.clone(),
            &["key1", "key2"],
            &[11, 12],
            &[1000, 2000],
            &[1, 2],
        );
        let batch2 = new_test_batch(
            schema,
            &["key3", "key4", "key5"],
            &[13, 14, 15],
            &[3000, 4000, 5000],
            &[3, 4, 5],
        );

        // No projection, no predicate.
        let context = Arc::new(
            BulkIterContext::new(Arc::new(new_test_region_metadata()), None, None, false)
                .unwrap(),
        );

        let expect_batches = vec![batch1, batch2];
        let iter = BulkPartBatchIter::new(expect_batches.clone(), context.clone(), None, 0, None);

        // Without filters every batch is yielded unchanged and in order.
        let result: Vec<_> = iter.map(|rb| rb.unwrap()).collect();
        assert_eq!(expect_batches, result);
    }
}