1use std::collections::VecDeque;
16use std::time::Instant;
17
18use datatypes::arrow::array::BooleanArray;
19use datatypes::arrow::record_batch::RecordBatch;
20use mito_codec::row_converter::PrimaryKeyFilter;
21use parquet::arrow::ProjectionMask;
22use parquet::arrow::arrow_reader::ParquetRecordBatchReader;
23use snafu::ResultExt;
24use store_api::storage::SequenceRange;
25
26use crate::error::{self, ComputeArrowSnafu, DecodeArrowRowGroupSnafu};
27use crate::memtable::bulk::context::{BulkIterContext, BulkIterContextRef};
28use crate::memtable::bulk::part::EncodedBulkPart;
29use crate::memtable::bulk::row_group_reader::MemtableRowGroupReaderBuilder;
30use crate::memtable::{MemScanMetrics, MemScanMetricsData};
31use crate::metrics::{READ_ROWS_TOTAL, READ_STAGE_ELAPSED};
32use crate::sst::parquet::file_range::TagDecodeState;
33use crate::sst::parquet::flat_format::{primary_key_column_index, sequence_column_index};
34use crate::sst::parquet::prefilter::{CachedPrimaryKeyFilter, prefilter_flat_batch_by_primary_key};
35
/// Iterator that reads an encoded bulk part one row group at a time and
/// yields the record batches that survive filtering.
pub struct EncodedBulkPartIter {
    /// Shared iteration context; supplies the read format, the pre-filter
    /// mode and the primary key filter.
    context: BulkIterContextRef,
    /// Indices of the row groups that have not been opened yet, in read order.
    row_groups_to_read: VecDeque<usize>,
    /// Reader of the row group currently being consumed, if any.
    current_reader: Option<ParquetRecordBatchReader>,
    /// Builds a reader for a given row group index.
    builder: MemtableRowGroupReaderBuilder,
    /// Optional sequence range applied to every batch.
    sequence: Option<SequenceRange>,
    /// `skip_fields` flag computed for the current row group and forwarded to
    /// the filter evaluation.
    current_skip_fields: bool,
    /// Optional primary key filter built from the context.
    pk_filter: Option<CachedPrimaryKeyFilter>,
    /// Metrics accumulated locally while scanning.
    metrics: MemScanMetricsData,
    /// Optional shared metrics the local metrics are merged into (at most once).
    mem_scan_metrics: Option<MemScanMetrics>,
}
53
54impl EncodedBulkPartIter {
55 pub fn try_new(
57 encoded_part: &EncodedBulkPart,
58 context: BulkIterContextRef,
59 mut row_groups_to_read: VecDeque<usize>,
60 sequence: Option<SequenceRange>,
61 mem_scan_metrics: Option<MemScanMetrics>,
62 ) -> error::Result<Self> {
63 let parquet_meta = encoded_part.metadata().parquet_metadata.clone();
64 let data = encoded_part.data().clone();
65 let series_count = encoded_part.metadata().num_series as usize;
66
67 let projection_mask = ProjectionMask::roots(
68 parquet_meta.file_metadata().schema_descr(),
69 context.read_format().projection_indices().iter().copied(),
70 );
71 let builder =
72 MemtableRowGroupReaderBuilder::try_new(&context, projection_mask, parquet_meta, data)?;
73
74 let pk_filter = context.build_pk_filter();
76
77 let (init_reader, current_skip_fields) = match row_groups_to_read.pop_front() {
78 Some(first_row_group) => {
79 let skip_fields = context.pre_filter_mode().skip_fields();
80 let reader = builder.build_row_group_reader(first_row_group, None)?;
81 (Some(reader), skip_fields)
82 }
83 None => (None, false),
84 };
85
86 Ok(Self {
87 context,
88 row_groups_to_read,
89 current_reader: init_reader,
90 builder,
91 sequence,
92 current_skip_fields,
93 pk_filter,
94 metrics: MemScanMetricsData {
95 total_series: series_count,
96 ..Default::default()
97 },
98 mem_scan_metrics,
99 })
100 }
101
102 fn report_mem_scan_metrics(&mut self) {
103 if let Some(mem_scan_metrics) = self.mem_scan_metrics.take() {
104 mem_scan_metrics.merge_inner(&self.metrics);
105 }
106 }
107
108 pub(crate) fn next_record_batch(&mut self) -> error::Result<Option<RecordBatch>> {
110 let start = Instant::now();
111
112 let Some(current) = &mut self.current_reader else {
113 self.metrics.scan_cost += start.elapsed();
115 return Ok(None);
116 };
117
118 for batch in current {
119 let batch = batch.context(DecodeArrowRowGroupSnafu)?;
120 if let Some(batch) = apply_combined_filters(
121 &self.context,
122 &self.sequence,
123 batch,
124 self.current_skip_fields,
125 self.pk_filter
126 .as_mut()
127 .map(|f| f as &mut dyn PrimaryKeyFilter),
128 &mut self.metrics,
129 )? {
130 self.metrics.num_batches += 1;
132 self.metrics.num_rows += batch.num_rows();
133 self.metrics.scan_cost += start.elapsed();
134 return Ok(Some(batch));
135 }
136 }
137
138 while let Some(next_row_group) = self.row_groups_to_read.pop_front() {
140 self.current_skip_fields = self.context.pre_filter_mode().skip_fields();
142
143 let next_reader = self.builder.build_row_group_reader(next_row_group, None)?;
144 let current = self.current_reader.insert(next_reader);
145
146 for batch in current {
147 let batch = batch.context(DecodeArrowRowGroupSnafu)?;
148 if let Some(batch) = apply_combined_filters(
149 &self.context,
150 &self.sequence,
151 batch,
152 self.current_skip_fields,
153 self.pk_filter
154 .as_mut()
155 .map(|f| f as &mut dyn PrimaryKeyFilter),
156 &mut self.metrics,
157 )? {
158 self.metrics.num_batches += 1;
160 self.metrics.num_rows += batch.num_rows();
161 self.metrics.scan_cost += start.elapsed();
162 return Ok(Some(batch));
163 }
164 }
165 }
166
167 self.metrics.scan_cost += start.elapsed();
168 Ok(None)
169 }
170}
171
172impl Iterator for EncodedBulkPartIter {
173 type Item = error::Result<RecordBatch>;
174
175 fn next(&mut self) -> Option<Self::Item> {
176 let result = self.next_record_batch().transpose();
177
178 if result.is_none() {
180 self.report_mem_scan_metrics();
181 }
182
183 result
184 }
185}
186
187impl Drop for EncodedBulkPartIter {
188 fn drop(&mut self) {
189 common_telemetry::debug!(
190 "EncodedBulkPartIter region: {}, metrics: total_series={}, num_rows={}, num_batches={}, scan_cost={:?}, prefilter_cost={:?}, prefilter_rows_filtered={}",
191 self.context.region_id(),
192 self.metrics.total_series,
193 self.metrics.num_rows,
194 self.metrics.num_batches,
195 self.metrics.scan_cost,
196 self.metrics.prefilter_cost,
197 self.metrics.prefilter_rows_filtered
198 );
199
200 self.report_mem_scan_metrics();
202
203 READ_ROWS_TOTAL
204 .with_label_values(&["bulk_memtable"])
205 .inc_by(self.metrics.num_rows as u64);
206 READ_STAGE_ELAPSED
207 .with_label_values(&["scan_memtable"])
208 .observe(self.metrics.scan_cost.as_secs_f64());
209 }
210}
211
/// Iterator over in-memory record batches of a bulk part that projects and
/// filters each batch before yielding it.
pub struct BulkPartBatchIter {
    /// Batches that have not been processed yet, consumed from the front.
    batches: VecDeque<RecordBatch>,
    /// Shared iteration context; supplies the read format, the pre-filter
    /// mode and the primary key filter.
    context: BulkIterContextRef,
    /// Optional sequence range applied to every batch.
    sequence: Option<SequenceRange>,
    /// Optional primary key filter built from the context.
    pk_filter: Option<CachedPrimaryKeyFilter>,
    /// Metrics accumulated locally while scanning.
    metrics: MemScanMetricsData,
    /// Optional shared metrics the local metrics are merged into (at most once).
    mem_scan_metrics: Option<MemScanMetrics>,
}
229
230impl BulkPartBatchIter {
231 pub fn new(
233 batches: Vec<RecordBatch>,
234 context: BulkIterContextRef,
235 sequence: Option<SequenceRange>,
236 series_count: usize,
237 mem_scan_metrics: Option<MemScanMetrics>,
238 ) -> Self {
239 let pk_filter = context.build_pk_filter();
240
241 Self {
242 batches: VecDeque::from(batches),
243 context,
244 sequence,
245 pk_filter,
246 metrics: MemScanMetricsData {
247 total_series: series_count,
248 ..Default::default()
249 },
250 mem_scan_metrics,
251 }
252 }
253
254 pub fn from_single(
256 record_batch: RecordBatch,
257 context: BulkIterContextRef,
258 sequence: Option<SequenceRange>,
259 series_count: usize,
260 mem_scan_metrics: Option<MemScanMetrics>,
261 ) -> Self {
262 Self::new(
263 vec![record_batch],
264 context,
265 sequence,
266 series_count,
267 mem_scan_metrics,
268 )
269 }
270
271 fn report_mem_scan_metrics(&mut self) {
272 if let Some(mem_scan_metrics) = self.mem_scan_metrics.take() {
273 mem_scan_metrics.merge_inner(&self.metrics);
274 }
275 }
276
277 fn apply_projection(&self, record_batch: RecordBatch) -> error::Result<RecordBatch> {
279 let projection_indices = self.context.read_format().projection_indices();
280 if projection_indices.len() == record_batch.num_columns() {
281 return Ok(record_batch);
282 }
283
284 record_batch
285 .project(projection_indices)
286 .context(ComputeArrowSnafu)
287 }
288
289 fn process_batch(&mut self, record_batch: RecordBatch) -> error::Result<Option<RecordBatch>> {
290 let start = Instant::now();
291
292 let projected_batch = self.apply_projection(record_batch)?;
294
295 let skip_fields = self.context.pre_filter_mode().skip_fields();
297
298 let Some(filtered_batch) = apply_combined_filters(
299 &self.context,
300 &self.sequence,
301 projected_batch,
302 skip_fields,
303 self.pk_filter
304 .as_mut()
305 .map(|f| f as &mut dyn PrimaryKeyFilter),
306 &mut self.metrics,
307 )?
308 else {
309 self.metrics.scan_cost += start.elapsed();
310 return Ok(None);
311 };
312
313 self.metrics.num_batches += 1;
315 self.metrics.num_rows += filtered_batch.num_rows();
316 self.metrics.scan_cost += start.elapsed();
317
318 Ok(Some(filtered_batch))
319 }
320}
321
322impl Iterator for BulkPartBatchIter {
323 type Item = error::Result<RecordBatch>;
324
325 fn next(&mut self) -> Option<Self::Item> {
326 while let Some(batch) = self.batches.pop_front() {
328 match self.process_batch(batch) {
329 Ok(Some(result)) => return Some(Ok(result)),
330 Ok(None) => continue, Err(e) => {
332 self.report_mem_scan_metrics();
333 return Some(Err(e));
334 }
335 }
336 }
337
338 self.report_mem_scan_metrics();
340 None
341 }
342}
343
344impl Drop for BulkPartBatchIter {
345 fn drop(&mut self) {
346 common_telemetry::debug!(
347 "BulkPartBatchIter region: {}, metrics: total_series={}, num_rows={}, num_batches={}, scan_cost={:?}, prefilter_cost={:?}, prefilter_rows_filtered={}",
348 self.context.region_id(),
349 self.metrics.total_series,
350 self.metrics.num_rows,
351 self.metrics.num_batches,
352 self.metrics.scan_cost,
353 self.metrics.prefilter_cost,
354 self.metrics.prefilter_rows_filtered
355 );
356
357 self.report_mem_scan_metrics();
359
360 READ_ROWS_TOTAL
361 .with_label_values(&["bulk_memtable"])
362 .inc_by(self.metrics.num_rows as u64);
363 READ_STAGE_ELAPSED
364 .with_label_values(&["scan_memtable"])
365 .observe(self.metrics.scan_cost.as_secs_f64());
366 }
367}
368
369fn apply_combined_filters(
375 context: &BulkIterContext,
376 sequence: &Option<SequenceRange>,
377 record_batch: RecordBatch,
378 skip_fields: bool,
379 pk_filter: Option<&mut dyn PrimaryKeyFilter>,
380 metrics: &mut MemScanMetricsData,
381) -> error::Result<Option<RecordBatch>> {
382 let has_pk_prefilter = pk_filter.is_some();
384 let record_batch = if let Some(pk_filter) = pk_filter {
385 let rows_before = record_batch.num_rows();
386 let prefilter_start = Instant::now();
387 let pk_col_idx = primary_key_column_index(record_batch.num_columns());
388 match prefilter_flat_batch_by_primary_key(record_batch, pk_col_idx, pk_filter)? {
389 Some(batch) => {
390 metrics.prefilter_cost += prefilter_start.elapsed();
391 metrics.prefilter_rows_filtered += rows_before - batch.num_rows();
392 batch
393 }
394 None => {
395 metrics.prefilter_cost += prefilter_start.elapsed();
396 metrics.prefilter_rows_filtered += rows_before;
397 return Ok(None);
398 }
399 }
400 } else {
401 record_batch
402 };
403
404 let record_batch = context.read_format().convert_batch(record_batch, None)?;
406
407 let num_rows = record_batch.num_rows();
408 let mut combined_filter = None;
409 let mut tag_decode_state = TagDecodeState::new();
410
411 if !context.base.filters.is_empty() {
413 let predicate_mask = context.base.compute_filter_mask_flat(
414 &record_batch,
415 skip_fields,
416 has_pk_prefilter,
417 &mut tag_decode_state,
418 )?;
419 let Some(mask) = predicate_mask else {
421 return Ok(None);
422 };
423 combined_filter = Some(BooleanArray::from(mask));
424 }
425
426 if let Some(sequence) = sequence {
428 let sequence_column =
429 record_batch.column(sequence_column_index(record_batch.num_columns()));
430 let sequence_filter = sequence
431 .filter(&sequence_column)
432 .context(ComputeArrowSnafu)?;
433 combined_filter = match combined_filter {
435 None => Some(sequence_filter),
436 Some(existing_filter) => {
437 let and_result = datatypes::arrow::compute::and(&existing_filter, &sequence_filter)
438 .context(ComputeArrowSnafu)?;
439 Some(and_result)
440 }
441 };
442 }
443
444 let Some(filter_array) = combined_filter else {
446 return Ok(Some(record_batch));
448 };
449 let select_count = filter_array.true_count();
450 if select_count == 0 {
451 return Ok(None);
452 }
453 if select_count == num_rows {
454 return Ok(Some(record_batch));
455 }
456 let filtered_batch =
457 datatypes::arrow::compute::filter_record_batch(&record_batch, &filter_array)
458 .context(ComputeArrowSnafu)?;
459
460 Ok(Some(filtered_batch))
461}
462
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use api::v1::SemanticType;
    use datafusion_expr::{col, lit};
    use datatypes::arrow::array::{
        ArrayRef, BinaryArray, DictionaryArray, Int64Array, StringArray,
        TimestampMillisecondArray, UInt8Array, UInt32Array, UInt64Array,
    };
    use datatypes::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
    use datatypes::data_type::ConcreteDataType;
    use datatypes::schema::ColumnSchema;
    use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder};
    use store_api::storage::RegionId;
    use table::predicate::Predicate;

    use super::*;
    use crate::memtable::bulk::context::BulkIterContext;
    use crate::test_util::sst_util::new_primary_key;

    /// Arrow schema shared by the tests: one tag, one field, the timestamp
    /// and the three internal columns.
    fn flat_schema() -> Arc<Schema> {
        Arc::new(Schema::new(vec![
            Field::new("key1", DataType::Utf8, false),
            Field::new("field1", DataType::Int64, false),
            Field::new(
                "timestamp",
                DataType::Timestamp(TimeUnit::Millisecond, None),
                false,
            ),
            Field::new(
                "__primary_key",
                DataType::Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Binary)),
                false,
            ),
            Field::new("__sequence", DataType::UInt64, false),
            Field::new("__op_type", DataType::UInt8, false),
        ]))
    }

    /// Builds a record batch for [`flat_schema`] with one row per entry of
    /// `keys`. Every row gets its own primary key dictionary entry and an
    /// op type of `1`.
    fn flat_batch(
        schema: Arc<Schema>,
        keys: &[&str],
        fields: Vec<i64>,
        timestamps: Vec<i64>,
        sequences: Vec<u64>,
    ) -> RecordBatch {
        let num_rows = keys.len();

        let encoded_keys: Vec<_> = keys.iter().map(|key| new_primary_key(&[*key])).collect();
        let dict_values = Arc::new(BinaryArray::from_iter_values(
            encoded_keys.iter().map(|pk| pk.as_slice()),
        ));
        let dict_keys = UInt32Array::from(
            (0..u32::try_from(num_rows).expect("test batches are small")).collect::<Vec<u32>>(),
        );
        let primary_key = Arc::new(DictionaryArray::new(dict_keys, dict_values));

        let columns: Vec<ArrayRef> = vec![
            Arc::new(StringArray::from_iter_values(keys.iter().copied())) as ArrayRef,
            Arc::new(Int64Array::from(fields)) as ArrayRef,
            Arc::new(TimestampMillisecondArray::from(timestamps)) as ArrayRef,
            primary_key as ArrayRef,
            Arc::new(UInt64Array::from(sequences)) as ArrayRef,
            Arc::new(UInt8Array::from(vec![1u8; num_rows])) as ArrayRef,
        ];

        RecordBatch::try_new(schema, columns).unwrap()
    }

    /// Region metadata matching [`flat_schema`]: tag `key1` (primary key),
    /// field `field1` and time index `timestamp`.
    fn test_region_metadata() -> RegionMetadata {
        let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
        builder
            .push_column_metadata(ColumnMetadata {
                column_schema: ColumnSchema::new(
                    "key1",
                    ConcreteDataType::string_datatype(),
                    false,
                ),
                semantic_type: SemanticType::Tag,
                column_id: 0,
            })
            .push_column_metadata(ColumnMetadata {
                column_schema: ColumnSchema::new(
                    "field1",
                    ConcreteDataType::int64_datatype(),
                    false,
                ),
                semantic_type: SemanticType::Field,
                column_id: 1,
            })
            .push_column_metadata(ColumnMetadata {
                column_schema: ColumnSchema::new(
                    "timestamp",
                    ConcreteDataType::timestamp_millisecond_datatype(),
                    false,
                ),
                semantic_type: SemanticType::Timestamp,
                column_id: 2,
            })
            .primary_key(vec![0]);

        builder.build().unwrap()
    }

    #[test]
    fn test_bulk_part_batch_iter() {
        let record_batch = flat_batch(
            flat_schema(),
            &["key1", "key2", "key3"],
            vec![11, 12, 13],
            vec![1000, 2000, 3000],
            vec![1, 2, 3],
        );
        let region_metadata = Arc::new(test_region_metadata());

        // No projection, no predicate, no sequence filter: everything passes.
        let context = Arc::new(
            BulkIterContext::new(region_metadata.clone(), None, None, false).unwrap(),
        );
        let iter =
            BulkPartBatchIter::from_single(record_batch.clone(), context.clone(), None, 0, None);
        let result: Vec<_> = iter.map(|rb| rb.unwrap()).collect();
        assert_eq!(1, result.len());
        assert_eq!(3, result[0].num_rows());
        assert_eq!(6, result[0].num_columns());

        // Sequence filter keeps only rows with sequence <= 2.
        let iter = BulkPartBatchIter::from_single(
            record_batch.clone(),
            context,
            Some(SequenceRange::LtEq { max: 2 }),
            0,
            None,
        );
        let result: Vec<_> = iter.map(|rb| rb.unwrap()).collect();
        assert_eq!(1, result.len());
        let expect_sequence = Arc::new(UInt64Array::from(vec![1, 2])) as ArrayRef;
        assert_eq!(
            &expect_sequence,
            result[0].column(result[0].num_columns() - 2)
        );
        assert_eq!(6, result[0].num_columns());

        // Projection plus a predicate on the tag column.
        let context = Arc::new(
            BulkIterContext::new(
                region_metadata,
                Some(&[0, 2]),
                Some(Predicate::new(vec![col("key1").eq(lit("key2"))])),
                false,
            )
            .unwrap(),
        );
        let iter = BulkPartBatchIter::from_single(record_batch, context, None, 0, None);
        let result: Vec<_> = iter.map(|rb| rb.unwrap()).collect();
        assert_eq!(1, result.len());
        assert_eq!(1, result[0].num_rows());
        assert_eq!(5, result[0].num_columns());
        let expect_sequence = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
        assert_eq!(
            &expect_sequence,
            result[0].column(result[0].num_columns() - 2)
        );
    }

    #[test]
    fn test_bulk_part_batch_iter_multiple_batches() {
        let schema = flat_schema();

        let batch1 = flat_batch(
            schema.clone(),
            &["key1", "key2"],
            vec![11, 12],
            vec![1000, 2000],
            vec![1, 2],
        );
        let batch2 = flat_batch(
            schema,
            &["key3", "key4", "key5"],
            vec![13, 14, 15],
            vec![3000, 4000, 5000],
            vec![3, 4, 5],
        );

        let context = Arc::new(
            BulkIterContext::new(Arc::new(test_region_metadata()), None, None, false).unwrap(),
        );

        // Without any filter the iterator must yield the input batches as is.
        let expect_batches = vec![batch1, batch2];
        let iter = BulkPartBatchIter::new(expect_batches.clone(), context, None, 0, None);

        let result: Vec<_> = iter.map(|rb| rb.unwrap()).collect();
        assert_eq!(expect_batches, result);
    }
}