1use std::collections::{BinaryHeap, HashMap, VecDeque};
18use std::fmt;
19use std::pin::Pin;
20use std::sync::{Arc, Mutex};
21use std::task::{Context, Poll};
22use std::time::{Duration, Instant};
23
24use async_stream::try_stream;
25use common_telemetry::tracing;
26use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder, Time};
27use datatypes::arrow::record_batch::RecordBatch;
28use datatypes::timestamp::timestamp_array_to_primitive;
29use futures::Stream;
30use prometheus::IntGauge;
31use smallvec::SmallVec;
32use snafu::OptionExt;
33use store_api::storage::RegionId;
34
35use crate::error::{Result, UnexpectedSnafu};
36use crate::memtable::MemScanMetrics;
37use crate::metrics::{
38 IN_PROGRESS_SCAN, PRECISE_FILTER_ROWS_TOTAL, READ_BATCHES_RETURN, READ_ROW_GROUPS_TOTAL,
39 READ_ROWS_IN_ROW_GROUP_TOTAL, READ_ROWS_RETURN, READ_STAGE_ELAPSED,
40};
41use crate::read::dedup::{DedupMetrics, DedupMetricsReport};
42use crate::read::merge::{MergeMetrics, MergeMetricsReport};
43use crate::read::pruner::PartitionPruner;
44use crate::read::range::{RangeMeta, RowGroupIndex};
45use crate::read::scan_region::StreamContext;
46use crate::read::{Batch, BoxedBatchStream, BoxedRecordBatchStream, ScannerMetrics, Source};
47use crate::sst::file::{FileTimeRange, RegionFileId};
48use crate::sst::index::bloom_filter::applier::BloomFilterIndexApplyMetrics;
49use crate::sst::index::fulltext_index::applier::FulltextIndexApplyMetrics;
50use crate::sst::index::inverted_index::applier::InvertedIndexApplyMetrics;
51use crate::sst::parquet::DEFAULT_ROW_GROUP_SIZE;
52use crate::sst::parquet::file_range::FileRange;
53use crate::sst::parquet::flat_format::time_index_column_index;
54use crate::sst::parquet::reader::{MetadataCacheMetrics, ReaderFilterMetrics, ReaderMetrics};
55use crate::sst::parquet::row_group::ParquetFetchMetrics;
56
/// Scan statistics accumulated for a single file.
///
/// Instances for the same file are combined with
/// [`FileScanMetrics::merge_from`], which sums every field.
#[derive(Default, Clone)]
pub struct FileScanMetrics {
    /// Accumulated range count for the file.
    pub num_ranges: usize,
    /// Accumulated row count for the file.
    pub num_rows: usize,
    /// Accumulated part-building time for the file.
    pub build_part_cost: Duration,
    /// Accumulated reader-building time for the file.
    pub build_reader_cost: Duration,
    /// Accumulated scanning time for the file.
    pub scan_cost: Duration,
}

impl fmt::Debug for FileScanMetrics {
    /// Renders the metrics as a compact JSON-like object.
    ///
    /// `build_part_cost` is always emitted; every other field is emitted only
    /// when it is non-zero.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            num_ranges,
            num_rows,
            build_part_cost,
            build_reader_cost,
            scan_cost,
        } = self;

        write!(f, "{{\"build_part_cost\":\"{:?}\"", build_part_cost)?;

        if *num_ranges != 0 {
            write!(f, ", \"num_ranges\":{}", num_ranges)?;
        }
        if *num_rows != 0 {
            write!(f, ", \"num_rows\":{}", num_rows)?;
        }
        if *build_reader_cost != Duration::ZERO {
            write!(f, ", \"build_reader_cost\":\"{:?}\"", build_reader_cost)?;
        }
        if *scan_cost != Duration::ZERO {
            write!(f, ", \"scan_cost\":\"{:?}\"", scan_cost)?;
        }

        write!(f, "}}")
    }
}

impl FileScanMetrics {
    /// Adds every counter and duration of `other` onto `self`.
    pub(crate) fn merge_from(&mut self, other: &FileScanMetrics) {
        let FileScanMetrics {
            num_ranges,
            num_rows,
            build_part_cost,
            build_reader_cost,
            scan_cost,
        } = other;

        self.build_part_cost += *build_part_cost;
        self.build_reader_cost += *build_reader_cost;
        self.scan_cost += *scan_cost;
        self.num_ranges += *num_ranges;
        self.num_rows += *num_rows;
    }
}
107
/// Aggregated scan metrics for one partition.
///
/// Values are accumulated through the `merge_*`/`set_*` helpers of this type
/// and rendered as a JSON-like object by its `Debug` implementation.
#[derive(Default)]
pub(crate) struct ScanMetricsSet {
    /// Accumulated by `with_prepare_scan_cost`; reported as the `prepare_scan` stage.
    prepare_scan_cost: Duration,
    /// Reported as the `build_reader` stage.
    build_reader_cost: Duration,
    /// Scan time merged from [`ScannerMetrics`]; reported as the `scan` stage.
    scan_cost: Duration,
    /// Yield time merged from [`ScannerMetrics`]; reported as the `yield` stage.
    yield_cost: Duration,
    /// Optional timer whose value is read as nanoseconds when reporting the `convert` stage.
    convert_cost: Option<Time>,
    /// Reported as the `total` stage.
    total_cost: Duration,
    /// Row count merged from [`ScannerMetrics`].
    num_rows: usize,
    /// Batch count merged from [`ScannerMetrics`].
    num_batches: usize,
    /// Number of memtable ranges.
    num_mem_ranges: usize,
    /// Number of file ranges.
    num_file_ranges: usize,

    // Memtable scan metrics; printed only when non-zero.
    mem_scan_cost: Duration,
    mem_rows: usize,
    mem_batches: usize,
    mem_series: usize,

    // SST metrics merged from `ReaderMetrics` (see `merge_reader_metrics`).
    /// Sum of `ReaderMetrics::build_cost`.
    build_parts_cost: Duration,
    /// Sum of `ReaderMetrics::scan_cost`.
    sst_scan_cost: Duration,
    // Row group counters: total, then the number filtered by each kind of index.
    rg_total: usize,
    rg_fulltext_filtered: usize,
    rg_inverted_filtered: usize,
    rg_minmax_filtered: usize,
    rg_bloom_filtered: usize,
    rg_vector_filtered: usize,
    /// Sum of `ReaderFilterMetrics::rows_total`.
    rows_before_filter: usize,
    // Row counters per filter kind.
    rows_fulltext_filtered: usize,
    rows_inverted_filtered: usize,
    rows_bloom_filtered: usize,
    rows_vector_filtered: usize,
    rows_vector_selected: usize,
    rows_precise_filtered: usize,
    // Cache hit/miss counters per index kind and for the pruner.
    fulltext_index_cache_hit: usize,
    fulltext_index_cache_miss: usize,
    inverted_index_cache_hit: usize,
    inverted_index_cache_miss: usize,
    bloom_filter_cache_hit: usize,
    bloom_filter_cache_miss: usize,
    minmax_cache_hit: usize,
    minmax_cache_miss: usize,
    pruner_cache_hit: usize,
    pruner_cache_miss: usize,
    pruner_prune_cost: Duration,
    /// Sum of `ReaderMetrics::num_record_batches`.
    num_sst_record_batches: usize,
    /// Sum of `ReaderMetrics::num_batches`.
    num_sst_batches: usize,
    /// Sum of `ReaderMetrics::num_rows`.
    num_sst_rows: usize,

    /// Always printed as `first_poll`.
    first_poll: Duration,

    // Metrics accumulated from `SeriesDistributorMetrics` (see `set_distributor_metrics`).
    num_series_send_timeout: usize,
    num_series_send_full: usize,
    num_distributor_rows: usize,
    num_distributor_batches: usize,
    distributor_scan_cost: Duration,
    distributor_yield_cost: Duration,
    distributor_divider_cost: Duration,

    /// Merge metrics; printed only when their `scan_cost` is non-zero.
    merge_metrics: MergeMetrics,
    /// Dedup metrics; printed only when their `dedup_cost` is non-zero.
    dedup_metrics: DedupMetrics,

    /// Always printed as `stream_eof`.
    stream_eof: bool,

    // Optional detailed metrics; each is printed only when present and non-empty.
    inverted_index_apply_metrics: Option<InvertedIndexApplyMetrics>,
    bloom_filter_apply_metrics: Option<BloomFilterIndexApplyMetrics>,
    fulltext_index_apply_metrics: Option<FulltextIndexApplyMetrics>,
    fetch_metrics: Option<ParquetFetchMetrics>,
    metadata_cache_metrics: Option<MetadataCacheMetrics>,
    /// Per-file metrics; only the ten most expensive files are printed.
    per_file_metrics: Option<HashMap<RegionFileId, FileScanMetrics>>,

    /// Running sum of `ReaderMetrics::metadata_mem_size` (signed, so it may decrease).
    build_ranges_mem_size: isize,
    /// Highest value `build_ranges_mem_size` has reached.
    build_ranges_peak_mem_size: isize,
    /// Running sum of `ReaderMetrics::num_range_builders` (signed, so it may decrease).
    num_range_builders: isize,
    /// Highest value `num_range_builders` has reached.
    num_peak_range_builders: isize,
}
251
252struct CompareCostReverse<'a> {
255 total_cost: Duration,
256 file_id: RegionFileId,
257 metrics: &'a FileScanMetrics,
258}
259
260impl Ord for CompareCostReverse<'_> {
261 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
262 other.total_cost.cmp(&self.total_cost)
264 }
265}
266
267impl PartialOrd for CompareCostReverse<'_> {
268 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
269 Some(self.cmp(other))
270 }
271}
272
273impl Eq for CompareCostReverse<'_> {}
274
275impl PartialEq for CompareCostReverse<'_> {
276 fn eq(&self, other: &Self) -> bool {
277 self.total_cost == other.total_cost
278 }
279}
280
281impl fmt::Debug for ScanMetricsSet {
282 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
283 let ScanMetricsSet {
284 prepare_scan_cost,
285 build_reader_cost,
286 scan_cost,
287 yield_cost,
288 convert_cost,
289 total_cost,
290 num_rows,
291 num_batches,
292 num_mem_ranges,
293 num_file_ranges,
294 build_parts_cost,
295 sst_scan_cost,
296 rg_total,
297 rg_fulltext_filtered,
298 rg_inverted_filtered,
299 rg_minmax_filtered,
300 rg_bloom_filtered,
301 rg_vector_filtered,
302 rows_before_filter,
303 rows_fulltext_filtered,
304 rows_inverted_filtered,
305 rows_bloom_filtered,
306 rows_vector_filtered,
307 rows_vector_selected,
308 rows_precise_filtered,
309 fulltext_index_cache_hit,
310 fulltext_index_cache_miss,
311 inverted_index_cache_hit,
312 inverted_index_cache_miss,
313 bloom_filter_cache_hit,
314 bloom_filter_cache_miss,
315 minmax_cache_hit,
316 minmax_cache_miss,
317 pruner_cache_hit,
318 pruner_cache_miss,
319 pruner_prune_cost,
320 num_sst_record_batches,
321 num_sst_batches,
322 num_sst_rows,
323 first_poll,
324 num_series_send_timeout,
325 num_series_send_full,
326 num_distributor_rows,
327 num_distributor_batches,
328 distributor_scan_cost,
329 distributor_yield_cost,
330 distributor_divider_cost,
331 merge_metrics,
332 dedup_metrics,
333 stream_eof,
334 mem_scan_cost,
335 mem_rows,
336 mem_batches,
337 mem_series,
338 inverted_index_apply_metrics,
339 bloom_filter_apply_metrics,
340 fulltext_index_apply_metrics,
341 fetch_metrics,
342 metadata_cache_metrics,
343 per_file_metrics,
344 build_ranges_mem_size: _,
345 build_ranges_peak_mem_size,
346 num_range_builders: _,
347 num_peak_range_builders,
348 } = self;
349
350 write!(
352 f,
353 "{{\"prepare_scan_cost\":\"{prepare_scan_cost:?}\", \
354 \"build_reader_cost\":\"{build_reader_cost:?}\", \
355 \"scan_cost\":\"{scan_cost:?}\", \
356 \"yield_cost\":\"{yield_cost:?}\", \
357 \"total_cost\":\"{total_cost:?}\", \
358 \"num_rows\":{num_rows}, \
359 \"num_batches\":{num_batches}, \
360 \"num_mem_ranges\":{num_mem_ranges}, \
361 \"num_file_ranges\":{num_file_ranges}, \
362 \"build_parts_cost\":\"{build_parts_cost:?}\", \
363 \"sst_scan_cost\":\"{sst_scan_cost:?}\", \
364 \"rg_total\":{rg_total}, \
365 \"rows_before_filter\":{rows_before_filter}, \
366 \"num_sst_record_batches\":{num_sst_record_batches}, \
367 \"num_sst_batches\":{num_sst_batches}, \
368 \"num_sst_rows\":{num_sst_rows}, \
369 \"first_poll\":\"{first_poll:?}\""
370 )?;
371
372 if let Some(time) = convert_cost {
374 let duration = Duration::from_nanos(time.value() as u64);
375 write!(f, ", \"convert_cost\":\"{duration:?}\"")?;
376 }
377
378 if *rg_fulltext_filtered > 0 {
380 write!(f, ", \"rg_fulltext_filtered\":{rg_fulltext_filtered}")?;
381 }
382 if *rg_inverted_filtered > 0 {
383 write!(f, ", \"rg_inverted_filtered\":{rg_inverted_filtered}")?;
384 }
385 if *rg_minmax_filtered > 0 {
386 write!(f, ", \"rg_minmax_filtered\":{rg_minmax_filtered}")?;
387 }
388 if *rg_bloom_filtered > 0 {
389 write!(f, ", \"rg_bloom_filtered\":{rg_bloom_filtered}")?;
390 }
391 if *rg_vector_filtered > 0 {
392 write!(f, ", \"rg_vector_filtered\":{rg_vector_filtered}")?;
393 }
394 if *rows_fulltext_filtered > 0 {
395 write!(f, ", \"rows_fulltext_filtered\":{rows_fulltext_filtered}")?;
396 }
397 if *rows_inverted_filtered > 0 {
398 write!(f, ", \"rows_inverted_filtered\":{rows_inverted_filtered}")?;
399 }
400 if *rows_bloom_filtered > 0 {
401 write!(f, ", \"rows_bloom_filtered\":{rows_bloom_filtered}")?;
402 }
403 if *rows_vector_filtered > 0 {
404 write!(f, ", \"rows_vector_filtered\":{rows_vector_filtered}")?;
405 }
406 if *rows_vector_selected > 0 {
407 write!(f, ", \"rows_vector_selected\":{rows_vector_selected}")?;
408 }
409 if *rows_precise_filtered > 0 {
410 write!(f, ", \"rows_precise_filtered\":{rows_precise_filtered}")?;
411 }
412 if *fulltext_index_cache_hit > 0 {
413 write!(
414 f,
415 ", \"fulltext_index_cache_hit\":{fulltext_index_cache_hit}"
416 )?;
417 }
418 if *fulltext_index_cache_miss > 0 {
419 write!(
420 f,
421 ", \"fulltext_index_cache_miss\":{fulltext_index_cache_miss}"
422 )?;
423 }
424 if *inverted_index_cache_hit > 0 {
425 write!(
426 f,
427 ", \"inverted_index_cache_hit\":{inverted_index_cache_hit}"
428 )?;
429 }
430 if *inverted_index_cache_miss > 0 {
431 write!(
432 f,
433 ", \"inverted_index_cache_miss\":{inverted_index_cache_miss}"
434 )?;
435 }
436 if *bloom_filter_cache_hit > 0 {
437 write!(f, ", \"bloom_filter_cache_hit\":{bloom_filter_cache_hit}")?;
438 }
439 if *bloom_filter_cache_miss > 0 {
440 write!(f, ", \"bloom_filter_cache_miss\":{bloom_filter_cache_miss}")?;
441 }
442 if *minmax_cache_hit > 0 {
443 write!(f, ", \"minmax_cache_hit\":{minmax_cache_hit}")?;
444 }
445 if *minmax_cache_miss > 0 {
446 write!(f, ", \"minmax_cache_miss\":{minmax_cache_miss}")?;
447 }
448 if *pruner_cache_hit > 0 {
449 write!(f, ", \"pruner_cache_hit\":{pruner_cache_hit}")?;
450 }
451 if *pruner_cache_miss > 0 {
452 write!(f, ", \"pruner_cache_miss\":{pruner_cache_miss}")?;
453 }
454 if !pruner_prune_cost.is_zero() {
455 write!(f, ", \"pruner_prune_cost\":\"{pruner_prune_cost:?}\"")?;
456 }
457
458 if *num_series_send_timeout > 0 {
460 write!(f, ", \"num_series_send_timeout\":{num_series_send_timeout}")?;
461 }
462 if *num_series_send_full > 0 {
463 write!(f, ", \"num_series_send_full\":{num_series_send_full}")?;
464 }
465 if *num_distributor_rows > 0 {
466 write!(f, ", \"num_distributor_rows\":{num_distributor_rows}")?;
467 }
468 if *num_distributor_batches > 0 {
469 write!(f, ", \"num_distributor_batches\":{num_distributor_batches}")?;
470 }
471 if !distributor_scan_cost.is_zero() {
472 write!(
473 f,
474 ", \"distributor_scan_cost\":\"{distributor_scan_cost:?}\""
475 )?;
476 }
477 if !distributor_yield_cost.is_zero() {
478 write!(
479 f,
480 ", \"distributor_yield_cost\":\"{distributor_yield_cost:?}\""
481 )?;
482 }
483 if !distributor_divider_cost.is_zero() {
484 write!(
485 f,
486 ", \"distributor_divider_cost\":\"{distributor_divider_cost:?}\""
487 )?;
488 }
489
490 if *mem_rows > 0 {
492 write!(f, ", \"mem_rows\":{mem_rows}")?;
493 }
494 if *mem_batches > 0 {
495 write!(f, ", \"mem_batches\":{mem_batches}")?;
496 }
497 if *mem_series > 0 {
498 write!(f, ", \"mem_series\":{mem_series}")?;
499 }
500 if !mem_scan_cost.is_zero() {
501 write!(f, ", \"mem_scan_cost\":\"{mem_scan_cost:?}\"")?;
502 }
503
504 if let Some(metrics) = inverted_index_apply_metrics
506 && !metrics.is_empty()
507 {
508 write!(f, ", \"inverted_index_apply_metrics\":{:?}", metrics)?;
509 }
510 if let Some(metrics) = bloom_filter_apply_metrics
511 && !metrics.is_empty()
512 {
513 write!(f, ", \"bloom_filter_apply_metrics\":{:?}", metrics)?;
514 }
515 if let Some(metrics) = fulltext_index_apply_metrics
516 && !metrics.is_empty()
517 {
518 write!(f, ", \"fulltext_index_apply_metrics\":{:?}", metrics)?;
519 }
520 if let Some(metrics) = fetch_metrics
521 && !metrics.is_empty()
522 {
523 write!(f, ", \"fetch_metrics\":{:?}", metrics)?;
524 }
525 if let Some(metrics) = metadata_cache_metrics
526 && !metrics.is_empty()
527 {
528 write!(f, ", \"metadata_cache_metrics\":{:?}", metrics)?;
529 }
530
531 if !merge_metrics.scan_cost.is_zero() {
533 write!(f, ", \"merge_metrics\":{:?}", merge_metrics)?;
534 }
535
536 if !dedup_metrics.dedup_cost.is_zero() {
538 write!(f, ", \"dedup_metrics\":{:?}", dedup_metrics)?;
539 }
540
541 if let Some(file_metrics) = per_file_metrics
543 && !file_metrics.is_empty()
544 {
545 let mut heap = BinaryHeap::new();
547 for (file_id, metrics) in file_metrics.iter() {
548 let total_cost =
549 metrics.build_part_cost + metrics.build_reader_cost + metrics.scan_cost;
550
551 if total_cost.is_zero() && metrics.num_ranges == 0 {
554 continue;
555 }
556
557 if heap.len() < 10 {
558 heap.push(CompareCostReverse {
560 total_cost,
561 file_id: *file_id,
562 metrics,
563 });
564 } else if let Some(min_entry) = heap.peek() {
565 if total_cost > min_entry.total_cost {
567 heap.pop();
568 heap.push(CompareCostReverse {
569 total_cost,
570 file_id: *file_id,
571 metrics,
572 });
573 }
574 }
575 }
576
577 let top_files = heap.into_sorted_vec();
578 write!(f, ", \"top_file_metrics\": {{")?;
579 for (i, item) in top_files.iter().enumerate() {
580 let CompareCostReverse {
581 total_cost: _,
582 file_id,
583 metrics,
584 } = item;
585 if i > 0 {
586 write!(f, ", ")?;
587 }
588 write!(f, "\"{}\": {:?}", file_id, metrics)?;
589 }
590 write!(f, "}}")?;
591 }
592
593 write!(
594 f,
595 ", \"build_ranges_peak_mem_size\":{build_ranges_peak_mem_size}, \
596 \"num_peak_range_builders\":{num_peak_range_builders}, \
597 \"stream_eof\":{stream_eof}}}"
598 )
599 }
600}
601impl ScanMetricsSet {
602 fn with_prepare_scan_cost(mut self, cost: Duration) -> Self {
604 self.prepare_scan_cost += cost;
605 self
606 }
607
608 fn with_convert_cost(mut self, time: Time) -> Self {
610 self.convert_cost = Some(time);
611 self
612 }
613
614 fn merge_scanner_metrics(&mut self, other: &ScannerMetrics) {
616 let ScannerMetrics {
617 scan_cost,
618 yield_cost,
619 num_batches,
620 num_rows,
621 } = other;
622
623 self.scan_cost += *scan_cost;
624 self.yield_cost += *yield_cost;
625 self.num_rows += *num_rows;
626 self.num_batches += *num_batches;
627 }
628
629 fn merge_reader_metrics(&mut self, other: &ReaderMetrics) {
631 let ReaderMetrics {
632 build_cost,
633 filter_metrics:
634 ReaderFilterMetrics {
635 rg_total,
636 rg_fulltext_filtered,
637 rg_inverted_filtered,
638 rg_minmax_filtered,
639 rg_bloom_filtered,
640 rg_vector_filtered,
641 rows_total,
642 rows_fulltext_filtered,
643 rows_inverted_filtered,
644 rows_bloom_filtered,
645 rows_vector_filtered,
646 rows_vector_selected,
647 rows_precise_filtered,
648 fulltext_index_cache_hit,
649 fulltext_index_cache_miss,
650 inverted_index_cache_hit,
651 inverted_index_cache_miss,
652 bloom_filter_cache_hit,
653 bloom_filter_cache_miss,
654 minmax_cache_hit,
655 minmax_cache_miss,
656 pruner_cache_hit,
657 pruner_cache_miss,
658 pruner_prune_cost,
659 inverted_index_apply_metrics,
660 bloom_filter_apply_metrics,
661 fulltext_index_apply_metrics,
662 },
663 num_record_batches,
664 num_batches,
665 num_rows,
666 scan_cost,
667 metadata_cache_metrics,
668 fetch_metrics,
669 metadata_mem_size,
670 num_range_builders,
671 } = other;
672
673 self.build_parts_cost += *build_cost;
674 self.sst_scan_cost += *scan_cost;
675
676 self.rg_total += *rg_total;
677 self.rg_fulltext_filtered += *rg_fulltext_filtered;
678 self.rg_inverted_filtered += *rg_inverted_filtered;
679 self.rg_minmax_filtered += *rg_minmax_filtered;
680 self.rg_bloom_filtered += *rg_bloom_filtered;
681 self.rg_vector_filtered += *rg_vector_filtered;
682
683 self.rows_before_filter += *rows_total;
684 self.rows_fulltext_filtered += *rows_fulltext_filtered;
685 self.rows_inverted_filtered += *rows_inverted_filtered;
686 self.rows_bloom_filtered += *rows_bloom_filtered;
687 self.rows_vector_filtered += *rows_vector_filtered;
688 self.rows_vector_selected += *rows_vector_selected;
689 self.rows_precise_filtered += *rows_precise_filtered;
690
691 self.fulltext_index_cache_hit += *fulltext_index_cache_hit;
692 self.fulltext_index_cache_miss += *fulltext_index_cache_miss;
693 self.inverted_index_cache_hit += *inverted_index_cache_hit;
694 self.inverted_index_cache_miss += *inverted_index_cache_miss;
695 self.bloom_filter_cache_hit += *bloom_filter_cache_hit;
696 self.bloom_filter_cache_miss += *bloom_filter_cache_miss;
697 self.minmax_cache_hit += *minmax_cache_hit;
698 self.minmax_cache_miss += *minmax_cache_miss;
699 self.pruner_cache_hit += *pruner_cache_hit;
700 self.pruner_cache_miss += *pruner_cache_miss;
701 self.pruner_prune_cost += *pruner_prune_cost;
702
703 self.num_sst_record_batches += *num_record_batches;
704 self.num_sst_batches += *num_batches;
705 self.num_sst_rows += *num_rows;
706
707 if let Some(metrics) = inverted_index_apply_metrics {
709 self.inverted_index_apply_metrics
710 .get_or_insert_with(InvertedIndexApplyMetrics::default)
711 .merge_from(metrics);
712 }
713 if let Some(metrics) = bloom_filter_apply_metrics {
714 self.bloom_filter_apply_metrics
715 .get_or_insert_with(BloomFilterIndexApplyMetrics::default)
716 .merge_from(metrics);
717 }
718 if let Some(metrics) = fulltext_index_apply_metrics {
719 self.fulltext_index_apply_metrics
720 .get_or_insert_with(FulltextIndexApplyMetrics::default)
721 .merge_from(metrics);
722 }
723 if let Some(metrics) = fetch_metrics {
724 self.fetch_metrics
725 .get_or_insert_with(ParquetFetchMetrics::default)
726 .merge_from(metrics);
727 }
728 self.metadata_cache_metrics
729 .get_or_insert_with(MetadataCacheMetrics::default)
730 .merge_from(metadata_cache_metrics);
731
732 self.build_ranges_mem_size += *metadata_mem_size;
734 if self.build_ranges_mem_size > self.build_ranges_peak_mem_size {
735 self.build_ranges_peak_mem_size = self.build_ranges_mem_size;
736 }
737
738 self.num_range_builders += *num_range_builders;
740 if self.num_range_builders > self.num_peak_range_builders {
741 self.num_peak_range_builders = self.num_range_builders;
742 }
743 }
744
745 fn merge_per_file_metrics(&mut self, other: &HashMap<RegionFileId, FileScanMetrics>) {
747 let self_file_metrics = self.per_file_metrics.get_or_insert_with(HashMap::new);
748 for (file_id, metrics) in other {
749 self_file_metrics
750 .entry(*file_id)
751 .or_default()
752 .merge_from(metrics);
753 }
754 }
755
756 fn set_distributor_metrics(&mut self, distributor_metrics: &SeriesDistributorMetrics) {
758 let SeriesDistributorMetrics {
759 num_series_send_timeout,
760 num_series_send_full,
761 num_rows,
762 num_batches,
763 scan_cost,
764 yield_cost,
765 divider_cost,
766 } = distributor_metrics;
767
768 self.num_series_send_timeout += *num_series_send_timeout;
769 self.num_series_send_full += *num_series_send_full;
770 self.num_distributor_rows += *num_rows;
771 self.num_distributor_batches += *num_batches;
772 self.distributor_scan_cost += *scan_cost;
773 self.distributor_yield_cost += *yield_cost;
774 self.distributor_divider_cost += *divider_cost;
775 }
776
777 fn observe_metrics(&self) {
779 READ_STAGE_ELAPSED
780 .with_label_values(&["prepare_scan"])
781 .observe(self.prepare_scan_cost.as_secs_f64());
782 READ_STAGE_ELAPSED
783 .with_label_values(&["build_reader"])
784 .observe(self.build_reader_cost.as_secs_f64());
785 READ_STAGE_ELAPSED
786 .with_label_values(&["scan"])
787 .observe(self.scan_cost.as_secs_f64());
788 READ_STAGE_ELAPSED
789 .with_label_values(&["yield"])
790 .observe(self.yield_cost.as_secs_f64());
791 if let Some(time) = &self.convert_cost {
792 READ_STAGE_ELAPSED
793 .with_label_values(&["convert"])
794 .observe(Duration::from_nanos(time.value() as u64).as_secs_f64());
795 }
796 READ_STAGE_ELAPSED
797 .with_label_values(&["total"])
798 .observe(self.total_cost.as_secs_f64());
799 READ_ROWS_RETURN.observe(self.num_rows as f64);
800 READ_BATCHES_RETURN.observe(self.num_batches as f64);
801
802 READ_STAGE_ELAPSED
803 .with_label_values(&["build_parts"])
804 .observe(self.build_parts_cost.as_secs_f64());
805
806 READ_ROW_GROUPS_TOTAL
807 .with_label_values(&["before_filtering"])
808 .inc_by(self.rg_total as u64);
809 READ_ROW_GROUPS_TOTAL
810 .with_label_values(&["fulltext_index_filtered"])
811 .inc_by(self.rg_fulltext_filtered as u64);
812 READ_ROW_GROUPS_TOTAL
813 .with_label_values(&["inverted_index_filtered"])
814 .inc_by(self.rg_inverted_filtered as u64);
815 READ_ROW_GROUPS_TOTAL
816 .with_label_values(&["minmax_index_filtered"])
817 .inc_by(self.rg_minmax_filtered as u64);
818 READ_ROW_GROUPS_TOTAL
819 .with_label_values(&["bloom_filter_index_filtered"])
820 .inc_by(self.rg_bloom_filtered as u64);
821 #[cfg(feature = "vector_index")]
822 READ_ROW_GROUPS_TOTAL
823 .with_label_values(&["vector_index_filtered"])
824 .inc_by(self.rg_vector_filtered as u64);
825
826 PRECISE_FILTER_ROWS_TOTAL
827 .with_label_values(&["parquet"])
828 .inc_by(self.rows_precise_filtered as u64);
829 READ_ROWS_IN_ROW_GROUP_TOTAL
830 .with_label_values(&["before_filtering"])
831 .inc_by(self.rows_before_filter as u64);
832 READ_ROWS_IN_ROW_GROUP_TOTAL
833 .with_label_values(&["fulltext_index_filtered"])
834 .inc_by(self.rows_fulltext_filtered as u64);
835 READ_ROWS_IN_ROW_GROUP_TOTAL
836 .with_label_values(&["inverted_index_filtered"])
837 .inc_by(self.rows_inverted_filtered as u64);
838 READ_ROWS_IN_ROW_GROUP_TOTAL
839 .with_label_values(&["bloom_filter_index_filtered"])
840 .inc_by(self.rows_bloom_filtered as u64);
841 #[cfg(feature = "vector_index")]
842 READ_ROWS_IN_ROW_GROUP_TOTAL
843 .with_label_values(&["vector_index_filtered"])
844 .inc_by(self.rows_vector_filtered as u64);
845 }
846}
847
/// Shared state behind [`PartitionMetrics`].
///
/// Dropping it reports the collected metrics and logs them.
struct PartitionMetricsInner {
    /// Region this partition belongs to; included in the log line on drop.
    region_id: RegionId,
    /// Index of the partition.
    partition: usize,
    /// Scanner name used as a metric label and in the log line on drop.
    scanner_type: &'static str,
    /// Reference instant from which `first_poll` and `total_cost` are measured.
    query_start: Instant,
    /// When `true`, the final metrics are logged at info level instead of debug.
    explain_verbose: bool,
    /// Verbose metrics collected for this partition.
    metrics: Mutex<ScanMetricsSet>,
    /// Gauge that is decremented when this value is dropped.
    in_progress_scan: IntGauge,

    // Timers registered with the DataFusion execution plan metrics set.
    build_parts_cost: Time,
    build_reader_cost: Time,
    scan_cost: Time,
    yield_cost: Time,
    convert_cost: Time,
    elapsed_compute: Time,
}
876
877impl PartitionMetricsInner {
878 fn on_finish(&self, stream_eof: bool) {
879 let mut metrics = self.metrics.lock().unwrap();
880 if metrics.total_cost.is_zero() {
881 metrics.total_cost = self.query_start.elapsed();
882 }
883 if !metrics.stream_eof {
884 metrics.stream_eof = stream_eof;
885 }
886 }
887}
888
889impl MergeMetricsReport for PartitionMetricsInner {
890 fn report(&self, metrics: &mut MergeMetrics) {
891 let mut scan_metrics = self.metrics.lock().unwrap();
892 scan_metrics.merge_metrics.merge(metrics);
894
895 *metrics = MergeMetrics::default();
897 }
898}
899
900impl DedupMetricsReport for PartitionMetricsInner {
901 fn report(&self, metrics: &mut DedupMetrics) {
902 let mut scan_metrics = self.metrics.lock().unwrap();
903 scan_metrics.dedup_metrics.merge(metrics);
905
906 *metrics = DedupMetrics::default();
908 }
909}
910
911impl Drop for PartitionMetricsInner {
912 fn drop(&mut self) {
913 self.on_finish(false);
914 let metrics = self.metrics.lock().unwrap();
915 metrics.observe_metrics();
916 self.in_progress_scan.dec();
917
918 if self.explain_verbose {
919 common_telemetry::info!(
920 "{} finished, region_id: {}, partition: {}, scan_metrics: {:?}",
921 self.scanner_type,
922 self.region_id,
923 self.partition,
924 metrics,
925 );
926 } else {
927 common_telemetry::debug!(
928 "{} finished, region_id: {}, partition: {}, scan_metrics: {:?}",
929 self.scanner_type,
930 self.region_id,
931 self.partition,
932 metrics,
933 );
934 }
935 }
936}
937
/// List of [`PartitionMetrics`] indexed by partition number.
///
/// Slots of partitions that have not registered metrics hold `None`.
#[derive(Default)]
pub(crate) struct PartitionMetricsList(Mutex<Vec<Option<PartitionMetrics>>>);
941
942impl PartitionMetricsList {
943 pub(crate) fn set(&self, partition: usize, metrics: PartitionMetrics) {
945 let mut list = self.0.lock().unwrap();
946 if list.len() <= partition {
947 list.resize(partition + 1, None);
948 }
949 list[partition] = Some(metrics);
950 }
951
952 pub(crate) fn format_verbose_metrics(&self, f: &mut fmt::Formatter) -> fmt::Result {
954 let list = self.0.lock().unwrap();
955 write!(f, ", \"metrics_per_partition\": ")?;
956 f.debug_list()
957 .entries(list.iter().filter_map(|p| p.as_ref()))
958 .finish()?;
959 write!(f, "}}")
960 }
961}
962
/// Cheaply cloneable handle to the metrics of one scan partition.
///
/// All clones share the same inner state.
#[derive(Clone)]
pub struct PartitionMetrics(Arc<PartitionMetricsInner>);
966
967impl PartitionMetrics {
968 pub(crate) fn new(
969 region_id: RegionId,
970 partition: usize,
971 scanner_type: &'static str,
972 query_start: Instant,
973 explain_verbose: bool,
974 metrics_set: &ExecutionPlanMetricsSet,
975 ) -> Self {
976 let partition_str = partition.to_string();
977 let in_progress_scan = IN_PROGRESS_SCAN.with_label_values(&[scanner_type, &partition_str]);
978 in_progress_scan.inc();
979 let convert_cost = MetricBuilder::new(metrics_set).subset_time("convert_cost", partition);
980 let metrics = ScanMetricsSet::default()
981 .with_prepare_scan_cost(query_start.elapsed())
982 .with_convert_cost(convert_cost.clone());
983 let inner = PartitionMetricsInner {
984 region_id,
985 partition,
986 scanner_type,
987 query_start,
988 explain_verbose,
989 metrics: Mutex::new(metrics),
990 in_progress_scan,
991 build_parts_cost: MetricBuilder::new(metrics_set)
992 .subset_time("build_parts_cost", partition),
993 build_reader_cost: MetricBuilder::new(metrics_set)
994 .subset_time("build_reader_cost", partition),
995 scan_cost: MetricBuilder::new(metrics_set).subset_time("scan_cost", partition),
996 yield_cost: MetricBuilder::new(metrics_set).subset_time("yield_cost", partition),
997 convert_cost,
998 elapsed_compute: MetricBuilder::new(metrics_set).elapsed_compute(partition),
999 };
1000 Self(Arc::new(inner))
1001 }
1002
1003 pub(crate) fn on_first_poll(&self) {
1004 let mut metrics = self.0.metrics.lock().unwrap();
1005 metrics.first_poll = self.0.query_start.elapsed();
1006 }
1007
1008 pub(crate) fn inc_num_mem_ranges(&self, num: usize) {
1009 let mut metrics = self.0.metrics.lock().unwrap();
1010 metrics.num_mem_ranges += num;
1011 }
1012
1013 pub fn inc_num_file_ranges(&self, num: usize) {
1014 let mut metrics = self.0.metrics.lock().unwrap();
1015 metrics.num_file_ranges += num;
1016 }
1017
1018 fn record_elapsed_compute(&self, duration: Duration) {
1019 if duration.is_zero() {
1020 return;
1021 }
1022 self.0.elapsed_compute.add_duration(duration);
1023 }
1024
1025 pub(crate) fn inc_build_reader_cost(&self, cost: Duration) {
1027 self.0.build_reader_cost.add_duration(cost);
1028
1029 let mut metrics = self.0.metrics.lock().unwrap();
1030 metrics.build_reader_cost += cost;
1031 }
1032
1033 pub(crate) fn inc_convert_batch_cost(&self, cost: Duration) {
1034 self.0.convert_cost.add_duration(cost);
1035 self.record_elapsed_compute(cost);
1036 }
1037
1038 pub(crate) fn report_mem_scan_metrics(&self, data: &crate::memtable::MemScanMetricsData) {
1040 let mut metrics = self.0.metrics.lock().unwrap();
1041 metrics.mem_scan_cost += data.scan_cost;
1042 metrics.mem_rows += data.num_rows;
1043 metrics.mem_batches += data.num_batches;
1044 metrics.mem_series += data.total_series;
1045 }
1046
1047 pub(crate) fn merge_metrics(&self, metrics: &ScannerMetrics) {
1049 self.0.scan_cost.add_duration(metrics.scan_cost);
1050 self.record_elapsed_compute(metrics.scan_cost);
1051 self.0.yield_cost.add_duration(metrics.yield_cost);
1052 self.record_elapsed_compute(metrics.yield_cost);
1053
1054 let mut metrics_set = self.0.metrics.lock().unwrap();
1055 metrics_set.merge_scanner_metrics(metrics);
1056 }
1057
1058 pub fn merge_reader_metrics(
1060 &self,
1061 metrics: &ReaderMetrics,
1062 per_file_metrics: Option<&HashMap<RegionFileId, FileScanMetrics>>,
1063 ) {
1064 self.0.build_parts_cost.add_duration(metrics.build_cost);
1065
1066 let mut metrics_set = self.0.metrics.lock().unwrap();
1067 metrics_set.merge_reader_metrics(metrics);
1068
1069 if let Some(file_metrics) = per_file_metrics {
1071 metrics_set.merge_per_file_metrics(file_metrics);
1072 }
1073 }
1074
1075 pub(crate) fn on_finish(&self) {
1077 self.0.on_finish(true);
1078 }
1079
1080 pub(crate) fn set_distributor_metrics(&self, metrics: &SeriesDistributorMetrics) {
1082 let mut metrics_set = self.0.metrics.lock().unwrap();
1083 metrics_set.set_distributor_metrics(metrics);
1084 }
1085
1086 pub(crate) fn explain_verbose(&self) -> bool {
1088 self.0.explain_verbose
1089 }
1090
1091 pub(crate) fn merge_metrics_reporter(&self) -> Arc<dyn MergeMetricsReport> {
1093 self.0.clone()
1094 }
1095
1096 pub(crate) fn dedup_metrics_reporter(&self) -> Arc<dyn DedupMetricsReport> {
1098 self.0.clone()
1099 }
1100}
1101
1102impl fmt::Debug for PartitionMetrics {
1103 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1104 let metrics = self.0.metrics.lock().unwrap();
1105 write!(
1106 f,
1107 r#"{{"partition":{}, "metrics":{:?}}}"#,
1108 self.0.partition, metrics
1109 )
1110 }
1111}
1112
/// Metrics collected by the series distributor.
///
/// They are added to the partition metrics by
/// `PartitionMetrics::set_distributor_metrics`.
#[derive(Default)]
pub(crate) struct SeriesDistributorMetrics {
    /// Counter printed as `num_series_send_timeout`.
    pub(crate) num_series_send_timeout: usize,
    /// Counter printed as `num_series_send_full`.
    pub(crate) num_series_send_full: usize,
    /// Row count, printed as `num_distributor_rows`.
    pub(crate) num_rows: usize,
    /// Batch count, printed as `num_distributor_batches`.
    pub(crate) num_batches: usize,
    /// Scan time, printed as `distributor_scan_cost`.
    pub(crate) scan_cost: Duration,
    /// Yield time, printed as `distributor_yield_cost`.
    pub(crate) yield_cost: Duration,
    /// Divider time, printed as `distributor_divider_cost`.
    pub(crate) divider_cost: Duration,
}
1131
1132#[tracing::instrument(
1134 skip_all,
1135 fields(
1136 region_id = %stream_ctx.input.region_metadata().region_id,
1137 file_or_mem_index = %index.index,
1138 row_group_index = %index.row_group_index,
1139 source = "mem"
1140 )
1141)]
1142pub(crate) fn scan_mem_ranges(
1143 stream_ctx: Arc<StreamContext>,
1144 part_metrics: PartitionMetrics,
1145 index: RowGroupIndex,
1146 time_range: FileTimeRange,
1147) -> impl Stream<Item = Result<Batch>> {
1148 try_stream! {
1149 let ranges = stream_ctx.input.build_mem_ranges(index);
1150 part_metrics.inc_num_mem_ranges(ranges.len());
1151 for range in ranges {
1152 let build_reader_start = Instant::now();
1153 let mem_scan_metrics = Some(MemScanMetrics::default());
1154 let iter = range.build_prune_iter(time_range, mem_scan_metrics.clone())?;
1155 part_metrics.inc_build_reader_cost(build_reader_start.elapsed());
1156
1157 let mut source = Source::Iter(iter);
1158 while let Some(batch) = source.next_batch().await? {
1159 yield batch;
1160 }
1161
1162 if let Some(ref metrics) = mem_scan_metrics {
1164 let data = metrics.data();
1165 part_metrics.report_mem_scan_metrics(&data);
1166 }
1167 }
1168 }
1169}
1170
1171#[tracing::instrument(
1173 skip_all,
1174 fields(
1175 region_id = %stream_ctx.input.region_metadata().region_id,
1176 row_group_index = %index.index,
1177 source = "mem_flat"
1178 )
1179)]
1180pub(crate) fn scan_flat_mem_ranges(
1181 stream_ctx: Arc<StreamContext>,
1182 part_metrics: PartitionMetrics,
1183 index: RowGroupIndex,
1184 time_range: FileTimeRange,
1185) -> impl Stream<Item = Result<RecordBatch>> {
1186 try_stream! {
1187 let ranges = stream_ctx.input.build_mem_ranges(index);
1188 part_metrics.inc_num_mem_ranges(ranges.len());
1189 for range in ranges {
1190 let build_reader_start = Instant::now();
1191 let mem_scan_metrics = Some(MemScanMetrics::default());
1192 let mut iter = range.build_record_batch_iter(Some(time_range), mem_scan_metrics.clone())?;
1193 part_metrics.inc_build_reader_cost(build_reader_start.elapsed());
1194
1195 while let Some(record_batch) = iter.next().transpose()? {
1196 yield record_batch;
1197 }
1198
1199 if let Some(ref metrics) = mem_scan_metrics {
1201 let data = metrics.data();
1202 part_metrics.report_mem_scan_metrics(&data);
1203 }
1204 }
1205 }
1206}
1207
/// Minimum number of rows a file must have to take part in the split decision of
/// `should_split_flat_batches_for_merge`; equals the default row group size.
const SPLIT_ROW_THRESHOLD: u64 = DEFAULT_ROW_GROUP_SIZE as u64;
/// Sources with fewer series than this are always considered splittable by `can_split_series`.
const NUM_SERIES_THRESHOLD: u64 = 10240;
/// Minimum average number of rows per series required by `can_split_series` once the series
/// count reaches `NUM_SERIES_THRESHOLD`.
const BATCH_SIZE_THRESHOLD: u64 = 50;
1215
1216pub(crate) fn should_split_flat_batches_for_merge(
1218 stream_ctx: &Arc<StreamContext>,
1219 range_meta: &RangeMeta,
1220) -> bool {
1221 let mut num_files_to_split = 0;
1223 let mut num_mem_rows = 0;
1224 let mut num_mem_series = 0;
1225 for index in &range_meta.row_group_indices {
1229 if stream_ctx.is_mem_range_index(*index) {
1230 let memtable = &stream_ctx.input.memtables[index.index];
1231 let stats = memtable.stats();
1233 num_mem_rows += stats.num_rows();
1234 num_mem_series += stats.series_count();
1235 } else if stream_ctx.is_file_range_index(*index) {
1236 let file_index = index.index - stream_ctx.input.num_memtables();
1238 let file = &stream_ctx.input.files[file_index];
1239 if file.meta_ref().num_rows < SPLIT_ROW_THRESHOLD || file.meta_ref().num_series == 0 {
1240 continue;
1242 }
1243 debug_assert!(file.meta_ref().num_rows > 0);
1244 if !can_split_series(file.meta_ref().num_rows, file.meta_ref().num_series) {
1245 return false;
1247 } else {
1248 num_files_to_split += 1;
1249 }
1250 }
1251 }
1253
1254 if num_files_to_split > 0 {
1255 true
1257 } else if num_mem_series > 0 && num_mem_rows > 0 {
1258 can_split_series(num_mem_rows as u64, num_mem_series as u64)
1260 } else {
1261 false
1262 }
1263}
1264
1265fn can_split_series(num_rows: u64, num_series: u64) -> bool {
1266 assert!(num_series > 0);
1267 assert!(num_rows > 0);
1268
1269 num_series < NUM_SERIES_THRESHOLD || num_rows / num_series >= BATCH_SIZE_THRESHOLD
1271}
1272
1273fn new_filter_metrics(explain_verbose: bool) -> ReaderFilterMetrics {
1276 if explain_verbose {
1277 ReaderFilterMetrics {
1278 inverted_index_apply_metrics: Some(InvertedIndexApplyMetrics::default()),
1279 bloom_filter_apply_metrics: Some(BloomFilterIndexApplyMetrics::default()),
1280 fulltext_index_apply_metrics: Some(FulltextIndexApplyMetrics::default()),
1281 ..Default::default()
1282 }
1283 } else {
1284 ReaderFilterMetrics::default()
1285 }
1286}
1287
1288#[tracing::instrument(
1290 skip_all,
1291 fields(
1292 region_id = %stream_ctx.input.region_metadata().region_id,
1293 row_group_index = %index.index,
1294 source = read_type
1295 )
1296)]
1297pub(crate) async fn scan_file_ranges(
1298 stream_ctx: Arc<StreamContext>,
1299 part_metrics: PartitionMetrics,
1300 index: RowGroupIndex,
1301 read_type: &'static str,
1302 partition_pruner: Arc<PartitionPruner>,
1303) -> Result<impl Stream<Item = Result<Batch>>> {
1304 let mut reader_metrics = ReaderMetrics {
1305 filter_metrics: new_filter_metrics(part_metrics.explain_verbose()),
1306 ..Default::default()
1307 };
1308 let ranges = partition_pruner
1309 .build_file_ranges(index, &part_metrics, &mut reader_metrics)
1310 .await?;
1311 part_metrics.inc_num_file_ranges(ranges.len());
1312 part_metrics.merge_reader_metrics(&reader_metrics, None);
1313
1314 let init_per_file_metrics = if part_metrics.explain_verbose() {
1316 let file = stream_ctx.input.file_from_index(index);
1317 let file_id = file.file_id();
1318
1319 let mut map = HashMap::new();
1320 map.insert(
1321 file_id,
1322 FileScanMetrics {
1323 build_part_cost: reader_metrics.build_cost,
1324 ..Default::default()
1325 },
1326 );
1327 Some(map)
1328 } else {
1329 None
1330 };
1331
1332 Ok(build_file_range_scan_stream(
1333 stream_ctx,
1334 part_metrics,
1335 read_type,
1336 ranges,
1337 init_per_file_metrics,
1338 ))
1339}
1340
1341#[tracing::instrument(
1343 skip_all,
1344 fields(
1345 region_id = %stream_ctx.input.region_metadata().region_id,
1346 row_group_index = %index.index,
1347 source = read_type
1348 )
1349)]
1350pub(crate) async fn scan_flat_file_ranges(
1351 stream_ctx: Arc<StreamContext>,
1352 part_metrics: PartitionMetrics,
1353 index: RowGroupIndex,
1354 read_type: &'static str,
1355 partition_pruner: Arc<PartitionPruner>,
1356) -> Result<impl Stream<Item = Result<RecordBatch>>> {
1357 let mut reader_metrics = ReaderMetrics {
1358 filter_metrics: new_filter_metrics(part_metrics.explain_verbose()),
1359 ..Default::default()
1360 };
1361 let ranges = partition_pruner
1362 .build_file_ranges(index, &part_metrics, &mut reader_metrics)
1363 .await?;
1364 part_metrics.inc_num_file_ranges(ranges.len());
1365 part_metrics.merge_reader_metrics(&reader_metrics, None);
1366
1367 let init_per_file_metrics = if part_metrics.explain_verbose() {
1369 let file = stream_ctx.input.file_from_index(index);
1370 let file_id = file.file_id();
1371
1372 let mut map = HashMap::new();
1373 map.insert(
1374 file_id,
1375 FileScanMetrics {
1376 build_part_cost: reader_metrics.build_cost,
1377 ..Default::default()
1378 },
1379 );
1380 Some(map)
1381 } else {
1382 None
1383 };
1384
1385 Ok(build_flat_file_range_scan_stream(
1386 stream_ctx,
1387 part_metrics,
1388 read_type,
1389 ranges,
1390 init_per_file_metrics,
1391 ))
1392}
1393
1394#[tracing::instrument(
1396 skip_all,
1397 fields(read_type = read_type, range_count = ranges.len())
1398)]
1399pub fn build_file_range_scan_stream(
1400 stream_ctx: Arc<StreamContext>,
1401 part_metrics: PartitionMetrics,
1402 read_type: &'static str,
1403 ranges: SmallVec<[FileRange; 2]>,
1404 mut per_file_metrics: Option<HashMap<RegionFileId, FileScanMetrics>>,
1405) -> impl Stream<Item = Result<Batch>> {
1406 try_stream! {
1407 let fetch_metrics = if part_metrics.explain_verbose() {
1408 Some(Arc::new(ParquetFetchMetrics::default()))
1409 } else {
1410 None
1411 };
1412 let reader_metrics = &mut ReaderMetrics {
1413 fetch_metrics: fetch_metrics.clone(),
1414 ..Default::default()
1415 };
1416 for range in ranges {
1417 let build_reader_start = Instant::now();
1418 let Some(reader) = range.reader(stream_ctx.input.series_row_selector, fetch_metrics.as_deref()).await? else {
1419 continue;
1420 };
1421 let build_cost = build_reader_start.elapsed();
1422 part_metrics.inc_build_reader_cost(build_cost);
1423 let compat_batch = range.compat_batch();
1424 let mut source = Source::PruneReader(reader);
1425 while let Some(mut batch) = source.next_batch().await? {
1426 if let Some(compact_batch) = compat_batch {
1427 batch = compact_batch.as_primary_key().unwrap().compat_batch(batch)?;
1428 }
1429 yield batch;
1430 }
1431 if let Source::PruneReader(reader) = source {
1432 let prune_metrics = reader.metrics();
1433
1434 if let Some(file_metrics_map) = per_file_metrics.as_mut() {
1436 let file_id = range.file_handle().file_id();
1437 let file_metrics = file_metrics_map
1438 .entry(file_id)
1439 .or_insert_with(FileScanMetrics::default);
1440
1441 file_metrics.num_ranges += 1;
1442 file_metrics.num_rows += prune_metrics.num_rows;
1443 file_metrics.build_reader_cost += build_cost;
1444 file_metrics.scan_cost += prune_metrics.scan_cost;
1445 }
1446
1447 reader_metrics.merge_from(&prune_metrics);
1448 }
1449 }
1450
1451 reader_metrics.observe_rows(read_type);
1453 reader_metrics.filter_metrics.observe();
1454 part_metrics.merge_reader_metrics(reader_metrics, per_file_metrics.as_ref());
1455 }
1456}
1457
1458#[tracing::instrument(
1460 skip_all,
1461 fields(read_type = read_type, range_count = ranges.len())
1462)]
1463pub fn build_flat_file_range_scan_stream(
1464 _stream_ctx: Arc<StreamContext>,
1465 part_metrics: PartitionMetrics,
1466 read_type: &'static str,
1467 ranges: SmallVec<[FileRange; 2]>,
1468 mut per_file_metrics: Option<HashMap<RegionFileId, FileScanMetrics>>,
1469) -> impl Stream<Item = Result<RecordBatch>> {
1470 try_stream! {
1471 let fetch_metrics = if part_metrics.explain_verbose() {
1472 Some(Arc::new(ParquetFetchMetrics::default()))
1473 } else {
1474 None
1475 };
1476 let reader_metrics = &mut ReaderMetrics {
1477 fetch_metrics: fetch_metrics.clone(),
1478 ..Default::default()
1479 };
1480 for range in ranges {
1481 let build_reader_start = Instant::now();
1482 let Some(mut reader) = range.flat_reader(_stream_ctx.input.series_row_selector, fetch_metrics.as_deref()).await? else{continue};
1483 let build_cost = build_reader_start.elapsed();
1484 part_metrics.inc_build_reader_cost(build_cost);
1485
1486 let may_compat = range
1487 .compat_batch()
1488 .map(|compat| {
1489 compat.as_flat().context(UnexpectedSnafu {
1490 reason: "Invalid compat for flat format",
1491 })
1492 })
1493 .transpose()?;
1494
1495 let mapper = range.compaction_projection_mapper();
1496 while let Some(record_batch) = reader.next_batch()? {
1497 let record_batch = if let Some(mapper) = mapper {
1498 let batch = mapper.project(record_batch)?;
1499 batch
1500 } else {
1501 record_batch
1502 };
1503
1504 if let Some(flat_compat) = may_compat {
1505 let batch = flat_compat.compat(record_batch)?;
1506 yield batch;
1507 } else {
1508 yield record_batch;
1509 }
1510 }
1511
1512 let prune_metrics = reader.metrics();
1513
1514 if let Some(file_metrics_map) = per_file_metrics.as_mut() {
1516 let file_id = range.file_handle().file_id();
1517 let file_metrics = file_metrics_map
1518 .entry(file_id)
1519 .or_insert_with(FileScanMetrics::default);
1520
1521 file_metrics.num_ranges += 1;
1522 file_metrics.num_rows += prune_metrics.num_rows;
1523 file_metrics.build_reader_cost += build_cost;
1524 file_metrics.scan_cost += prune_metrics.scan_cost;
1525 }
1526
1527 reader_metrics.merge_from(&prune_metrics);
1528 }
1529
1530 reader_metrics.observe_rows(read_type);
1532 reader_metrics.filter_metrics.observe();
1533 part_metrics.merge_reader_metrics(reader_metrics, per_file_metrics.as_ref());
1534 }
1535}
1536
/// Reads the extension range at `index.index` and returns its batch stream.
///
/// Only compiled with the `enterprise` feature.
///
/// # Errors
/// A failure of the extension reader is wrapped with `ScanExternalRangeSnafu`.
#[cfg(feature = "enterprise")]
pub(crate) async fn scan_extension_range(
    context: Arc<StreamContext>,
    index: RowGroupIndex,
    partition_metrics: PartitionMetrics,
) -> Result<BoxedBatchStream> {
    use snafu::ResultExt;

    let reader = context
        .input
        .extension_range(index.index)
        .reader(context.as_ref());
    reader
        .read(context, partition_metrics, index)
        .await
        .context(crate::error::ScanExternalRangeSnafu)
}
1554
1555pub(crate) async fn maybe_scan_other_ranges(
1556 context: &Arc<StreamContext>,
1557 index: RowGroupIndex,
1558 metrics: &PartitionMetrics,
1559) -> Result<BoxedBatchStream> {
1560 #[cfg(feature = "enterprise")]
1561 {
1562 scan_extension_range(context.clone(), index, metrics.clone()).await
1563 }
1564
1565 #[cfg(not(feature = "enterprise"))]
1566 {
1567 let _ = context;
1568 let _ = index;
1569 let _ = metrics;
1570
1571 crate::error::UnexpectedSnafu {
1572 reason: "no other ranges scannable",
1573 }
1574 .fail()
1575 }
1576}
1577
/// Reads the extension range at `index.index` with its flat reader and returns the record
/// batch stream.
///
/// Only compiled with the `enterprise` feature.
///
/// # Errors
/// A failure of the extension reader is wrapped with `ScanExternalRangeSnafu`.
#[cfg(feature = "enterprise")]
pub(crate) async fn scan_flat_extension_range(
    context: Arc<StreamContext>,
    index: RowGroupIndex,
    partition_metrics: PartitionMetrics,
) -> Result<BoxedRecordBatchStream> {
    use snafu::ResultExt;

    let reader = context
        .input
        .extension_range(index.index)
        .flat_reader(context.as_ref());
    reader
        .read(context, partition_metrics, index)
        .await
        .context(crate::error::ScanExternalRangeSnafu)
}
1595
1596pub(crate) async fn maybe_scan_flat_other_ranges(
1597 context: &Arc<StreamContext>,
1598 index: RowGroupIndex,
1599 metrics: &PartitionMetrics,
1600) -> Result<BoxedRecordBatchStream> {
1601 #[cfg(feature = "enterprise")]
1602 {
1603 scan_flat_extension_range(context.clone(), index, metrics.clone()).await
1604 }
1605
1606 #[cfg(not(feature = "enterprise"))]
1607 {
1608 let _ = context;
1609 let _ = index;
1610 let _ = metrics;
1611
1612 crate::error::UnexpectedSnafu {
1613 reason: "no other ranges scannable in flat format",
1614 }
1615 .fail()
1616 }
1617}
1618
/// Stream adapter that passes every record batch of the inner stream through
/// `split_record_batch` and yields the resulting pieces one at a time.
pub(crate) struct SplitRecordBatchStream<S> {
    /// The wrapped stream of record batches.
    inner: S,
    /// Pieces of already split batches that have not been yielded yet.
    batches: VecDeque<RecordBatch>,
}
1626
1627impl<S> SplitRecordBatchStream<S> {
1628 pub(crate) fn new(inner: S) -> Self {
1630 Self {
1631 inner,
1632 batches: VecDeque::new(),
1633 }
1634 }
1635}
1636
1637impl<S> Stream for SplitRecordBatchStream<S>
1638where
1639 S: Stream<Item = Result<RecordBatch>> + Unpin,
1640{
1641 type Item = Result<RecordBatch>;
1642
1643 fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
1644 loop {
1645 if let Some(batch) = self.batches.pop_front() {
1647 return Poll::Ready(Some(Ok(batch)));
1648 }
1649
1650 let record_batch = match futures::ready!(Pin::new(&mut self.inner).poll_next(cx)) {
1652 Some(Ok(batch)) => batch,
1653 Some(Err(e)) => return Poll::Ready(Some(Err(e))),
1654 None => return Poll::Ready(None),
1655 };
1656
1657 split_record_batch(record_batch, &mut self.batches);
1659 }
1661 }
1662}
1663
1664pub(crate) fn split_record_batch(record_batch: RecordBatch, batches: &mut VecDeque<RecordBatch>) {
1669 let batch_rows = record_batch.num_rows();
1670 if batch_rows == 0 {
1671 return;
1672 }
1673 if batch_rows < 2 {
1674 batches.push_back(record_batch);
1675 return;
1676 }
1677
1678 let time_index_pos = time_index_column_index(record_batch.num_columns());
1679 let timestamps = record_batch.column(time_index_pos);
1680 let (ts_values, _unit) = timestamp_array_to_primitive(timestamps).unwrap();
1681 let mut offsets = Vec::with_capacity(16);
1682 offsets.push(0);
1683 let values = ts_values.values();
1684 for (i, &value) in values.iter().take(batch_rows - 1).enumerate() {
1685 if value > values[i + 1] {
1686 offsets.push(i + 1);
1687 }
1688 }
1689 offsets.push(values.len());
1690
1691 for (i, &start) in offsets[..offsets.len() - 1].iter().enumerate() {
1693 let end = offsets[i + 1];
1694 let rows_in_batch = end - start;
1695 batches.push_back(record_batch.slice(start, rows_in_batch));
1696 }
1697}