mito2/read/
scan_util.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Utilities for scanners.
16
17use std::collections::{BinaryHeap, HashMap, VecDeque};
18use std::fmt;
19use std::pin::Pin;
20use std::sync::{Arc, Mutex};
21use std::task::{Context, Poll};
22use std::time::{Duration, Instant};
23
24use async_stream::try_stream;
25use common_telemetry::tracing;
26use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder, Time};
27use datatypes::arrow::record_batch::RecordBatch;
28use datatypes::timestamp::timestamp_array_to_primitive;
29use futures::Stream;
30use prometheus::IntGauge;
31use smallvec::SmallVec;
32use snafu::OptionExt;
33use store_api::storage::RegionId;
34
35use crate::error::{Result, UnexpectedSnafu};
36use crate::memtable::MemScanMetrics;
37use crate::metrics::{
38    IN_PROGRESS_SCAN, PRECISE_FILTER_ROWS_TOTAL, READ_BATCHES_RETURN, READ_ROW_GROUPS_TOTAL,
39    READ_ROWS_IN_ROW_GROUP_TOTAL, READ_ROWS_RETURN, READ_STAGE_ELAPSED,
40};
41use crate::read::dedup::{DedupMetrics, DedupMetricsReport};
42use crate::read::merge::{MergeMetrics, MergeMetricsReport};
43use crate::read::pruner::PartitionPruner;
44use crate::read::range::{RangeMeta, RowGroupIndex};
45use crate::read::scan_region::StreamContext;
46use crate::read::{Batch, BoxedBatchStream, BoxedRecordBatchStream, ScannerMetrics, Source};
47use crate::sst::file::{FileTimeRange, RegionFileId};
48use crate::sst::index::bloom_filter::applier::BloomFilterIndexApplyMetrics;
49use crate::sst::index::fulltext_index::applier::FulltextIndexApplyMetrics;
50use crate::sst::index::inverted_index::applier::InvertedIndexApplyMetrics;
51use crate::sst::parquet::DEFAULT_ROW_GROUP_SIZE;
52use crate::sst::parquet::file_range::FileRange;
53use crate::sst::parquet::flat_format::time_index_column_index;
54use crate::sst::parquet::reader::{MetadataCacheMetrics, ReaderFilterMetrics, ReaderMetrics};
55use crate::sst::parquet::row_group::ParquetFetchMetrics;
56
/// Scan metrics collected for a single file.
#[derive(Default, Clone)]
pub struct FileScanMetrics {
    /// Number of ranges (row groups) read from this file.
    pub num_ranges: usize,
    /// Number of rows read from this file.
    pub num_rows: usize,
    /// Time spent building file ranges/parts (file-level preparation).
    pub build_part_cost: Duration,
    /// Time spent building readers for this file (accumulated across all ranges).
    pub build_reader_cost: Duration,
    /// Time spent scanning this file (accumulated across all ranges).
    pub scan_cost: Duration,
}

impl fmt::Debug for FileScanMetrics {
    /// Renders the metrics as a compact JSON-like object.
    ///
    /// `build_part_cost` is always written; every other field is written only
    /// when it is non-zero.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            num_ranges,
            num_rows,
            build_part_cost,
            build_reader_cost,
            scan_cost,
        } = self;

        write!(f, "{{\"build_part_cost\":\"{build_part_cost:?}\"")?;

        // Counters first, then durations: the key order is part of the output.
        let counters = [("num_ranges", *num_ranges), ("num_rows", *num_rows)];
        for (key, count) in counters {
            if count > 0 {
                write!(f, ", \"{key}\":{count}")?;
            }
        }

        let costs = [
            ("build_reader_cost", *build_reader_cost),
            ("scan_cost", *scan_cost),
        ];
        for (key, cost) in costs {
            if !cost.is_zero() {
                write!(f, ", \"{key}\":\"{cost:?}\"")?;
            }
        }

        f.write_str("}")
    }
}

impl FileScanMetrics {
    /// Adds every counter and duration of `other` to this [`FileScanMetrics`].
    pub(crate) fn merge_from(&mut self, other: &FileScanMetrics) {
        // Exhaustive pattern: adding a field without merging it fails to compile.
        let &FileScanMetrics {
            num_ranges,
            num_rows,
            build_part_cost,
            build_reader_cost,
            scan_cost,
        } = other;

        self.build_part_cost += build_part_cost;
        self.build_reader_cost += build_reader_cost;
        self.scan_cost += scan_cost;
        self.num_ranges += num_ranges;
        self.num_rows += num_rows;
    }
}
107
/// Verbose scan metrics for a partition.
///
/// The `Debug` implementation renders these fields as a single JSON-like object:
/// core fields are always written, most counters and durations only when non-zero.
#[derive(Default)]
pub(crate) struct ScanMetricsSet {
    /// Duration to prepare the scan task.
    prepare_scan_cost: Duration,
    /// Duration to build the (merge) reader.
    build_reader_cost: Duration,
    /// Duration to scan data.
    scan_cost: Duration,
    /// Duration while waiting for `yield`.
    yield_cost: Duration,
    /// Duration to convert [`Batch`]es, `None` until a timer is attached.
    convert_cost: Option<Time>,
    /// Duration of the scan.
    total_cost: Duration,
    /// Number of rows returned.
    num_rows: usize,
    /// Number of batches returned.
    num_batches: usize,
    /// Number of mem ranges scanned.
    num_mem_ranges: usize,
    /// Number of file ranges scanned.
    num_file_ranges: usize,

    // Memtable related metrics:
    /// Duration to scan memtables.
    mem_scan_cost: Duration,
    /// Number of rows read from memtables.
    mem_rows: usize,
    /// Number of batches read from memtables.
    mem_batches: usize,
    /// Number of series read from memtables.
    mem_series: usize,

    // SST related metrics:
    /// Duration to build file ranges.
    build_parts_cost: Duration,
    /// Duration to scan SST files.
    sst_scan_cost: Duration,
    /// Number of row groups before filtering.
    rg_total: usize,
    /// Number of row groups filtered by fulltext index.
    rg_fulltext_filtered: usize,
    /// Number of row groups filtered by inverted index.
    rg_inverted_filtered: usize,
    /// Number of row groups filtered by min-max index.
    rg_minmax_filtered: usize,
    /// Number of row groups filtered by bloom filter index.
    rg_bloom_filtered: usize,
    /// Number of row groups filtered by vector index.
    rg_vector_filtered: usize,
    /// Number of rows in row groups before filtering.
    rows_before_filter: usize,
    /// Number of rows in row groups filtered by fulltext index.
    rows_fulltext_filtered: usize,
    /// Number of rows in row groups filtered by inverted index.
    rows_inverted_filtered: usize,
    /// Number of rows in row groups filtered by bloom filter index.
    rows_bloom_filtered: usize,
    /// Number of rows filtered by vector index.
    rows_vector_filtered: usize,
    /// Number of rows selected by vector index.
    rows_vector_selected: usize,
    /// Number of rows filtered by precise filter.
    rows_precise_filtered: usize,
    /// Number of index result cache hits for fulltext index.
    fulltext_index_cache_hit: usize,
    /// Number of index result cache misses for fulltext index.
    fulltext_index_cache_miss: usize,
    /// Number of index result cache hits for inverted index.
    inverted_index_cache_hit: usize,
    /// Number of index result cache misses for inverted index.
    inverted_index_cache_miss: usize,
    /// Number of index result cache hits for bloom filter index.
    bloom_filter_cache_hit: usize,
    /// Number of index result cache misses for bloom filter index.
    bloom_filter_cache_miss: usize,
    /// Number of index result cache hits for minmax pruning.
    minmax_cache_hit: usize,
    /// Number of index result cache misses for minmax pruning.
    minmax_cache_miss: usize,
    /// Number of pruner builder cache hits.
    pruner_cache_hit: usize,
    /// Number of pruner builder cache misses.
    pruner_cache_miss: usize,
    /// Duration spent waiting for the pruner to build file ranges.
    pruner_prune_cost: Duration,
    /// Number of record batches read from SST.
    num_sst_record_batches: usize,
    /// Number of batches decoded from SST.
    num_sst_batches: usize,
    /// Number of rows read from SST.
    num_sst_rows: usize,

    /// Elapsed time before the first poll operation.
    first_poll: Duration,

    /// Number of send timeouts in SeriesScan.
    num_series_send_timeout: usize,
    /// Number of "send full" events in SeriesScan.
    num_series_send_full: usize,
    /// Number of rows the series distributor scanned.
    num_distributor_rows: usize,
    /// Number of batches the series distributor scanned.
    num_distributor_batches: usize,
    /// Duration of the series distributor to scan.
    distributor_scan_cost: Duration,
    /// Duration of the series distributor to yield.
    distributor_yield_cost: Duration,
    /// Duration spent in divider operations.
    distributor_divider_cost: Duration,

    /// Merge metrics.
    merge_metrics: MergeMetrics,
    /// Dedup metrics.
    dedup_metrics: DedupMetrics,

    /// Whether the stream reached EOF.
    stream_eof: bool,

    // Optional verbose metrics:
    /// Inverted index apply metrics.
    inverted_index_apply_metrics: Option<InvertedIndexApplyMetrics>,
    /// Bloom filter index apply metrics.
    bloom_filter_apply_metrics: Option<BloomFilterIndexApplyMetrics>,
    /// Fulltext index apply metrics.
    fulltext_index_apply_metrics: Option<FulltextIndexApplyMetrics>,
    /// Parquet fetch metrics.
    fetch_metrics: Option<ParquetFetchMetrics>,
    /// Metadata cache metrics.
    metadata_cache_metrics: Option<MetadataCacheMetrics>,
    /// Per-file scan metrics, only populated when explain_verbose is true.
    /// The `Debug` output lists at most the 10 files with the highest total cost.
    per_file_metrics: Option<HashMap<RegionFileId, FileScanMetrics>>,

    /// Current memory usage for file range builders.
    /// Signed because merged deltas are added to it; not part of the `Debug` output.
    build_ranges_mem_size: isize,
    /// Peak memory usage for file range builders.
    build_ranges_peak_mem_size: isize,
    /// Current number of file range builders.
    /// Signed because merged deltas are added to it; not part of the `Debug` output.
    num_range_builders: isize,
    /// Peak number of file range builders.
    num_peak_range_builders: isize,
}
251
252/// Wrapper for file metrics that compares by total cost in reverse order.
253/// This allows using BinaryHeap as a min-heap for efficient top-K selection.
254struct CompareCostReverse<'a> {
255    total_cost: Duration,
256    file_id: RegionFileId,
257    metrics: &'a FileScanMetrics,
258}
259
260impl Ord for CompareCostReverse<'_> {
261    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
262        // Reverse comparison: smaller costs are "greater"
263        other.total_cost.cmp(&self.total_cost)
264    }
265}
266
267impl PartialOrd for CompareCostReverse<'_> {
268    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
269        Some(self.cmp(other))
270    }
271}
272
273impl Eq for CompareCostReverse<'_> {}
274
275impl PartialEq for CompareCostReverse<'_> {
276    fn eq(&self, other: &Self) -> bool {
277        self.total_cost == other.total_cost
278    }
279}
280
281impl fmt::Debug for ScanMetricsSet {
282    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
283        let ScanMetricsSet {
284            prepare_scan_cost,
285            build_reader_cost,
286            scan_cost,
287            yield_cost,
288            convert_cost,
289            total_cost,
290            num_rows,
291            num_batches,
292            num_mem_ranges,
293            num_file_ranges,
294            build_parts_cost,
295            sst_scan_cost,
296            rg_total,
297            rg_fulltext_filtered,
298            rg_inverted_filtered,
299            rg_minmax_filtered,
300            rg_bloom_filtered,
301            rg_vector_filtered,
302            rows_before_filter,
303            rows_fulltext_filtered,
304            rows_inverted_filtered,
305            rows_bloom_filtered,
306            rows_vector_filtered,
307            rows_vector_selected,
308            rows_precise_filtered,
309            fulltext_index_cache_hit,
310            fulltext_index_cache_miss,
311            inverted_index_cache_hit,
312            inverted_index_cache_miss,
313            bloom_filter_cache_hit,
314            bloom_filter_cache_miss,
315            minmax_cache_hit,
316            minmax_cache_miss,
317            pruner_cache_hit,
318            pruner_cache_miss,
319            pruner_prune_cost,
320            num_sst_record_batches,
321            num_sst_batches,
322            num_sst_rows,
323            first_poll,
324            num_series_send_timeout,
325            num_series_send_full,
326            num_distributor_rows,
327            num_distributor_batches,
328            distributor_scan_cost,
329            distributor_yield_cost,
330            distributor_divider_cost,
331            merge_metrics,
332            dedup_metrics,
333            stream_eof,
334            mem_scan_cost,
335            mem_rows,
336            mem_batches,
337            mem_series,
338            inverted_index_apply_metrics,
339            bloom_filter_apply_metrics,
340            fulltext_index_apply_metrics,
341            fetch_metrics,
342            metadata_cache_metrics,
343            per_file_metrics,
344            build_ranges_mem_size: _,
345            build_ranges_peak_mem_size,
346            num_range_builders: _,
347            num_peak_range_builders,
348        } = self;
349
350        // Write core metrics
351        write!(
352            f,
353            "{{\"prepare_scan_cost\":\"{prepare_scan_cost:?}\", \
354            \"build_reader_cost\":\"{build_reader_cost:?}\", \
355            \"scan_cost\":\"{scan_cost:?}\", \
356            \"yield_cost\":\"{yield_cost:?}\", \
357            \"total_cost\":\"{total_cost:?}\", \
358            \"num_rows\":{num_rows}, \
359            \"num_batches\":{num_batches}, \
360            \"num_mem_ranges\":{num_mem_ranges}, \
361            \"num_file_ranges\":{num_file_ranges}, \
362            \"build_parts_cost\":\"{build_parts_cost:?}\", \
363            \"sst_scan_cost\":\"{sst_scan_cost:?}\", \
364            \"rg_total\":{rg_total}, \
365            \"rows_before_filter\":{rows_before_filter}, \
366            \"num_sst_record_batches\":{num_sst_record_batches}, \
367            \"num_sst_batches\":{num_sst_batches}, \
368            \"num_sst_rows\":{num_sst_rows}, \
369            \"first_poll\":\"{first_poll:?}\""
370        )?;
371
372        // Write convert_cost if present
373        if let Some(time) = convert_cost {
374            let duration = Duration::from_nanos(time.value() as u64);
375            write!(f, ", \"convert_cost\":\"{duration:?}\"")?;
376        }
377
378        // Write non-zero filter counters
379        if *rg_fulltext_filtered > 0 {
380            write!(f, ", \"rg_fulltext_filtered\":{rg_fulltext_filtered}")?;
381        }
382        if *rg_inverted_filtered > 0 {
383            write!(f, ", \"rg_inverted_filtered\":{rg_inverted_filtered}")?;
384        }
385        if *rg_minmax_filtered > 0 {
386            write!(f, ", \"rg_minmax_filtered\":{rg_minmax_filtered}")?;
387        }
388        if *rg_bloom_filtered > 0 {
389            write!(f, ", \"rg_bloom_filtered\":{rg_bloom_filtered}")?;
390        }
391        if *rg_vector_filtered > 0 {
392            write!(f, ", \"rg_vector_filtered\":{rg_vector_filtered}")?;
393        }
394        if *rows_fulltext_filtered > 0 {
395            write!(f, ", \"rows_fulltext_filtered\":{rows_fulltext_filtered}")?;
396        }
397        if *rows_inverted_filtered > 0 {
398            write!(f, ", \"rows_inverted_filtered\":{rows_inverted_filtered}")?;
399        }
400        if *rows_bloom_filtered > 0 {
401            write!(f, ", \"rows_bloom_filtered\":{rows_bloom_filtered}")?;
402        }
403        if *rows_vector_filtered > 0 {
404            write!(f, ", \"rows_vector_filtered\":{rows_vector_filtered}")?;
405        }
406        if *rows_vector_selected > 0 {
407            write!(f, ", \"rows_vector_selected\":{rows_vector_selected}")?;
408        }
409        if *rows_precise_filtered > 0 {
410            write!(f, ", \"rows_precise_filtered\":{rows_precise_filtered}")?;
411        }
412        if *fulltext_index_cache_hit > 0 {
413            write!(
414                f,
415                ", \"fulltext_index_cache_hit\":{fulltext_index_cache_hit}"
416            )?;
417        }
418        if *fulltext_index_cache_miss > 0 {
419            write!(
420                f,
421                ", \"fulltext_index_cache_miss\":{fulltext_index_cache_miss}"
422            )?;
423        }
424        if *inverted_index_cache_hit > 0 {
425            write!(
426                f,
427                ", \"inverted_index_cache_hit\":{inverted_index_cache_hit}"
428            )?;
429        }
430        if *inverted_index_cache_miss > 0 {
431            write!(
432                f,
433                ", \"inverted_index_cache_miss\":{inverted_index_cache_miss}"
434            )?;
435        }
436        if *bloom_filter_cache_hit > 0 {
437            write!(f, ", \"bloom_filter_cache_hit\":{bloom_filter_cache_hit}")?;
438        }
439        if *bloom_filter_cache_miss > 0 {
440            write!(f, ", \"bloom_filter_cache_miss\":{bloom_filter_cache_miss}")?;
441        }
442        if *minmax_cache_hit > 0 {
443            write!(f, ", \"minmax_cache_hit\":{minmax_cache_hit}")?;
444        }
445        if *minmax_cache_miss > 0 {
446            write!(f, ", \"minmax_cache_miss\":{minmax_cache_miss}")?;
447        }
448        if *pruner_cache_hit > 0 {
449            write!(f, ", \"pruner_cache_hit\":{pruner_cache_hit}")?;
450        }
451        if *pruner_cache_miss > 0 {
452            write!(f, ", \"pruner_cache_miss\":{pruner_cache_miss}")?;
453        }
454        if !pruner_prune_cost.is_zero() {
455            write!(f, ", \"pruner_prune_cost\":\"{pruner_prune_cost:?}\"")?;
456        }
457
458        // Write non-zero distributor metrics
459        if *num_series_send_timeout > 0 {
460            write!(f, ", \"num_series_send_timeout\":{num_series_send_timeout}")?;
461        }
462        if *num_series_send_full > 0 {
463            write!(f, ", \"num_series_send_full\":{num_series_send_full}")?;
464        }
465        if *num_distributor_rows > 0 {
466            write!(f, ", \"num_distributor_rows\":{num_distributor_rows}")?;
467        }
468        if *num_distributor_batches > 0 {
469            write!(f, ", \"num_distributor_batches\":{num_distributor_batches}")?;
470        }
471        if !distributor_scan_cost.is_zero() {
472            write!(
473                f,
474                ", \"distributor_scan_cost\":\"{distributor_scan_cost:?}\""
475            )?;
476        }
477        if !distributor_yield_cost.is_zero() {
478            write!(
479                f,
480                ", \"distributor_yield_cost\":\"{distributor_yield_cost:?}\""
481            )?;
482        }
483        if !distributor_divider_cost.is_zero() {
484            write!(
485                f,
486                ", \"distributor_divider_cost\":\"{distributor_divider_cost:?}\""
487            )?;
488        }
489
490        // Write non-zero memtable metrics
491        if *mem_rows > 0 {
492            write!(f, ", \"mem_rows\":{mem_rows}")?;
493        }
494        if *mem_batches > 0 {
495            write!(f, ", \"mem_batches\":{mem_batches}")?;
496        }
497        if *mem_series > 0 {
498            write!(f, ", \"mem_series\":{mem_series}")?;
499        }
500        if !mem_scan_cost.is_zero() {
501            write!(f, ", \"mem_scan_cost\":\"{mem_scan_cost:?}\"")?;
502        }
503
504        // Write optional verbose metrics if they are not empty
505        if let Some(metrics) = inverted_index_apply_metrics
506            && !metrics.is_empty()
507        {
508            write!(f, ", \"inverted_index_apply_metrics\":{:?}", metrics)?;
509        }
510        if let Some(metrics) = bloom_filter_apply_metrics
511            && !metrics.is_empty()
512        {
513            write!(f, ", \"bloom_filter_apply_metrics\":{:?}", metrics)?;
514        }
515        if let Some(metrics) = fulltext_index_apply_metrics
516            && !metrics.is_empty()
517        {
518            write!(f, ", \"fulltext_index_apply_metrics\":{:?}", metrics)?;
519        }
520        if let Some(metrics) = fetch_metrics
521            && !metrics.is_empty()
522        {
523            write!(f, ", \"fetch_metrics\":{:?}", metrics)?;
524        }
525        if let Some(metrics) = metadata_cache_metrics
526            && !metrics.is_empty()
527        {
528            write!(f, ", \"metadata_cache_metrics\":{:?}", metrics)?;
529        }
530
531        // Write merge metrics if not empty
532        if !merge_metrics.scan_cost.is_zero() {
533            write!(f, ", \"merge_metrics\":{:?}", merge_metrics)?;
534        }
535
536        // Write dedup metrics if not empty
537        if !dedup_metrics.dedup_cost.is_zero() {
538            write!(f, ", \"dedup_metrics\":{:?}", dedup_metrics)?;
539        }
540
541        // Write top file metrics if present and non-empty
542        if let Some(file_metrics) = per_file_metrics
543            && !file_metrics.is_empty()
544        {
545            // Use min-heap (BinaryHeap with reverse comparison) to keep only top 10
546            let mut heap = BinaryHeap::new();
547            for (file_id, metrics) in file_metrics.iter() {
548                let total_cost =
549                    metrics.build_part_cost + metrics.build_reader_cost + metrics.scan_cost;
550
551                // If the file has been pruned by a pruner, the build part cost may be zero.
552                // If we didn't read any ranges from it, we don't output the file.
553                if total_cost.is_zero() && metrics.num_ranges == 0 {
554                    continue;
555                }
556
557                if heap.len() < 10 {
558                    // Haven't reached 10 yet, just push
559                    heap.push(CompareCostReverse {
560                        total_cost,
561                        file_id: *file_id,
562                        metrics,
563                    });
564                } else if let Some(min_entry) = heap.peek() {
565                    // If current cost is higher than the minimum in our top-10, replace it
566                    if total_cost > min_entry.total_cost {
567                        heap.pop();
568                        heap.push(CompareCostReverse {
569                            total_cost,
570                            file_id: *file_id,
571                            metrics,
572                        });
573                    }
574                }
575            }
576
577            let top_files = heap.into_sorted_vec();
578            write!(f, ", \"top_file_metrics\": {{")?;
579            for (i, item) in top_files.iter().enumerate() {
580                let CompareCostReverse {
581                    total_cost: _,
582                    file_id,
583                    metrics,
584                } = item;
585                if i > 0 {
586                    write!(f, ", ")?;
587                }
588                write!(f, "\"{}\": {:?}", file_id, metrics)?;
589            }
590            write!(f, "}}")?;
591        }
592
593        write!(
594            f,
595            ", \"build_ranges_peak_mem_size\":{build_ranges_peak_mem_size}, \
596             \"num_peak_range_builders\":{num_peak_range_builders}, \
597             \"stream_eof\":{stream_eof}}}"
598        )
599    }
600}
601impl ScanMetricsSet {
602    /// Attaches the `prepare_scan_cost` to the metrics set.
603    fn with_prepare_scan_cost(mut self, cost: Duration) -> Self {
604        self.prepare_scan_cost += cost;
605        self
606    }
607
608    /// Attaches the `convert_cost` to the metrics set.
609    fn with_convert_cost(mut self, time: Time) -> Self {
610        self.convert_cost = Some(time);
611        self
612    }
613
614    /// Merges the local scanner metrics.
615    fn merge_scanner_metrics(&mut self, other: &ScannerMetrics) {
616        let ScannerMetrics {
617            scan_cost,
618            yield_cost,
619            num_batches,
620            num_rows,
621        } = other;
622
623        self.scan_cost += *scan_cost;
624        self.yield_cost += *yield_cost;
625        self.num_rows += *num_rows;
626        self.num_batches += *num_batches;
627    }
628
629    /// Merges the local reader metrics.
630    fn merge_reader_metrics(&mut self, other: &ReaderMetrics) {
631        let ReaderMetrics {
632            build_cost,
633            filter_metrics:
634                ReaderFilterMetrics {
635                    rg_total,
636                    rg_fulltext_filtered,
637                    rg_inverted_filtered,
638                    rg_minmax_filtered,
639                    rg_bloom_filtered,
640                    rg_vector_filtered,
641                    rows_total,
642                    rows_fulltext_filtered,
643                    rows_inverted_filtered,
644                    rows_bloom_filtered,
645                    rows_vector_filtered,
646                    rows_vector_selected,
647                    rows_precise_filtered,
648                    fulltext_index_cache_hit,
649                    fulltext_index_cache_miss,
650                    inverted_index_cache_hit,
651                    inverted_index_cache_miss,
652                    bloom_filter_cache_hit,
653                    bloom_filter_cache_miss,
654                    minmax_cache_hit,
655                    minmax_cache_miss,
656                    pruner_cache_hit,
657                    pruner_cache_miss,
658                    pruner_prune_cost,
659                    inverted_index_apply_metrics,
660                    bloom_filter_apply_metrics,
661                    fulltext_index_apply_metrics,
662                },
663            num_record_batches,
664            num_batches,
665            num_rows,
666            scan_cost,
667            metadata_cache_metrics,
668            fetch_metrics,
669            metadata_mem_size,
670            num_range_builders,
671        } = other;
672
673        self.build_parts_cost += *build_cost;
674        self.sst_scan_cost += *scan_cost;
675
676        self.rg_total += *rg_total;
677        self.rg_fulltext_filtered += *rg_fulltext_filtered;
678        self.rg_inverted_filtered += *rg_inverted_filtered;
679        self.rg_minmax_filtered += *rg_minmax_filtered;
680        self.rg_bloom_filtered += *rg_bloom_filtered;
681        self.rg_vector_filtered += *rg_vector_filtered;
682
683        self.rows_before_filter += *rows_total;
684        self.rows_fulltext_filtered += *rows_fulltext_filtered;
685        self.rows_inverted_filtered += *rows_inverted_filtered;
686        self.rows_bloom_filtered += *rows_bloom_filtered;
687        self.rows_vector_filtered += *rows_vector_filtered;
688        self.rows_vector_selected += *rows_vector_selected;
689        self.rows_precise_filtered += *rows_precise_filtered;
690
691        self.fulltext_index_cache_hit += *fulltext_index_cache_hit;
692        self.fulltext_index_cache_miss += *fulltext_index_cache_miss;
693        self.inverted_index_cache_hit += *inverted_index_cache_hit;
694        self.inverted_index_cache_miss += *inverted_index_cache_miss;
695        self.bloom_filter_cache_hit += *bloom_filter_cache_hit;
696        self.bloom_filter_cache_miss += *bloom_filter_cache_miss;
697        self.minmax_cache_hit += *minmax_cache_hit;
698        self.minmax_cache_miss += *minmax_cache_miss;
699        self.pruner_cache_hit += *pruner_cache_hit;
700        self.pruner_cache_miss += *pruner_cache_miss;
701        self.pruner_prune_cost += *pruner_prune_cost;
702
703        self.num_sst_record_batches += *num_record_batches;
704        self.num_sst_batches += *num_batches;
705        self.num_sst_rows += *num_rows;
706
707        // Merge optional verbose metrics
708        if let Some(metrics) = inverted_index_apply_metrics {
709            self.inverted_index_apply_metrics
710                .get_or_insert_with(InvertedIndexApplyMetrics::default)
711                .merge_from(metrics);
712        }
713        if let Some(metrics) = bloom_filter_apply_metrics {
714            self.bloom_filter_apply_metrics
715                .get_or_insert_with(BloomFilterIndexApplyMetrics::default)
716                .merge_from(metrics);
717        }
718        if let Some(metrics) = fulltext_index_apply_metrics {
719            self.fulltext_index_apply_metrics
720                .get_or_insert_with(FulltextIndexApplyMetrics::default)
721                .merge_from(metrics);
722        }
723        if let Some(metrics) = fetch_metrics {
724            self.fetch_metrics
725                .get_or_insert_with(ParquetFetchMetrics::default)
726                .merge_from(metrics);
727        }
728        self.metadata_cache_metrics
729            .get_or_insert_with(MetadataCacheMetrics::default)
730            .merge_from(metadata_cache_metrics);
731
732        // Track memory usage and update peak.
733        self.build_ranges_mem_size += *metadata_mem_size;
734        if self.build_ranges_mem_size > self.build_ranges_peak_mem_size {
735            self.build_ranges_peak_mem_size = self.build_ranges_mem_size;
736        }
737
738        // Track number of builders and update peak.
739        self.num_range_builders += *num_range_builders;
740        if self.num_range_builders > self.num_peak_range_builders {
741            self.num_peak_range_builders = self.num_range_builders;
742        }
743    }
744
745    /// Merges per-file metrics.
746    fn merge_per_file_metrics(&mut self, other: &HashMap<RegionFileId, FileScanMetrics>) {
747        let self_file_metrics = self.per_file_metrics.get_or_insert_with(HashMap::new);
748        for (file_id, metrics) in other {
749            self_file_metrics
750                .entry(*file_id)
751                .or_default()
752                .merge_from(metrics);
753        }
754    }
755
756    /// Sets distributor metrics.
757    fn set_distributor_metrics(&mut self, distributor_metrics: &SeriesDistributorMetrics) {
758        let SeriesDistributorMetrics {
759            num_series_send_timeout,
760            num_series_send_full,
761            num_rows,
762            num_batches,
763            scan_cost,
764            yield_cost,
765            divider_cost,
766        } = distributor_metrics;
767
768        self.num_series_send_timeout += *num_series_send_timeout;
769        self.num_series_send_full += *num_series_send_full;
770        self.num_distributor_rows += *num_rows;
771        self.num_distributor_batches += *num_batches;
772        self.distributor_scan_cost += *scan_cost;
773        self.distributor_yield_cost += *yield_cost;
774        self.distributor_divider_cost += *divider_cost;
775    }
776
777    /// Observes metrics.
778    fn observe_metrics(&self) {
779        READ_STAGE_ELAPSED
780            .with_label_values(&["prepare_scan"])
781            .observe(self.prepare_scan_cost.as_secs_f64());
782        READ_STAGE_ELAPSED
783            .with_label_values(&["build_reader"])
784            .observe(self.build_reader_cost.as_secs_f64());
785        READ_STAGE_ELAPSED
786            .with_label_values(&["scan"])
787            .observe(self.scan_cost.as_secs_f64());
788        READ_STAGE_ELAPSED
789            .with_label_values(&["yield"])
790            .observe(self.yield_cost.as_secs_f64());
791        if let Some(time) = &self.convert_cost {
792            READ_STAGE_ELAPSED
793                .with_label_values(&["convert"])
794                .observe(Duration::from_nanos(time.value() as u64).as_secs_f64());
795        }
796        READ_STAGE_ELAPSED
797            .with_label_values(&["total"])
798            .observe(self.total_cost.as_secs_f64());
799        READ_ROWS_RETURN.observe(self.num_rows as f64);
800        READ_BATCHES_RETURN.observe(self.num_batches as f64);
801
802        READ_STAGE_ELAPSED
803            .with_label_values(&["build_parts"])
804            .observe(self.build_parts_cost.as_secs_f64());
805
806        READ_ROW_GROUPS_TOTAL
807            .with_label_values(&["before_filtering"])
808            .inc_by(self.rg_total as u64);
809        READ_ROW_GROUPS_TOTAL
810            .with_label_values(&["fulltext_index_filtered"])
811            .inc_by(self.rg_fulltext_filtered as u64);
812        READ_ROW_GROUPS_TOTAL
813            .with_label_values(&["inverted_index_filtered"])
814            .inc_by(self.rg_inverted_filtered as u64);
815        READ_ROW_GROUPS_TOTAL
816            .with_label_values(&["minmax_index_filtered"])
817            .inc_by(self.rg_minmax_filtered as u64);
818        READ_ROW_GROUPS_TOTAL
819            .with_label_values(&["bloom_filter_index_filtered"])
820            .inc_by(self.rg_bloom_filtered as u64);
821        #[cfg(feature = "vector_index")]
822        READ_ROW_GROUPS_TOTAL
823            .with_label_values(&["vector_index_filtered"])
824            .inc_by(self.rg_vector_filtered as u64);
825
826        PRECISE_FILTER_ROWS_TOTAL
827            .with_label_values(&["parquet"])
828            .inc_by(self.rows_precise_filtered as u64);
829        READ_ROWS_IN_ROW_GROUP_TOTAL
830            .with_label_values(&["before_filtering"])
831            .inc_by(self.rows_before_filter as u64);
832        READ_ROWS_IN_ROW_GROUP_TOTAL
833            .with_label_values(&["fulltext_index_filtered"])
834            .inc_by(self.rows_fulltext_filtered as u64);
835        READ_ROWS_IN_ROW_GROUP_TOTAL
836            .with_label_values(&["inverted_index_filtered"])
837            .inc_by(self.rows_inverted_filtered as u64);
838        READ_ROWS_IN_ROW_GROUP_TOTAL
839            .with_label_values(&["bloom_filter_index_filtered"])
840            .inc_by(self.rows_bloom_filtered as u64);
841        #[cfg(feature = "vector_index")]
842        READ_ROWS_IN_ROW_GROUP_TOTAL
843            .with_label_values(&["vector_index_filtered"])
844            .inc_by(self.rows_vector_filtered as u64);
845    }
846}
847
848struct PartitionMetricsInner {
849    region_id: RegionId,
850    /// Index of the partition to scan.
851    partition: usize,
852    /// Label to distinguish different scan operation.
853    scanner_type: &'static str,
854    /// Query start time.
855    query_start: Instant,
856    /// Whether to use verbose logging.
857    explain_verbose: bool,
858    /// Verbose scan metrics that only log to debug logs by default.
859    metrics: Mutex<ScanMetricsSet>,
860    in_progress_scan: IntGauge,
861
862    // Normal metrics that always report to the [ExecutionPlanMetricsSet]:
863    /// Duration to build file ranges.
864    build_parts_cost: Time,
865    /// Duration to build the (merge) reader.
866    build_reader_cost: Time,
867    /// Duration to scan data.
868    scan_cost: Time,
869    /// Duration while waiting for `yield`.
870    yield_cost: Time,
871    /// Duration to convert [`Batch`]es.
872    convert_cost: Time,
873    /// Aggregated compute time reported to DataFusion.
874    elapsed_compute: Time,
875}
876
877impl PartitionMetricsInner {
878    fn on_finish(&self, stream_eof: bool) {
879        let mut metrics = self.metrics.lock().unwrap();
880        if metrics.total_cost.is_zero() {
881            metrics.total_cost = self.query_start.elapsed();
882        }
883        if !metrics.stream_eof {
884            metrics.stream_eof = stream_eof;
885        }
886    }
887}
888
889impl MergeMetricsReport for PartitionMetricsInner {
890    fn report(&self, metrics: &mut MergeMetrics) {
891        let mut scan_metrics = self.metrics.lock().unwrap();
892        // Merge the metrics into scan_metrics
893        scan_metrics.merge_metrics.merge(metrics);
894
895        // Reset the input metrics
896        *metrics = MergeMetrics::default();
897    }
898}
899
900impl DedupMetricsReport for PartitionMetricsInner {
901    fn report(&self, metrics: &mut DedupMetrics) {
902        let mut scan_metrics = self.metrics.lock().unwrap();
903        // Merge the metrics into scan_metrics
904        scan_metrics.dedup_metrics.merge(metrics);
905
906        // Reset the input metrics
907        *metrics = DedupMetrics::default();
908    }
909}
910
911impl Drop for PartitionMetricsInner {
912    fn drop(&mut self) {
913        self.on_finish(false);
914        let metrics = self.metrics.lock().unwrap();
915        metrics.observe_metrics();
916        self.in_progress_scan.dec();
917
918        if self.explain_verbose {
919            common_telemetry::info!(
920                "{} finished, region_id: {}, partition: {}, scan_metrics: {:?}",
921                self.scanner_type,
922                self.region_id,
923                self.partition,
924                metrics,
925            );
926        } else {
927            common_telemetry::debug!(
928                "{} finished, region_id: {}, partition: {}, scan_metrics: {:?}",
929                self.scanner_type,
930                self.region_id,
931                self.partition,
932                metrics,
933            );
934        }
935    }
936}
937
938/// List of PartitionMetrics.
939#[derive(Default)]
940pub(crate) struct PartitionMetricsList(Mutex<Vec<Option<PartitionMetrics>>>);
941
942impl PartitionMetricsList {
943    /// Sets a new [PartitionMetrics] at the specified partition.
944    pub(crate) fn set(&self, partition: usize, metrics: PartitionMetrics) {
945        let mut list = self.0.lock().unwrap();
946        if list.len() <= partition {
947            list.resize(partition + 1, None);
948        }
949        list[partition] = Some(metrics);
950    }
951
952    /// Format verbose metrics for each partition for explain.
953    pub(crate) fn format_verbose_metrics(&self, f: &mut fmt::Formatter) -> fmt::Result {
954        let list = self.0.lock().unwrap();
955        write!(f, ", \"metrics_per_partition\": ")?;
956        f.debug_list()
957            .entries(list.iter().filter_map(|p| p.as_ref()))
958            .finish()?;
959        write!(f, "}}")
960    }
961}
962
963/// Metrics while reading a partition.
964#[derive(Clone)]
965pub struct PartitionMetrics(Arc<PartitionMetricsInner>);
966
967impl PartitionMetrics {
968    pub(crate) fn new(
969        region_id: RegionId,
970        partition: usize,
971        scanner_type: &'static str,
972        query_start: Instant,
973        explain_verbose: bool,
974        metrics_set: &ExecutionPlanMetricsSet,
975    ) -> Self {
976        let partition_str = partition.to_string();
977        let in_progress_scan = IN_PROGRESS_SCAN.with_label_values(&[scanner_type, &partition_str]);
978        in_progress_scan.inc();
979        let convert_cost = MetricBuilder::new(metrics_set).subset_time("convert_cost", partition);
980        let metrics = ScanMetricsSet::default()
981            .with_prepare_scan_cost(query_start.elapsed())
982            .with_convert_cost(convert_cost.clone());
983        let inner = PartitionMetricsInner {
984            region_id,
985            partition,
986            scanner_type,
987            query_start,
988            explain_verbose,
989            metrics: Mutex::new(metrics),
990            in_progress_scan,
991            build_parts_cost: MetricBuilder::new(metrics_set)
992                .subset_time("build_parts_cost", partition),
993            build_reader_cost: MetricBuilder::new(metrics_set)
994                .subset_time("build_reader_cost", partition),
995            scan_cost: MetricBuilder::new(metrics_set).subset_time("scan_cost", partition),
996            yield_cost: MetricBuilder::new(metrics_set).subset_time("yield_cost", partition),
997            convert_cost,
998            elapsed_compute: MetricBuilder::new(metrics_set).elapsed_compute(partition),
999        };
1000        Self(Arc::new(inner))
1001    }
1002
1003    pub(crate) fn on_first_poll(&self) {
1004        let mut metrics = self.0.metrics.lock().unwrap();
1005        metrics.first_poll = self.0.query_start.elapsed();
1006    }
1007
1008    pub(crate) fn inc_num_mem_ranges(&self, num: usize) {
1009        let mut metrics = self.0.metrics.lock().unwrap();
1010        metrics.num_mem_ranges += num;
1011    }
1012
1013    pub fn inc_num_file_ranges(&self, num: usize) {
1014        let mut metrics = self.0.metrics.lock().unwrap();
1015        metrics.num_file_ranges += num;
1016    }
1017
1018    fn record_elapsed_compute(&self, duration: Duration) {
1019        if duration.is_zero() {
1020            return;
1021        }
1022        self.0.elapsed_compute.add_duration(duration);
1023    }
1024
1025    /// Merges `build_reader_cost`.
1026    pub(crate) fn inc_build_reader_cost(&self, cost: Duration) {
1027        self.0.build_reader_cost.add_duration(cost);
1028
1029        let mut metrics = self.0.metrics.lock().unwrap();
1030        metrics.build_reader_cost += cost;
1031    }
1032
1033    pub(crate) fn inc_convert_batch_cost(&self, cost: Duration) {
1034        self.0.convert_cost.add_duration(cost);
1035        self.record_elapsed_compute(cost);
1036    }
1037
1038    /// Reports memtable scan metrics.
1039    pub(crate) fn report_mem_scan_metrics(&self, data: &crate::memtable::MemScanMetricsData) {
1040        let mut metrics = self.0.metrics.lock().unwrap();
1041        metrics.mem_scan_cost += data.scan_cost;
1042        metrics.mem_rows += data.num_rows;
1043        metrics.mem_batches += data.num_batches;
1044        metrics.mem_series += data.total_series;
1045    }
1046
1047    /// Merges [ScannerMetrics], `build_reader_cost`, `scan_cost` and `yield_cost`.
1048    pub(crate) fn merge_metrics(&self, metrics: &ScannerMetrics) {
1049        self.0.scan_cost.add_duration(metrics.scan_cost);
1050        self.record_elapsed_compute(metrics.scan_cost);
1051        self.0.yield_cost.add_duration(metrics.yield_cost);
1052        self.record_elapsed_compute(metrics.yield_cost);
1053
1054        let mut metrics_set = self.0.metrics.lock().unwrap();
1055        metrics_set.merge_scanner_metrics(metrics);
1056    }
1057
1058    /// Merges [ReaderMetrics] and `build_reader_cost`.
1059    pub fn merge_reader_metrics(
1060        &self,
1061        metrics: &ReaderMetrics,
1062        per_file_metrics: Option<&HashMap<RegionFileId, FileScanMetrics>>,
1063    ) {
1064        self.0.build_parts_cost.add_duration(metrics.build_cost);
1065
1066        let mut metrics_set = self.0.metrics.lock().unwrap();
1067        metrics_set.merge_reader_metrics(metrics);
1068
1069        // Merge per-file metrics if provided
1070        if let Some(file_metrics) = per_file_metrics {
1071            metrics_set.merge_per_file_metrics(file_metrics);
1072        }
1073    }
1074
1075    /// Finishes the query.
1076    pub(crate) fn on_finish(&self) {
1077        self.0.on_finish(true);
1078    }
1079
1080    /// Sets the distributor metrics.
1081    pub(crate) fn set_distributor_metrics(&self, metrics: &SeriesDistributorMetrics) {
1082        let mut metrics_set = self.0.metrics.lock().unwrap();
1083        metrics_set.set_distributor_metrics(metrics);
1084    }
1085
1086    /// Returns whether verbose explain is enabled.
1087    pub(crate) fn explain_verbose(&self) -> bool {
1088        self.0.explain_verbose
1089    }
1090
1091    /// Returns a MergeMetricsReport trait object for reporting merge metrics.
1092    pub(crate) fn merge_metrics_reporter(&self) -> Arc<dyn MergeMetricsReport> {
1093        self.0.clone()
1094    }
1095
1096    /// Returns a DedupMetricsReport trait object for reporting dedup metrics.
1097    pub(crate) fn dedup_metrics_reporter(&self) -> Arc<dyn DedupMetricsReport> {
1098        self.0.clone()
1099    }
1100}
1101
1102impl fmt::Debug for PartitionMetrics {
1103    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1104        let metrics = self.0.metrics.lock().unwrap();
1105        write!(
1106            f,
1107            r#"{{"partition":{}, "metrics":{:?}}}"#,
1108            self.0.partition, metrics
1109        )
1110    }
1111}
1112
/// Metrics collected by the series distributor.
///
/// All fields start at zero through the derived `Default`.
#[derive(Default)]
pub(crate) struct SeriesDistributorMetrics {
    /// Number of send timeouts in SeriesScan.
    pub(crate) num_series_send_timeout: usize,
    /// Number of sends that hit a full channel in SeriesScan.
    pub(crate) num_series_send_full: usize,
    /// Number of rows the series distributor scanned.
    pub(crate) num_rows: usize,
    /// Number of batches the series distributor scanned.
    pub(crate) num_batches: usize,
    /// Time the series distributor spent scanning.
    pub(crate) scan_cost: Duration,
    /// Time the series distributor spent yielding.
    pub(crate) yield_cost: Duration,
    /// Time spent in divider operations.
    pub(crate) divider_cost: Duration,
}
1131
1132/// Scans memtable ranges at `index`.
1133#[tracing::instrument(
1134    skip_all,
1135    fields(
1136        region_id = %stream_ctx.input.region_metadata().region_id,
1137        file_or_mem_index = %index.index,
1138        row_group_index = %index.row_group_index,
1139        source = "mem"
1140    )
1141)]
1142pub(crate) fn scan_mem_ranges(
1143    stream_ctx: Arc<StreamContext>,
1144    part_metrics: PartitionMetrics,
1145    index: RowGroupIndex,
1146    time_range: FileTimeRange,
1147) -> impl Stream<Item = Result<Batch>> {
1148    try_stream! {
1149        let ranges = stream_ctx.input.build_mem_ranges(index);
1150        part_metrics.inc_num_mem_ranges(ranges.len());
1151        for range in ranges {
1152            let build_reader_start = Instant::now();
1153            let mem_scan_metrics = Some(MemScanMetrics::default());
1154            let iter = range.build_prune_iter(time_range, mem_scan_metrics.clone())?;
1155            part_metrics.inc_build_reader_cost(build_reader_start.elapsed());
1156
1157            let mut source = Source::Iter(iter);
1158            while let Some(batch) = source.next_batch().await? {
1159                yield batch;
1160            }
1161
1162            // Report the memtable scan metrics to partition metrics
1163            if let Some(ref metrics) = mem_scan_metrics {
1164                let data = metrics.data();
1165                part_metrics.report_mem_scan_metrics(&data);
1166            }
1167        }
1168    }
1169}
1170
1171/// Scans memtable ranges at `index` using flat format that returns RecordBatch.
1172#[tracing::instrument(
1173    skip_all,
1174    fields(
1175        region_id = %stream_ctx.input.region_metadata().region_id,
1176        row_group_index = %index.index,
1177        source = "mem_flat"
1178    )
1179)]
1180pub(crate) fn scan_flat_mem_ranges(
1181    stream_ctx: Arc<StreamContext>,
1182    part_metrics: PartitionMetrics,
1183    index: RowGroupIndex,
1184    time_range: FileTimeRange,
1185) -> impl Stream<Item = Result<RecordBatch>> {
1186    try_stream! {
1187        let ranges = stream_ctx.input.build_mem_ranges(index);
1188        part_metrics.inc_num_mem_ranges(ranges.len());
1189        for range in ranges {
1190            let build_reader_start = Instant::now();
1191            let mem_scan_metrics = Some(MemScanMetrics::default());
1192            let mut iter = range.build_record_batch_iter(Some(time_range), mem_scan_metrics.clone())?;
1193            part_metrics.inc_build_reader_cost(build_reader_start.elapsed());
1194
1195            while let Some(record_batch) = iter.next().transpose()? {
1196                yield record_batch;
1197            }
1198
1199            // Report the memtable scan metrics to partition metrics
1200            if let Some(ref metrics) = mem_scan_metrics {
1201                let data = metrics.data();
1202                part_metrics.report_mem_scan_metrics(&data);
1203            }
1204        }
1205    }
1206}
1207
/// Minimum number of rows a file needs to take part in the split estimation.
/// `should_split_flat_batches_for_merge` skips files with fewer rows.
const SPLIT_ROW_THRESHOLD: u64 = DEFAULT_ROW_GROUP_SIZE as u64;
/// Number of series threshold for splitting batches. Below this number of
/// series, `can_split_series` always allows splitting.
const NUM_SERIES_THRESHOLD: u64 = 10240;
/// Minimum batch size after splitting, compared against `num_rows / num_series`.
/// The batch size is less than 60 because a series may only have 60 samples per hour.
const BATCH_SIZE_THRESHOLD: u64 = 50;
1215
1216/// Returns true if splitting flat record batches may improve merge performance.
1217pub(crate) fn should_split_flat_batches_for_merge(
1218    stream_ctx: &Arc<StreamContext>,
1219    range_meta: &RangeMeta,
1220) -> bool {
1221    // Number of files to split and scan.
1222    let mut num_files_to_split = 0;
1223    let mut num_mem_rows = 0;
1224    let mut num_mem_series = 0;
1225    // Checks each file range, returns early if any range is not splittable.
1226    // For mem ranges, we collect the total number of rows and series because the number of rows in a
1227    // mem range may be too small.
1228    for index in &range_meta.row_group_indices {
1229        if stream_ctx.is_mem_range_index(*index) {
1230            let memtable = &stream_ctx.input.memtables[index.index];
1231            // Is mem range
1232            let stats = memtable.stats();
1233            num_mem_rows += stats.num_rows();
1234            num_mem_series += stats.series_count();
1235        } else if stream_ctx.is_file_range_index(*index) {
1236            // This is a file range.
1237            let file_index = index.index - stream_ctx.input.num_memtables();
1238            let file = &stream_ctx.input.files[file_index];
1239            if file.meta_ref().num_rows < SPLIT_ROW_THRESHOLD || file.meta_ref().num_series == 0 {
1240                // If the file doesn't have enough rows, or the number of series is unavailable, skips it.
1241                continue;
1242            }
1243            debug_assert!(file.meta_ref().num_rows > 0);
1244            if !can_split_series(file.meta_ref().num_rows, file.meta_ref().num_series) {
1245                // We can't split batches in a file.
1246                return false;
1247            } else {
1248                num_files_to_split += 1;
1249            }
1250        }
1251        // Skips non-file and non-mem ranges.
1252    }
1253
1254    if num_files_to_split > 0 {
1255        // We mainly consider file ranges because they have enough data for sampling.
1256        true
1257    } else if num_mem_series > 0 && num_mem_rows > 0 {
1258        // If we don't have files to scan, we check whether to split by the memtable.
1259        can_split_series(num_mem_rows as u64, num_mem_series as u64)
1260    } else {
1261        false
1262    }
1263}
1264
1265fn can_split_series(num_rows: u64, num_series: u64) -> bool {
1266    assert!(num_series > 0);
1267    assert!(num_rows > 0);
1268
1269    // It doesn't have too many series or it will have enough rows for each batch.
1270    num_series < NUM_SERIES_THRESHOLD || num_rows / num_series >= BATCH_SIZE_THRESHOLD
1271}
1272
1273/// Creates a new [ReaderFilterMetrics] with optional apply metrics initialized
1274/// based on the `explain_verbose` flag.
1275fn new_filter_metrics(explain_verbose: bool) -> ReaderFilterMetrics {
1276    if explain_verbose {
1277        ReaderFilterMetrics {
1278            inverted_index_apply_metrics: Some(InvertedIndexApplyMetrics::default()),
1279            bloom_filter_apply_metrics: Some(BloomFilterIndexApplyMetrics::default()),
1280            fulltext_index_apply_metrics: Some(FulltextIndexApplyMetrics::default()),
1281            ..Default::default()
1282        }
1283    } else {
1284        ReaderFilterMetrics::default()
1285    }
1286}
1287
1288/// Scans file ranges at `index`.
1289#[tracing::instrument(
1290    skip_all,
1291    fields(
1292        region_id = %stream_ctx.input.region_metadata().region_id,
1293        row_group_index = %index.index,
1294        source = read_type
1295    )
1296)]
1297pub(crate) async fn scan_file_ranges(
1298    stream_ctx: Arc<StreamContext>,
1299    part_metrics: PartitionMetrics,
1300    index: RowGroupIndex,
1301    read_type: &'static str,
1302    partition_pruner: Arc<PartitionPruner>,
1303) -> Result<impl Stream<Item = Result<Batch>>> {
1304    let mut reader_metrics = ReaderMetrics {
1305        filter_metrics: new_filter_metrics(part_metrics.explain_verbose()),
1306        ..Default::default()
1307    };
1308    let ranges = partition_pruner
1309        .build_file_ranges(index, &part_metrics, &mut reader_metrics)
1310        .await?;
1311    part_metrics.inc_num_file_ranges(ranges.len());
1312    part_metrics.merge_reader_metrics(&reader_metrics, None);
1313
1314    // Creates initial per-file metrics with build_part_cost.
1315    let init_per_file_metrics = if part_metrics.explain_verbose() {
1316        let file = stream_ctx.input.file_from_index(index);
1317        let file_id = file.file_id();
1318
1319        let mut map = HashMap::new();
1320        map.insert(
1321            file_id,
1322            FileScanMetrics {
1323                build_part_cost: reader_metrics.build_cost,
1324                ..Default::default()
1325            },
1326        );
1327        Some(map)
1328    } else {
1329        None
1330    };
1331
1332    Ok(build_file_range_scan_stream(
1333        stream_ctx,
1334        part_metrics,
1335        read_type,
1336        ranges,
1337        init_per_file_metrics,
1338    ))
1339}
1340
1341/// Scans file ranges at `index` using flat reader that returns RecordBatch.
1342#[tracing::instrument(
1343    skip_all,
1344    fields(
1345        region_id = %stream_ctx.input.region_metadata().region_id,
1346        row_group_index = %index.index,
1347        source = read_type
1348    )
1349)]
1350pub(crate) async fn scan_flat_file_ranges(
1351    stream_ctx: Arc<StreamContext>,
1352    part_metrics: PartitionMetrics,
1353    index: RowGroupIndex,
1354    read_type: &'static str,
1355    partition_pruner: Arc<PartitionPruner>,
1356) -> Result<impl Stream<Item = Result<RecordBatch>>> {
1357    let mut reader_metrics = ReaderMetrics {
1358        filter_metrics: new_filter_metrics(part_metrics.explain_verbose()),
1359        ..Default::default()
1360    };
1361    let ranges = partition_pruner
1362        .build_file_ranges(index, &part_metrics, &mut reader_metrics)
1363        .await?;
1364    part_metrics.inc_num_file_ranges(ranges.len());
1365    part_metrics.merge_reader_metrics(&reader_metrics, None);
1366
1367    // Creates initial per-file metrics with build_part_cost.
1368    let init_per_file_metrics = if part_metrics.explain_verbose() {
1369        let file = stream_ctx.input.file_from_index(index);
1370        let file_id = file.file_id();
1371
1372        let mut map = HashMap::new();
1373        map.insert(
1374            file_id,
1375            FileScanMetrics {
1376                build_part_cost: reader_metrics.build_cost,
1377                ..Default::default()
1378            },
1379        );
1380        Some(map)
1381    } else {
1382        None
1383    };
1384
1385    Ok(build_flat_file_range_scan_stream(
1386        stream_ctx,
1387        part_metrics,
1388        read_type,
1389        ranges,
1390        init_per_file_metrics,
1391    ))
1392}
1393
1394/// Build the stream of scanning the input [`FileRange`]s.
1395#[tracing::instrument(
1396    skip_all,
1397    fields(read_type = read_type, range_count = ranges.len())
1398)]
1399pub fn build_file_range_scan_stream(
1400    stream_ctx: Arc<StreamContext>,
1401    part_metrics: PartitionMetrics,
1402    read_type: &'static str,
1403    ranges: SmallVec<[FileRange; 2]>,
1404    mut per_file_metrics: Option<HashMap<RegionFileId, FileScanMetrics>>,
1405) -> impl Stream<Item = Result<Batch>> {
1406    try_stream! {
1407        let fetch_metrics = if part_metrics.explain_verbose() {
1408            Some(Arc::new(ParquetFetchMetrics::default()))
1409        } else {
1410            None
1411        };
1412        let reader_metrics = &mut ReaderMetrics {
1413            fetch_metrics: fetch_metrics.clone(),
1414            ..Default::default()
1415        };
1416        for range in ranges {
1417            let build_reader_start = Instant::now();
1418            let Some(reader) = range.reader(stream_ctx.input.series_row_selector, fetch_metrics.as_deref()).await? else {
1419                continue;
1420            };
1421            let build_cost = build_reader_start.elapsed();
1422            part_metrics.inc_build_reader_cost(build_cost);
1423            let compat_batch = range.compat_batch();
1424            let mut source = Source::PruneReader(reader);
1425            while let Some(mut batch) = source.next_batch().await? {
1426                if let Some(compact_batch) = compat_batch {
1427                    batch = compact_batch.as_primary_key().unwrap().compat_batch(batch)?;
1428                }
1429                yield batch;
1430            }
1431            if let Source::PruneReader(reader) = source {
1432                let prune_metrics = reader.metrics();
1433
1434                // Update per-file metrics if tracking is enabled
1435                if let Some(file_metrics_map) = per_file_metrics.as_mut() {
1436                    let file_id = range.file_handle().file_id();
1437                    let file_metrics = file_metrics_map
1438                        .entry(file_id)
1439                        .or_insert_with(FileScanMetrics::default);
1440
1441                    file_metrics.num_ranges += 1;
1442                    file_metrics.num_rows += prune_metrics.num_rows;
1443                    file_metrics.build_reader_cost += build_cost;
1444                    file_metrics.scan_cost += prune_metrics.scan_cost;
1445                }
1446
1447                reader_metrics.merge_from(&prune_metrics);
1448            }
1449        }
1450
1451        // Reports metrics.
1452        reader_metrics.observe_rows(read_type);
1453        reader_metrics.filter_metrics.observe();
1454        part_metrics.merge_reader_metrics(reader_metrics, per_file_metrics.as_ref());
1455    }
1456}
1457
1458/// Build the stream of scanning the input [`FileRange`]s using flat reader that returns RecordBatch.
1459#[tracing::instrument(
1460    skip_all,
1461    fields(read_type = read_type, range_count = ranges.len())
1462)]
1463pub fn build_flat_file_range_scan_stream(
1464    _stream_ctx: Arc<StreamContext>,
1465    part_metrics: PartitionMetrics,
1466    read_type: &'static str,
1467    ranges: SmallVec<[FileRange; 2]>,
1468    mut per_file_metrics: Option<HashMap<RegionFileId, FileScanMetrics>>,
1469) -> impl Stream<Item = Result<RecordBatch>> {
1470    try_stream! {
1471        let fetch_metrics = if part_metrics.explain_verbose() {
1472            Some(Arc::new(ParquetFetchMetrics::default()))
1473        } else {
1474            None
1475        };
1476        let reader_metrics = &mut ReaderMetrics {
1477            fetch_metrics: fetch_metrics.clone(),
1478            ..Default::default()
1479        };
1480        for range in ranges {
1481            let build_reader_start = Instant::now();
1482            let Some(mut reader) = range.flat_reader(_stream_ctx.input.series_row_selector, fetch_metrics.as_deref()).await? else{continue};
1483            let build_cost = build_reader_start.elapsed();
1484            part_metrics.inc_build_reader_cost(build_cost);
1485
1486            let may_compat = range
1487                .compat_batch()
1488                .map(|compat| {
1489                    compat.as_flat().context(UnexpectedSnafu {
1490                        reason: "Invalid compat for flat format",
1491                    })
1492                })
1493                .transpose()?;
1494
1495            let mapper = range.compaction_projection_mapper();
1496            while let Some(record_batch) = reader.next_batch()? {
1497                let record_batch = if let Some(mapper) = mapper {
1498                    let batch = mapper.project(record_batch)?;
1499                    batch
1500                } else {
1501                    record_batch
1502                };
1503
1504                if let Some(flat_compat) = may_compat {
1505                    let batch = flat_compat.compat(record_batch)?;
1506                    yield batch;
1507                } else {
1508                    yield record_batch;
1509                }
1510            }
1511
1512            let prune_metrics = reader.metrics();
1513
1514            // Update per-file metrics if tracking is enabled
1515            if let Some(file_metrics_map) = per_file_metrics.as_mut() {
1516                let file_id = range.file_handle().file_id();
1517                let file_metrics = file_metrics_map
1518                    .entry(file_id)
1519                    .or_insert_with(FileScanMetrics::default);
1520
1521                file_metrics.num_ranges += 1;
1522                file_metrics.num_rows += prune_metrics.num_rows;
1523                file_metrics.build_reader_cost += build_cost;
1524                file_metrics.scan_cost += prune_metrics.scan_cost;
1525            }
1526
1527            reader_metrics.merge_from(&prune_metrics);
1528        }
1529
1530        // Reports metrics.
1531        reader_metrics.observe_rows(read_type);
1532        reader_metrics.filter_metrics.observe();
1533        part_metrics.merge_reader_metrics(reader_metrics, per_file_metrics.as_ref());
1534    }
1535}
1536
/// Build the stream of scanning the extension range denoted by the [`RowGroupIndex`].
///
/// # Errors
/// A failure of the extension reader is wrapped into `ScanExternalRange`.
#[cfg(feature = "enterprise")]
pub(crate) async fn scan_extension_range(
    context: Arc<StreamContext>,
    index: RowGroupIndex,
    partition_metrics: PartitionMetrics,
) -> Result<BoxedBatchStream> {
    use snafu::ResultExt;

    let extension_range = context.input.extension_range(index.index);
    let extension_reader = extension_range.reader(context.as_ref());
    // Bound to a local so the read future is dropped before the reader.
    let batch_stream = extension_reader
        .read(context, partition_metrics, index)
        .await
        .context(crate::error::ScanExternalRangeSnafu)?;
    Ok(batch_stream)
}
1554
1555pub(crate) async fn maybe_scan_other_ranges(
1556    context: &Arc<StreamContext>,
1557    index: RowGroupIndex,
1558    metrics: &PartitionMetrics,
1559) -> Result<BoxedBatchStream> {
1560    #[cfg(feature = "enterprise")]
1561    {
1562        scan_extension_range(context.clone(), index, metrics.clone()).await
1563    }
1564
1565    #[cfg(not(feature = "enterprise"))]
1566    {
1567        let _ = context;
1568        let _ = index;
1569        let _ = metrics;
1570
1571        crate::error::UnexpectedSnafu {
1572            reason: "no other ranges scannable",
1573        }
1574        .fail()
1575    }
1576}
1577
/// Build the stream of scanning the extension range in flat format denoted by the [`RowGroupIndex`].
///
/// # Errors
/// A failure of the extension reader is wrapped into `ScanExternalRange`.
#[cfg(feature = "enterprise")]
pub(crate) async fn scan_flat_extension_range(
    context: Arc<StreamContext>,
    index: RowGroupIndex,
    partition_metrics: PartitionMetrics,
) -> Result<BoxedRecordBatchStream> {
    use snafu::ResultExt;

    let extension_range = context.input.extension_range(index.index);
    let flat_reader = extension_range.flat_reader(context.as_ref());
    // Bound to a local so the read future is dropped before the reader.
    let record_batch_stream = flat_reader
        .read(context, partition_metrics, index)
        .await
        .context(crate::error::ScanExternalRangeSnafu)?;
    Ok(record_batch_stream)
}
1595
1596pub(crate) async fn maybe_scan_flat_other_ranges(
1597    context: &Arc<StreamContext>,
1598    index: RowGroupIndex,
1599    metrics: &PartitionMetrics,
1600) -> Result<BoxedRecordBatchStream> {
1601    #[cfg(feature = "enterprise")]
1602    {
1603        scan_flat_extension_range(context.clone(), index, metrics.clone()).await
1604    }
1605
1606    #[cfg(not(feature = "enterprise"))]
1607    {
1608        let _ = context;
1609        let _ = index;
1610        let _ = metrics;
1611
1612        crate::error::UnexpectedSnafu {
1613            reason: "no other ranges scannable in flat format",
1614        }
1615        .fail()
1616    }
1617}
1618
1619/// A stream wrapper that splits record batches from an inner stream.
1620pub(crate) struct SplitRecordBatchStream<S> {
1621    /// The inner stream that yields record batches.
1622    inner: S,
1623    /// Buffer for split batches.
1624    batches: VecDeque<RecordBatch>,
1625}
1626
1627impl<S> SplitRecordBatchStream<S> {
1628    /// Creates a new splitting stream wrapper.
1629    pub(crate) fn new(inner: S) -> Self {
1630        Self {
1631            inner,
1632            batches: VecDeque::new(),
1633        }
1634    }
1635}
1636
1637impl<S> Stream for SplitRecordBatchStream<S>
1638where
1639    S: Stream<Item = Result<RecordBatch>> + Unpin,
1640{
1641    type Item = Result<RecordBatch>;
1642
1643    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
1644        loop {
1645            // First, check if we have buffered split batches
1646            if let Some(batch) = self.batches.pop_front() {
1647                return Poll::Ready(Some(Ok(batch)));
1648            }
1649
1650            // Poll the inner stream for the next batch
1651            let record_batch = match futures::ready!(Pin::new(&mut self.inner).poll_next(cx)) {
1652                Some(Ok(batch)) => batch,
1653                Some(Err(e)) => return Poll::Ready(Some(Err(e))),
1654                None => return Poll::Ready(None),
1655            };
1656
1657            // Split the batch and buffer the results
1658            split_record_batch(record_batch, &mut self.batches);
1659            // Continue the loop to return the first split batch
1660        }
1661    }
1662}
1663
1664/// Splits the batch by timestamps.
1665///
1666/// # Panics
1667/// Panics if the timestamp array is invalid.
1668pub(crate) fn split_record_batch(record_batch: RecordBatch, batches: &mut VecDeque<RecordBatch>) {
1669    let batch_rows = record_batch.num_rows();
1670    if batch_rows == 0 {
1671        return;
1672    }
1673    if batch_rows < 2 {
1674        batches.push_back(record_batch);
1675        return;
1676    }
1677
1678    let time_index_pos = time_index_column_index(record_batch.num_columns());
1679    let timestamps = record_batch.column(time_index_pos);
1680    let (ts_values, _unit) = timestamp_array_to_primitive(timestamps).unwrap();
1681    let mut offsets = Vec::with_capacity(16);
1682    offsets.push(0);
1683    let values = ts_values.values();
1684    for (i, &value) in values.iter().take(batch_rows - 1).enumerate() {
1685        if value > values[i + 1] {
1686            offsets.push(i + 1);
1687        }
1688    }
1689    offsets.push(values.len());
1690
1691    // Splits the batch by offsets.
1692    for (i, &start) in offsets[..offsets.len() - 1].iter().enumerate() {
1693        let end = offsets[i + 1];
1694        let rows_in_batch = end - start;
1695        batches.push_back(record_batch.slice(start, rows_in_batch));
1696    }
1697}