Skip to main content

query/
metrics.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::collections::BTreeMap;
16use std::pin::Pin;
17use std::sync::Arc;
18use std::task::{Context, Poll};
19
20use common_recordbatch::adapter::{RecordBatchMetrics, RegionWatermarkEntry};
21use common_recordbatch::{OrderOption, RecordBatch, RecordBatchStream, SendableRecordBatchStream};
22use common_telemetry::warn;
23use datafusion::physical_plan::ExecutionPlan;
24use datatypes::schema::SchemaRef;
25use futures::Stream;
26use futures_util::ready;
27use lazy_static::lazy_static;
28use prometheus::*;
29use session::context::QueryContextRef;
30
31use crate::dist_plan::MergeScanExec;
32use crate::error::Result;
33use crate::options::FlowQueryExtensions;
34
/// Intermediate merge state for one participating region while collecting
/// terminal correctness watermarks across merge-scan sub-stages.
///
/// `Debug`, `Clone`, `PartialEq` and `Eq` are derived so that intermediate
/// states can be printed with `{:?}` and compared directly in assertions.
#[derive(Debug, Clone, PartialEq, Eq)]
enum MergeState {
    /// The region participated, but no explicit watermark result has been seen
    /// yet for this merge.
    Participated,
    /// At least one branch reported that this region cannot prove a safe
    /// checkpoint watermark for the current query round.
    Unproved,
    /// All seen branches agree the region can advance safely to this sequence.
    Proved(u64),
    /// Different proved sequences were reported for the same region. The final
    /// result is degraded to `None`, and the collected values are logged.
    Conflict {
        /// Distinct proved watermark candidates reported for the region, in
        /// the order they were first seen.
        watermarks: Vec<u64>,
    },
}
53
54lazy_static! {
55    /// Timer of different stages in query.
56    pub static ref QUERY_STAGE_ELAPSED: HistogramVec = register_histogram_vec!(
57        "greptime_query_stage_elapsed",
58        "query engine time elapsed during each stage",
59        &["stage"],
60        vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
61    )
62    .unwrap();
63    pub static ref PARSE_SQL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
64        .with_label_values(&["parse_sql"]);
65    pub static ref PARSE_PROMQL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
66        .with_label_values(&["parse_promql"]);
67    pub static ref OPTIMIZE_LOGICAL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
68        .with_label_values(&["optimize_logicalplan"]);
69    pub static ref OPTIMIZE_PHYSICAL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
70        .with_label_values(&["optimize_physicalplan"]);
71    pub static ref CREATE_PHYSICAL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
72        .with_label_values(&["create_physicalplan"]);
73    pub static ref EXEC_PLAN_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
74        .with_label_values(&["execute_plan"]);
75    pub static ref MERGE_SCAN_POLL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
76        .with_label_values(&["merge_scan_poll"]);
77
78    pub static ref MERGE_SCAN_REGIONS: Histogram = register_histogram!(
79        "greptime_query_merge_scan_regions",
80        "query merge scan regions"
81    )
82    .unwrap();
83    pub static ref MERGE_SCAN_ERRORS_TOTAL: IntCounter = register_int_counter!(
84        "greptime_query_merge_scan_errors_total",
85        "query merge scan errors total"
86    )
87    .unwrap();
88    pub static ref PUSH_DOWN_FALLBACK_ERRORS_TOTAL: IntCounter = register_int_counter!(
89        "greptime_push_down_fallback_errors_total",
90        "query push down fallback errors total"
91    )
92    .unwrap();
93
94    pub static ref QUERY_MEMORY_POOL_USAGE_BYTES: IntGauge = register_int_gauge!(
95        "greptime_query_memory_pool_usage_bytes",
96        "current query memory pool usage in bytes"
97    )
98    .unwrap();
99
100    pub static ref QUERY_MEMORY_POOL_REJECTED_TOTAL: IntCounter = register_int_counter!(
101        "greptime_query_memory_pool_rejected_total",
102        "total number of query memory allocations rejected"
103    )
104    .unwrap();
105}
106
107/// A stream to call the callback once a RecordBatch stream is done.
108pub struct OnDone<F> {
109    stream: SendableRecordBatchStream,
110    callback: Option<F>,
111}
112
113impl<F> OnDone<F> {
114    /// Attaches a `callback` to invoke once the `stream` is terminated.
115    pub fn new(stream: SendableRecordBatchStream, callback: F) -> Self {
116        Self {
117            stream,
118            callback: Some(callback),
119        }
120    }
121}
122
123impl<F: FnOnce() + Unpin> RecordBatchStream for OnDone<F> {
124    fn name(&self) -> &str {
125        self.stream.name()
126    }
127
128    fn schema(&self) -> SchemaRef {
129        self.stream.schema()
130    }
131
132    fn output_ordering(&self) -> Option<&[OrderOption]> {
133        self.stream.output_ordering()
134    }
135
136    fn metrics(&self) -> Option<RecordBatchMetrics> {
137        self.stream.metrics()
138    }
139}
140
141impl<F: FnOnce() + Unpin> Stream for OnDone<F> {
142    type Item = common_recordbatch::error::Result<RecordBatch>;
143
144    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
145        match ready!(Pin::new(&mut self.stream).poll_next(cx)) {
146            Some(rb) => Poll::Ready(Some(rb)),
147            None => {
148                if let Some(callback) = self.callback.take() {
149                    callback();
150                }
151                Poll::Ready(None)
152            }
153        }
154    }
155
156    fn size_hint(&self) -> (usize, Option<usize>) {
157        self.stream.size_hint()
158    }
159}
160
161pub struct RegionWatermarkMetricsStream {
162    stream: SendableRecordBatchStream,
163    plan: Arc<dyn ExecutionPlan>,
164}
165
166impl RegionWatermarkMetricsStream {
167    pub fn new(stream: SendableRecordBatchStream, plan: Arc<dyn ExecutionPlan>) -> Self {
168        Self { stream, plan }
169    }
170}
171
172impl RecordBatchStream for RegionWatermarkMetricsStream {
173    fn name(&self) -> &str {
174        self.stream.name()
175    }
176
177    fn schema(&self) -> SchemaRef {
178        self.stream.schema()
179    }
180
181    fn output_ordering(&self) -> Option<&[OrderOption]> {
182        self.stream.output_ordering()
183    }
184
185    fn metrics(&self) -> Option<RecordBatchMetrics> {
186        let mut metrics = self.stream.metrics()?;
187        let region_watermarks = collect_region_watermarks(self.plan.clone());
188        if !region_watermarks.is_empty() {
189            metrics.region_watermarks = region_watermarks;
190        }
191        Some(metrics)
192    }
193}
194
195impl Stream for RegionWatermarkMetricsStream {
196    type Item = common_recordbatch::error::Result<RecordBatch>;
197
198    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
199        Pin::new(&mut self.stream).poll_next(cx)
200    }
201
202    fn size_hint(&self) -> (usize, Option<usize>) {
203        self.stream.size_hint()
204    }
205}
206
207/// Returns whether terminal region watermark metrics should be collected for the query context.
208pub fn should_collect_region_watermark_from_query_ctx(query_ctx: &QueryContextRef) -> Result<bool> {
209    Ok(
210        FlowQueryExtensions::parse_flow_extensions(&query_ctx.extensions())?
211            .is_some_and(|extensions| extensions.should_collect_region_watermark()),
212    )
213}
214
215/// Attaches terminal region watermark metrics to `stream` when collection is requested.
216pub fn maybe_attach_region_watermark_metrics(
217    stream: SendableRecordBatchStream,
218    plan: Arc<dyn ExecutionPlan>,
219    should_collect_region_watermark: bool,
220) -> SendableRecordBatchStream {
221    if should_collect_region_watermark {
222        Box::pin(RegionWatermarkMetricsStream::new(stream, plan))
223    } else {
224        stream
225    }
226}
227
228pub fn terminal_recordbatch_metrics_from_plan(
229    plan: Arc<dyn ExecutionPlan>,
230) -> Option<RecordBatchMetrics> {
231    let region_watermarks = collect_region_watermarks(plan);
232    if region_watermarks.is_empty() {
233        None
234    } else {
235        Some(RecordBatchMetrics {
236            region_watermarks,
237            ..Default::default()
238        })
239    }
240}
241
242/// Collects terminal record-batch metrics from `plan` only when requested.
243pub fn terminal_recordbatch_metrics_from_plan_if_requested(
244    plan: Option<Arc<dyn ExecutionPlan>>,
245    should_collect_region_watermark: bool,
246) -> Option<RecordBatchMetrics> {
247    if should_collect_region_watermark {
248        plan.and_then(terminal_recordbatch_metrics_from_plan)
249    } else {
250        None
251    }
252}
253
254fn collect_region_watermarks(plan: Arc<dyn ExecutionPlan>) -> Vec<RegionWatermarkEntry> {
255    let mut merged = BTreeMap::<u64, MergeState>::new();
256    let mut stack = vec![plan];
257
258    while let Some(plan) = stack.pop() {
259        if let Some(merge_scan) = plan.as_any().downcast_ref::<MergeScanExec>() {
260            merge_merge_scan_region_watermarks(
261                &mut merged,
262                merge_scan
263                    .regions()
264                    .iter()
265                    .map(|region_id| region_id.as_u64()),
266                merge_scan.sub_stage_metrics(),
267            );
268        }
269        stack.extend(plan.children().into_iter().cloned());
270    }
271
272    finalize_region_watermarks(merged)
273}
274
275/// Merge a batch of per-region watermark entries into the global merged state.
276///
277/// # Merge strategy: correctness over maximum
278///
279/// Flow checkpoint advancement requires provable watermarks so that incremental
280/// queries never miss rows. This merge uses correctness-first semantics:
281///
282/// | Current state  | New entry       | Result            | Rationale |
283/// |---------------|-----------------|-------------------|-----------|
284/// | Participated  | Proved(seq)     | Proved(seq)       | First proof for the region |
285/// | Participated  | Unproved         | Unproved          | One branch cannot prove → region is unsafe |
286/// | Proved(old)   | Proved(same)    | Proved(old)       | Convergent proof, keep |
287/// | Proved(old)   | Proved(diff)    | Conflict([old,diff]) | Ambiguous → degrade to unproved |
288/// | Unproved      | _anything_      | Unproved          | Already unsafe, stays unsafe |
289/// | Conflict{..}  | Proved(seq)     | Conflict[...seq]  | Record for diagnostics |
290///
291/// Using `max(old, new)` would be incorrect because it could advance a
292/// checkpoint past rows that a competing MergeScan sub-stage has not yet
293/// scanned, causing Flow to skip data.
294fn merge_region_watermark_entries(
295    merged: &mut BTreeMap<u64, MergeState>,
296    entries: impl IntoIterator<Item = RegionWatermarkEntry>,
297) {
298    for entry in entries {
299        merged
300            .entry(entry.region_id)
301            .and_modify(|existing| match entry.watermark {
302                None => match existing {
303                    MergeState::Participated | MergeState::Proved(_) => {
304                        *existing = MergeState::Unproved;
305                    }
306                    MergeState::Unproved | MergeState::Conflict { .. } => {}
307                },
308                Some(seq) => match existing {
309                    MergeState::Participated => {
310                        *existing = MergeState::Proved(seq);
311                    }
312                    MergeState::Unproved => {}
313                    MergeState::Proved(existing_seq) if *existing_seq == seq => {}
314                    MergeState::Proved(existing_seq) => {
315                        let old_seq = *existing_seq;
316                        *existing = MergeState::Conflict {
317                            watermarks: vec![old_seq, seq],
318                        };
319                    }
320                    MergeState::Conflict { watermarks } => {
321                        if !watermarks.contains(&seq) {
322                            watermarks.push(seq);
323                        }
324                    }
325                },
326            })
327            .or_insert(match entry.watermark {
328                Some(seq) => MergeState::Proved(seq),
329                None => MergeState::Unproved,
330            });
331    }
332}
333
334fn merge_merge_scan_region_watermarks(
335    merged: &mut BTreeMap<u64, MergeState>,
336    regions: impl IntoIterator<Item = u64>,
337    sub_stage_metrics: impl IntoIterator<Item = RecordBatchMetrics>,
338) {
339    // Regions listed by MergeScanExec participated even when no sub-stage can
340    // prove a watermark. Keep them as explicit `None` entries so callers can
341    // distinguish unproved participation from non-participation.
342    for region_id in regions {
343        merged.entry(region_id).or_insert(MergeState::Participated);
344    }
345
346    for metrics in sub_stage_metrics {
347        merge_region_watermark_entries(merged, metrics.region_watermarks);
348    }
349}
350
351fn finalize_region_watermarks(merged: BTreeMap<u64, MergeState>) -> Vec<RegionWatermarkEntry> {
352    merged
353        .into_iter()
354        .map(|(region_id, state)| RegionWatermarkEntry {
355            region_id,
356            watermark: match state {
357                MergeState::Participated => None,
358                MergeState::Unproved => None,
359                MergeState::Proved(seq) => Some(seq),
360                MergeState::Conflict { watermarks } => {
361                    warn!(
362                        "Conflicting proved watermarks for region {}: {:?}; degrading to unproved",
363                        region_id, watermarks
364                    );
365                    None
366                }
367            },
368        })
369        .collect()
370}
371
#[cfg(test)]
mod tests {
    use datafusion::arrow::datatypes::Schema as ArrowSchema;
    use datafusion::physical_plan::empty::EmptyExec;

    use super::*;

    /// Shorthand for building an expected [`RegionWatermarkEntry`].
    fn entry(region_id: u64, watermark: Option<u64>) -> RegionWatermarkEntry {
        RegionWatermarkEntry {
            region_id,
            watermark,
        }
    }

    /// Builds sub-stage metrics that carry only the given region watermarks.
    fn metrics_with_region_watermarks(entries: &[(u64, Option<u64>)]) -> RecordBatchMetrics {
        RecordBatchMetrics {
            region_watermarks: entries
                .iter()
                .map(|&(region_id, watermark)| entry(region_id, watermark))
                .collect(),
            ..Default::default()
        }
    }

    /// Merges one merge scan into an empty state and finalizes the result.
    fn merge_and_finalize(
        regions: impl IntoIterator<Item = u64>,
        sub_stage_metrics: impl IntoIterator<Item = RecordBatchMetrics>,
    ) -> Vec<RegionWatermarkEntry> {
        let mut merged = BTreeMap::new();
        merge_merge_scan_region_watermarks(&mut merged, regions, sub_stage_metrics);
        finalize_region_watermarks(merged)
    }

    #[test]
    fn terminal_metrics_returns_none_without_merge_scan() {
        let schema = Arc::new(ArrowSchema::empty());
        let plan: Arc<dyn ExecutionPlan> = Arc::new(EmptyExec::new(schema));

        assert!(terminal_recordbatch_metrics_from_plan(plan).is_none());
    }

    #[test]
    fn merge_merge_scan_region_watermarks_marks_missing_watermarks_unproved() {
        let finalized = merge_and_finalize([1, 2], std::iter::empty());

        assert_eq!(finalized, vec![entry(1, None), entry(2, None)]);
    }

    #[test]
    fn merge_merge_scan_region_watermarks_keeps_matching_proved_values() {
        let finalized = merge_and_finalize(
            [42],
            [
                metrics_with_region_watermarks(&[(42, Some(7))]),
                metrics_with_region_watermarks(&[(42, Some(7))]),
            ],
        );

        assert_eq!(finalized, vec![entry(42, Some(7))]);
    }

    #[test]
    fn merge_merge_scan_region_watermarks_degrades_conflicting_proved_values() {
        let finalized = merge_and_finalize(
            [7],
            [
                metrics_with_region_watermarks(&[(7, Some(11))]),
                metrics_with_region_watermarks(&[(7, Some(13))]),
            ],
        );

        assert_eq!(finalized, vec![entry(7, None)]);
    }

    #[test]
    fn merge_merge_scan_region_watermarks_none_vetoes_proved_value() {
        let finalized = merge_and_finalize(
            [9],
            [
                metrics_with_region_watermarks(&[(9, Some(21))]),
                metrics_with_region_watermarks(&[(9, None)]),
            ],
        );

        assert_eq!(finalized, vec![entry(9, None)]);
    }

    #[test]
    fn merge_merge_scan_region_watermarks_none_vetoes_proved_value_regardless_of_order() {
        let finalized = merge_and_finalize(
            [9],
            [
                metrics_with_region_watermarks(&[(9, None)]),
                metrics_with_region_watermarks(&[(9, Some(21))]),
            ],
        );

        assert_eq!(finalized, vec![entry(9, None)]);
    }
}