Skip to main content

query/
metrics.rs

// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
15use std::collections::{BTreeMap, BTreeSet};
16use std::pin::Pin;
17use std::sync::Arc;
18use std::task::{Context, Poll};
19
20use common_recordbatch::adapter::{RecordBatchMetrics, RegionWatermarkEntry};
21use common_recordbatch::{OrderOption, RecordBatch, RecordBatchStream, SendableRecordBatchStream};
22use common_telemetry::warn;
23use datafusion::physical_plan::ExecutionPlan;
24use datatypes::schema::SchemaRef;
25use futures::Stream;
26use futures_util::ready;
27use lazy_static::lazy_static;
28use prometheus::*;
29use session::context::QueryContextRef;
30
31use crate::dist_plan::MergeScanExec;
32use crate::error::Result;
33use crate::options::FlowQueryExtensions;
34
/// Intermediate merge state for one participating region while collecting
/// terminal correctness watermarks across merge-scan sub-stages.
///
/// The state only ever moves towards "less proved": a region starts as either
/// `Proved` or `Unproved` and can degrade to `Unproved` or `Conflict`, but it
/// never regains a proof once one has been lost.
#[derive(Debug, Clone, PartialEq, Eq)]
enum MergeState {
    /// At least one branch reported that this region cannot prove a safe
    /// checkpoint watermark for the current query round.
    Unproved,
    /// All seen branches agree the region can advance safely to this sequence.
    Proved(u64),
    /// Different proved sequences were reported for the same region. The final
    /// result is degraded to `None`, and the collected values are logged.
    Conflict {
        /// Distinct proved watermark candidates reported for the region, in
        /// the order they were first seen.
        watermarks: Vec<u64>,
    },
}
50
// Process-wide Prometheus metrics of the query engine. Each static is
// registered lazily on first access; the `.unwrap()` calls panic if
// registration fails.
lazy_static! {
    /// Timer of different stages in query.
    pub static ref QUERY_STAGE_ELAPSED: HistogramVec = register_histogram_vec!(
        "greptime_query_stage_elapsed",
        "query engine time elapsed during each stage",
        &["stage"],
        // Histogram buckets (presumably seconds — confirm against the call sites).
        vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
    )
    .unwrap();
    /// [`QUERY_STAGE_ELAPSED`] with the `stage` label fixed to `parse_sql`.
    pub static ref PARSE_SQL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
        .with_label_values(&["parse_sql"]);
    /// [`QUERY_STAGE_ELAPSED`] with the `stage` label fixed to `parse_promql`.
    pub static ref PARSE_PROMQL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
        .with_label_values(&["parse_promql"]);
    /// [`QUERY_STAGE_ELAPSED`] with the `stage` label fixed to `optimize_logicalplan`.
    pub static ref OPTIMIZE_LOGICAL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
        .with_label_values(&["optimize_logicalplan"]);
    /// [`QUERY_STAGE_ELAPSED`] with the `stage` label fixed to `optimize_physicalplan`.
    pub static ref OPTIMIZE_PHYSICAL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
        .with_label_values(&["optimize_physicalplan"]);
    /// [`QUERY_STAGE_ELAPSED`] with the `stage` label fixed to `create_physicalplan`.
    pub static ref CREATE_PHYSICAL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
        .with_label_values(&["create_physicalplan"]);
    /// [`QUERY_STAGE_ELAPSED`] with the `stage` label fixed to `execute_plan`.
    pub static ref EXEC_PLAN_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
        .with_label_values(&["execute_plan"]);
    /// [`QUERY_STAGE_ELAPSED`] with the `stage` label fixed to `merge_scan_poll`.
    pub static ref MERGE_SCAN_POLL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
        .with_label_values(&["merge_scan_poll"]);

    /// Histogram registered as `greptime_query_merge_scan_regions`
    /// (presumably the number of regions per merge scan — confirm at the
    /// observation site).
    pub static ref MERGE_SCAN_REGIONS: Histogram = register_histogram!(
        "greptime_query_merge_scan_regions",
        "query merge scan regions"
    )
    .unwrap();
    /// Total number of merge scan errors.
    pub static ref MERGE_SCAN_ERRORS_TOTAL: IntCounter = register_int_counter!(
        "greptime_query_merge_scan_errors_total",
        "query merge scan errors total"
    )
    .unwrap();
    /// Total number of push down fallback errors.
    pub static ref PUSH_DOWN_FALLBACK_ERRORS_TOTAL: IntCounter = register_int_counter!(
        "greptime_push_down_fallback_errors_total",
        "query push down fallback errors total"
    )
    .unwrap();

    /// Current query memory pool usage in bytes.
    pub static ref QUERY_MEMORY_POOL_USAGE_BYTES: IntGauge = register_int_gauge!(
        "greptime_query_memory_pool_usage_bytes",
        "current query memory pool usage in bytes"
    )
    .unwrap();

    /// Total number of query memory allocations rejected.
    pub static ref QUERY_MEMORY_POOL_REJECTED_TOTAL: IntCounter = register_int_counter!(
        "greptime_query_memory_pool_rejected_total",
        "total number of query memory allocations rejected"
    )
    .unwrap();

    /// Remote dynamic filter fanout RPC results, labeled with status.
    pub static ref REMOTE_DYN_FILTER_UPDATE_RPC_TOTAL: IntCounterVec = register_int_counter_vec!(
        "greptime_query_remote_dyn_filter_update_rpc_total",
        "remote dynamic filter fanout RPC results",
        &["status"]
    )
    .unwrap();

    /// Remote dynamic filter fanout payload bytes.
    pub static ref REMOTE_DYN_FILTER_PAYLOAD_BYTES: Histogram = register_histogram!(
        "greptime_query_remote_dyn_filter_payload_bytes",
        "remote dynamic filter fanout payload bytes",
        // Power-of-two buckets from 128 up to 524288 (512 * 1024).
        vec![
            128.0, 256.0, 512.0, 1024.0, 2048.0, 4096.0, 8192.0, 16384.0, 32768.0,
            65536.0, 131072.0, 262144.0, 524288.0,
        ]
    )
    .unwrap();

    /// Remote dynamic filter encode results, labeled with result.
    pub static ref REMOTE_DYN_FILTER_ENCODE_TOTAL: IntCounterVec = register_int_counter_vec!(
        "greptime_query_remote_dyn_filter_encode_total",
        "remote dynamic filter encode results",
        &["result"]
    )
    .unwrap();
}
130
131/// A stream to call the callback once a RecordBatch stream is done.
132pub struct OnDone<F> {
133    stream: SendableRecordBatchStream,
134    callback: Option<F>,
135}
136
137impl<F> OnDone<F> {
138    /// Attaches a `callback` to invoke once the `stream` is terminated.
139    pub fn new(stream: SendableRecordBatchStream, callback: F) -> Self {
140        Self {
141            stream,
142            callback: Some(callback),
143        }
144    }
145}
146
147impl<F: FnOnce() + Unpin> RecordBatchStream for OnDone<F> {
148    fn name(&self) -> &str {
149        self.stream.name()
150    }
151
152    fn schema(&self) -> SchemaRef {
153        self.stream.schema()
154    }
155
156    fn output_ordering(&self) -> Option<&[OrderOption]> {
157        self.stream.output_ordering()
158    }
159
160    fn metrics(&self) -> Option<RecordBatchMetrics> {
161        self.stream.metrics()
162    }
163}
164
165impl<F: FnOnce() + Unpin> Stream for OnDone<F> {
166    type Item = common_recordbatch::error::Result<RecordBatch>;
167
168    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
169        match ready!(Pin::new(&mut self.stream).poll_next(cx)) {
170            Some(rb) => Poll::Ready(Some(rb)),
171            None => {
172                if let Some(callback) = self.callback.take() {
173                    callback();
174                }
175                Poll::Ready(None)
176            }
177        }
178    }
179
180    fn size_hint(&self) -> (usize, Option<usize>) {
181        self.stream.size_hint()
182    }
183}
184
185pub struct RegionWatermarkMetricsStream {
186    stream: SendableRecordBatchStream,
187    plan: Arc<dyn ExecutionPlan>,
188}
189
190impl RegionWatermarkMetricsStream {
191    pub fn new(stream: SendableRecordBatchStream, plan: Arc<dyn ExecutionPlan>) -> Self {
192        Self { stream, plan }
193    }
194}
195
196impl RecordBatchStream for RegionWatermarkMetricsStream {
197    fn name(&self) -> &str {
198        self.stream.name()
199    }
200
201    fn schema(&self) -> SchemaRef {
202        self.stream.schema()
203    }
204
205    fn output_ordering(&self) -> Option<&[OrderOption]> {
206        self.stream.output_ordering()
207    }
208
209    fn metrics(&self) -> Option<RecordBatchMetrics> {
210        let mut metrics = self.stream.metrics()?;
211        let region_watermarks = collect_region_watermarks(self.plan.clone());
212        if !region_watermarks.is_empty() {
213            metrics.region_watermarks = region_watermarks;
214        }
215        Some(metrics)
216    }
217}
218
219impl Stream for RegionWatermarkMetricsStream {
220    type Item = common_recordbatch::error::Result<RecordBatch>;
221
222    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
223        Pin::new(&mut self.stream).poll_next(cx)
224    }
225
226    fn size_hint(&self) -> (usize, Option<usize>) {
227        self.stream.size_hint()
228    }
229}
230
231/// Returns whether terminal region watermark metrics should be collected for the query context.
232pub fn should_collect_region_watermark_from_query_ctx(query_ctx: &QueryContextRef) -> Result<bool> {
233    Ok(
234        FlowQueryExtensions::parse_flow_extensions(&query_ctx.extensions())?
235            .is_some_and(|extensions| extensions.should_collect_region_watermark()),
236    )
237}
238
239/// Attaches terminal region watermark metrics to `stream` when collection is requested.
240pub fn maybe_attach_region_watermark_metrics(
241    stream: SendableRecordBatchStream,
242    plan: Arc<dyn ExecutionPlan>,
243    should_collect_region_watermark: bool,
244) -> SendableRecordBatchStream {
245    if should_collect_region_watermark {
246        Box::pin(RegionWatermarkMetricsStream::new(stream, plan))
247    } else {
248        stream
249    }
250}
251
252pub fn terminal_recordbatch_metrics_from_plan(
253    plan: Arc<dyn ExecutionPlan>,
254) -> Option<RecordBatchMetrics> {
255    let region_watermarks = collect_region_watermarks(plan);
256    if region_watermarks.is_empty() {
257        None
258    } else {
259        Some(RecordBatchMetrics {
260            region_watermarks,
261            ..Default::default()
262        })
263    }
264}
265
266/// Collects terminal record-batch metrics from `plan` only when requested.
267pub fn terminal_recordbatch_metrics_from_plan_if_requested(
268    plan: Option<Arc<dyn ExecutionPlan>>,
269    should_collect_region_watermark: bool,
270) -> Option<RecordBatchMetrics> {
271    if should_collect_region_watermark {
272        plan.and_then(terminal_recordbatch_metrics_from_plan)
273    } else {
274        None
275    }
276}
277
278fn collect_region_watermarks(plan: Arc<dyn ExecutionPlan>) -> Vec<RegionWatermarkEntry> {
279    let mut merged = BTreeMap::<u64, MergeState>::new();
280    let mut stack = vec![plan];
281
282    while let Some(plan) = stack.pop() {
283        if let Some(merge_scan) = plan.as_any().downcast_ref::<MergeScanExec>()
284            && !merge_scan.is_flow_sink_scan()
285        {
286            merge_merge_scan_region_watermarks(
287                &mut merged,
288                merge_scan
289                    .regions()
290                    .iter()
291                    .map(|region_id| region_id.as_u64()),
292                merge_scan.sub_stage_metrics(),
293            );
294        }
295        stack.extend(plan.children().into_iter().cloned());
296    }
297
298    finalize_region_watermarks(merged)
299}
300
301/// Merge a batch of per-region watermark entries into the global merged state.
302///
303/// # Merge strategy: correctness over maximum
304///
305/// Flow checkpoint advancement requires provable watermarks so that incremental
306/// queries never miss rows. This merge uses correctness-first semantics:
307///
308/// | Current state  | New entry       | Result            | Rationale |
309/// |---------------|-----------------|-------------------|-----------|
310/// | Proved(old)   | Proved(same)    | Proved(old)       | Convergent proof, keep |
311/// | Proved(old)   | Proved(diff)    | Conflict([old,diff]) | Ambiguous → degrade to unproved |
312/// | Unproved      | _anything_      | Unproved          | Already unsafe, stays unsafe |
313/// | Conflict{..}  | Proved(seq)     | Conflict[...seq]  | Record for diagnostics |
314///
315/// Using `max(old, new)` would be incorrect because it could advance a
316/// checkpoint past rows that a competing MergeScan sub-stage has not yet
317/// scanned, causing Flow to skip data.
318fn merge_region_watermark_entries(
319    merged: &mut BTreeMap<u64, MergeState>,
320    entries: impl IntoIterator<Item = RegionWatermarkEntry>,
321) {
322    for entry in entries {
323        merged
324            .entry(entry.region_id)
325            .and_modify(|existing| match entry.watermark {
326                None => match existing {
327                    MergeState::Proved(_) => {
328                        *existing = MergeState::Unproved;
329                    }
330                    MergeState::Unproved | MergeState::Conflict { .. } => {}
331                },
332                Some(seq) => match existing {
333                    MergeState::Unproved => {}
334                    MergeState::Proved(existing_seq) if *existing_seq == seq => {}
335                    MergeState::Proved(existing_seq) => {
336                        let old_seq = *existing_seq;
337                        *existing = MergeState::Conflict {
338                            watermarks: vec![old_seq, seq],
339                        };
340                    }
341                    MergeState::Conflict { watermarks } => {
342                        if !watermarks.contains(&seq) {
343                            watermarks.push(seq);
344                        }
345                    }
346                },
347            })
348            .or_insert(match entry.watermark {
349                Some(seq) => MergeState::Proved(seq),
350                None => MergeState::Unproved,
351            });
352    }
353}
354
355fn merge_merge_scan_region_watermarks(
356    merged: &mut BTreeMap<u64, MergeState>,
357    regions: impl IntoIterator<Item = u64>,
358    sub_stage_metrics: impl IntoIterator<Item = RecordBatchMetrics>,
359) {
360    let regions = regions.into_iter().collect::<Vec<_>>();
361    let mut proved_or_unproved_regions = BTreeSet::new();
362    for metrics in sub_stage_metrics {
363        proved_or_unproved_regions.extend(
364            metrics
365                .region_watermarks
366                .iter()
367                .map(|entry| entry.region_id),
368        );
369        merge_region_watermark_entries(merged, metrics.region_watermarks);
370    }
371
372    // Regions listed by a MergeScanExec participated even when no sub-stage can
373    // prove a watermark. Merge missing per-scan region entries as explicit
374    // `None` entries so an unproved participating branch vetoes any proof from
375    // another branch for the same region.
376    merge_region_watermark_entries(
377        merged,
378        regions
379            .into_iter()
380            .filter(|region_id| !proved_or_unproved_regions.contains(region_id))
381            .map(|region_id| RegionWatermarkEntry {
382                region_id,
383                watermark: None,
384            }),
385    );
386}
387
388fn finalize_region_watermarks(merged: BTreeMap<u64, MergeState>) -> Vec<RegionWatermarkEntry> {
389    merged
390        .into_iter()
391        .map(|(region_id, state)| RegionWatermarkEntry {
392            region_id,
393            watermark: match state {
394                MergeState::Unproved => None,
395                MergeState::Proved(seq) => Some(seq),
396                MergeState::Conflict { watermarks } => {
397                    warn!(
398                        "Conflicting proved watermarks for region {}: {:?}; degrading to unproved",
399                        region_id, watermarks
400                    );
401                    None
402                }
403            },
404        })
405        .collect()
406}
407
// Unit tests for the region watermark collection and merge helpers.
#[cfg(test)]
mod tests {
    use std::collections::{BTreeMap, BTreeSet};
    use std::sync::Arc;

    use api::v1::region::{RemoteDynFilterUnregister, RemoteDynFilterUpdate};
    use async_trait::async_trait;
    use datafusion::arrow::datatypes::Schema as ArrowSchema;
    use datafusion::execution::SessionStateBuilder;
    use datafusion::physical_plan::empty::EmptyExec;
    use datafusion_expr::LogicalPlanBuilder;
    use session::ReadPreference;
    use session::context::QueryContextBuilder;
    use store_api::storage::RegionId;
    use table::table_name::TableName;

    use super::*;
    use crate::dist_plan::RemoteDynFilterProducerId;
    use crate::options::{FLOW_RETURN_REGION_SEQ, FLOW_SINK_TABLE_ID};
    use crate::region_query::RegionQueryHandler;

    /// Region query handler whose methods must never be reached: these tests
    /// only build plans and never execute them.
    struct NoopRegionQueryHandler;

    #[async_trait]
    impl RegionQueryHandler for NoopRegionQueryHandler {
        async fn do_get(
            &self,
            _read_preference: ReadPreference,
            _request: common_query::request::QueryRequest,
        ) -> Result<SendableRecordBatchStream> {
            unreachable!("metrics tests should not execute remote queries")
        }

        async fn handle_remote_dyn_filter_update(
            &self,
            _region_id: RegionId,
            _query_id: String,
            _update: RemoteDynFilterUpdate,
        ) -> Result<()> {
            unreachable!("metrics tests should not send remote dyn filter updates")
        }

        async fn handle_remote_dyn_filter_unregister(
            &self,
            _region_id: RegionId,
            _query_id: String,
            _unregister: RemoteDynFilterUnregister,
        ) -> Result<()> {
            unreachable!("metrics tests should not send remote dyn filter unregisters")
        }
    }

    /// Builds default metrics whose `region_watermarks` are the given
    /// `(region_id, watermark)` pairs, in order.
    fn metrics_with_region_watermarks(entries: &[(u64, Option<u64>)]) -> RecordBatchMetrics {
        RecordBatchMetrics {
            region_watermarks: entries
                .iter()
                .map(|(region_id, watermark)| RegionWatermarkEntry {
                    region_id: *region_id,
                    watermark: *watermark,
                })
                .collect(),
            ..Default::default()
        }
    }

    /// Builds a `MergeScanExec` over the single region `(table_id, 0)` with an
    /// empty logical plan and schema, backed by [`NoopRegionQueryHandler`].
    fn test_merge_scan_exec(table_id: u32, query_ctx: QueryContextRef) -> Arc<dyn ExecutionPlan> {
        let session_state = SessionStateBuilder::new().with_default_features().build();
        let plan = LogicalPlanBuilder::empty(false).build().unwrap();
        let schema = ArrowSchema::empty();

        Arc::new(
            MergeScanExec::new(
                &session_state,
                TableName::new("greptime", "public", "test"),
                vec![RegionId::new(table_id, 0)],
                plan,
                &schema,
                Arc::new(NoopRegionQueryHandler),
                query_ctx,
                1,
                BTreeMap::<String, BTreeSet<datafusion_common::Column>>::new(),
                Some(RemoteDynFilterProducerId::new(0)),
                false,
            )
            .unwrap(),
        )
    }

    /// Builds a query context with the `FLOW_RETURN_REGION_SEQ` extension set
    /// to `"true"` and `FLOW_SINK_TABLE_ID` set to `sink_table_id`.
    fn flow_query_ctx_with_sink_table_id(sink_table_id: u32) -> QueryContextRef {
        Arc::new(
            QueryContextBuilder::default()
                .set_extension(FLOW_RETURN_REGION_SEQ.to_string(), "true".to_string())
                .set_extension(FLOW_SINK_TABLE_ID.to_string(), sink_table_id.to_string())
                .build(),
        )
    }

    // A plan without any merge scan yields no terminal metrics at all.
    #[test]
    fn terminal_metrics_returns_none_without_merge_scan() {
        let plan: Arc<dyn ExecutionPlan> = Arc::new(EmptyExec::new(Arc::new(ArrowSchema::empty())));
        assert!(terminal_recordbatch_metrics_from_plan(plan).is_none());
    }

    // The merge scan reads table 42, which is also the configured sink table
    // id, so its regions are expected to be skipped.
    #[test]
    fn terminal_metrics_skip_flow_sink_merge_scan_regions() {
        let query_ctx = flow_query_ctx_with_sink_table_id(42);
        let plan = test_merge_scan_exec(42, query_ctx);

        assert!(terminal_recordbatch_metrics_from_plan(plan).is_none());
    }

    // The merge scan reads table 43 while the sink table id is 42, so its
    // region is kept and, with no sub-stage metrics, reported as unproved.
    #[test]
    fn terminal_metrics_keep_source_merge_scan_regions_with_sink_extension() {
        let query_ctx = flow_query_ctx_with_sink_table_id(42);
        let plan = test_merge_scan_exec(43, query_ctx);

        assert_eq!(
            terminal_recordbatch_metrics_from_plan(plan)
                .unwrap()
                .region_watermarks,
            vec![RegionWatermarkEntry {
                region_id: RegionId::new(43, 0).as_u64(),
                watermark: None,
            }]
        );
    }

    // Regions listed by the scan but absent from all sub-stage metrics become
    // explicit unproved entries.
    #[test]
    fn merge_merge_scan_region_watermarks_marks_missing_watermarks_unproved() {
        let mut merged = BTreeMap::new();

        merge_merge_scan_region_watermarks(&mut merged, [1, 2], std::iter::empty());

        assert_eq!(
            finalize_region_watermarks(merged),
            vec![
                RegionWatermarkEntry {
                    region_id: 1,
                    watermark: None,
                },
                RegionWatermarkEntry {
                    region_id: 2,
                    watermark: None,
                },
            ]
        );
    }

    // Two sub-stages proving the same sequence keep the proof.
    #[test]
    fn merge_merge_scan_region_watermarks_keeps_matching_proved_values() {
        let mut merged = BTreeMap::new();

        merge_merge_scan_region_watermarks(
            &mut merged,
            [42],
            [
                metrics_with_region_watermarks(&[(42, Some(7))]),
                metrics_with_region_watermarks(&[(42, Some(7))]),
            ],
        );

        assert_eq!(
            finalize_region_watermarks(merged),
            vec![RegionWatermarkEntry {
                region_id: 42,
                watermark: Some(7),
            }]
        );
    }

    // Two sub-stages proving different sequences degrade the region to `None`.
    #[test]
    fn merge_merge_scan_region_watermarks_degrades_conflicting_proved_values() {
        let mut merged = BTreeMap::new();

        merge_merge_scan_region_watermarks(
            &mut merged,
            [7],
            [
                metrics_with_region_watermarks(&[(7, Some(11))]),
                metrics_with_region_watermarks(&[(7, Some(13))]),
            ],
        );

        assert_eq!(
            finalize_region_watermarks(merged),
            vec![RegionWatermarkEntry {
                region_id: 7,
                watermark: None,
            }]
        );
    }

    // An explicit `None` entry arriving after a proof removes the proof.
    #[test]
    fn merge_merge_scan_region_watermarks_none_vetoes_proved_value() {
        let mut merged = BTreeMap::new();

        merge_merge_scan_region_watermarks(
            &mut merged,
            [9],
            [
                metrics_with_region_watermarks(&[(9, Some(21))]),
                metrics_with_region_watermarks(&[(9, None)]),
            ],
        );

        assert_eq!(
            finalize_region_watermarks(merged),
            vec![RegionWatermarkEntry {
                region_id: 9,
                watermark: None,
            }]
        );
    }

    // Same veto as above, with the `None` entry arriving before the proof.
    #[test]
    fn merge_merge_scan_region_watermarks_none_vetoes_proved_value_regardless_of_order() {
        let mut merged = BTreeMap::new();

        merge_merge_scan_region_watermarks(
            &mut merged,
            [9],
            [
                metrics_with_region_watermarks(&[(9, None)]),
                metrics_with_region_watermarks(&[(9, Some(21))]),
            ],
        );

        assert_eq!(
            finalize_region_watermarks(merged),
            vec![RegionWatermarkEntry {
                region_id: 9,
                watermark: None,
            }]
        );
    }

    // A second merge scan over the same region that reports nothing vetoes
    // the proof contributed by the first merge scan.
    #[test]
    fn merge_merge_scan_region_watermarks_missing_branch_vetoes_proved_value() {
        let mut merged = BTreeMap::new();

        merge_merge_scan_region_watermarks(
            &mut merged,
            [9],
            [metrics_with_region_watermarks(&[(9, Some(21))])],
        );
        merge_merge_scan_region_watermarks(&mut merged, [9], std::iter::empty());

        assert_eq!(
            finalize_region_watermarks(merged),
            vec![RegionWatermarkEntry {
                region_id: 9,
                watermark: None,
            }]
        );
    }

    // Same veto as above, with the silent merge scan merged first.
    #[test]
    fn merge_merge_scan_region_watermarks_missing_branch_vetoes_proved_value_regardless_of_order() {
        let mut merged = BTreeMap::new();

        merge_merge_scan_region_watermarks(&mut merged, [9], std::iter::empty());
        merge_merge_scan_region_watermarks(
            &mut merged,
            [9],
            [metrics_with_region_watermarks(&[(9, Some(21))])],
        );

        assert_eq!(
            finalize_region_watermarks(merged),
            vec![RegionWatermarkEntry {
                region_id: 9,
                watermark: None,
            }]
        );
    }
}