1use std::collections::{BTreeMap, BTreeSet};
16use std::pin::Pin;
17use std::sync::Arc;
18use std::task::{Context, Poll};
19
20use common_recordbatch::adapter::{RecordBatchMetrics, RegionWatermarkEntry};
21use common_recordbatch::{OrderOption, RecordBatch, RecordBatchStream, SendableRecordBatchStream};
22use common_telemetry::warn;
23use datafusion::physical_plan::ExecutionPlan;
24use datatypes::schema::SchemaRef;
25use futures::Stream;
26use futures_util::ready;
27use lazy_static::lazy_static;
28use prometheus::*;
29use session::context::QueryContextRef;
30
31use crate::dist_plan::MergeScanExec;
32use crate::error::Result;
33use crate::options::FlowQueryExtensions;
34
/// Per-region state accumulated while merging region watermark entries.
///
/// `finalize_region_watermarks` turns each state into the final optional
/// watermark: only [`MergeState::Proved`] produces `Some(_)`.
///
/// The derives exist so that intermediate merge state can be printed and
/// compared in tests and diagnostics; they do not change any merge behavior.
#[derive(Debug, Clone, PartialEq, Eq)]
enum MergeState {
    /// At least one contribution for the region carried no watermark.
    /// Later proved values do not change this state.
    Unproved,
    /// Every contribution seen so far carried this same watermark.
    Proved(u64),
    /// Contributions carried different watermarks for the same region.
    Conflict {
        /// The distinct watermark values seen, in arrival order. Finalization
        /// only logs them; the region is reported without a watermark.
        watermarks: Vec<u64>,
    },
}
50
// Process-wide Prometheus metrics for the query engine.
//
// Each `register_*!` result is unwrapped, so a failed registration panics
// when the corresponding static is initialized.
lazy_static! {
    /// Histogram of time elapsed in each query stage, labelled by `stage`.
    /// NOTE(review): bucket bounds look like seconds — confirm against callers.
    pub static ref QUERY_STAGE_ELAPSED: HistogramVec = register_histogram_vec!(
        "greptime_query_stage_elapsed",
        "query engine time elapsed during each stage",
        &["stage"],
        vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
    )
    .unwrap();
    /// Child of [`QUERY_STAGE_ELAPSED`] with `stage = "parse_sql"`.
    pub static ref PARSE_SQL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
        .with_label_values(&["parse_sql"]);
    /// Child of [`QUERY_STAGE_ELAPSED`] with `stage = "parse_promql"`.
    pub static ref PARSE_PROMQL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
        .with_label_values(&["parse_promql"]);
    /// Child of [`QUERY_STAGE_ELAPSED`] with `stage = "optimize_logicalplan"`.
    pub static ref OPTIMIZE_LOGICAL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
        .with_label_values(&["optimize_logicalplan"]);
    /// Child of [`QUERY_STAGE_ELAPSED`] with `stage = "optimize_physicalplan"`.
    pub static ref OPTIMIZE_PHYSICAL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
        .with_label_values(&["optimize_physicalplan"]);
    /// Child of [`QUERY_STAGE_ELAPSED`] with `stage = "create_physicalplan"`.
    pub static ref CREATE_PHYSICAL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
        .with_label_values(&["create_physicalplan"]);
    /// Child of [`QUERY_STAGE_ELAPSED`] with `stage = "execute_plan"`.
    pub static ref EXEC_PLAN_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
        .with_label_values(&["execute_plan"]);
    /// Child of [`QUERY_STAGE_ELAPSED`] with `stage = "merge_scan_poll"`.
    pub static ref MERGE_SCAN_POLL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
        .with_label_values(&["merge_scan_poll"]);

    /// Histogram registered as "query merge scan regions" (default buckets).
    pub static ref MERGE_SCAN_REGIONS: Histogram = register_histogram!(
        "greptime_query_merge_scan_regions",
        "query merge scan regions"
    )
    .unwrap();
    /// Counter registered as "query merge scan errors total".
    pub static ref MERGE_SCAN_ERRORS_TOTAL: IntCounter = register_int_counter!(
        "greptime_query_merge_scan_errors_total",
        "query merge scan errors total"
    )
    .unwrap();
    /// Counter registered as "query push down fallback errors total".
    pub static ref PUSH_DOWN_FALLBACK_ERRORS_TOTAL: IntCounter = register_int_counter!(
        "greptime_push_down_fallback_errors_total",
        "query push down fallback errors total"
    )
    .unwrap();

    /// Gauge for the current query memory pool usage, in bytes.
    pub static ref QUERY_MEMORY_POOL_USAGE_BYTES: IntGauge = register_int_gauge!(
        "greptime_query_memory_pool_usage_bytes",
        "current query memory pool usage in bytes"
    )
    .unwrap();

    /// Counter of query memory allocations that were rejected.
    pub static ref QUERY_MEMORY_POOL_REJECTED_TOTAL: IntCounter = register_int_counter!(
        "greptime_query_memory_pool_rejected_total",
        "total number of query memory allocations rejected"
    )
    .unwrap();

    /// Counter of remote dynamic filter fanout RPC results, labelled by `status`.
    pub static ref REMOTE_DYN_FILTER_UPDATE_RPC_TOTAL: IntCounterVec = register_int_counter_vec!(
        "greptime_query_remote_dyn_filter_update_rpc_total",
        "remote dynamic filter fanout RPC results",
        &["status"]
    )
    .unwrap();

    /// Histogram of remote dynamic filter fanout payload sizes in bytes,
    /// with power-of-two buckets from 128 up to 524288.
    pub static ref REMOTE_DYN_FILTER_PAYLOAD_BYTES: Histogram = register_histogram!(
        "greptime_query_remote_dyn_filter_payload_bytes",
        "remote dynamic filter fanout payload bytes",
        vec![
            128.0, 256.0, 512.0, 1024.0, 2048.0, 4096.0, 8192.0, 16384.0, 32768.0,
            65536.0, 131072.0, 262144.0, 524288.0,
        ]
    )
    .unwrap();

    /// Counter of remote dynamic filter encode results, labelled by `result`.
    pub static ref REMOTE_DYN_FILTER_ENCODE_TOTAL: IntCounterVec = register_int_counter_vec!(
        "greptime_query_remote_dyn_filter_encode_total",
        "remote dynamic filter encode results",
        &["result"]
    )
    .unwrap();
}
130
/// A record batch stream wrapper that pairs an inner stream with a callback.
///
/// The `Stream` implementation in this file invokes the callback when the
/// inner stream reports that it is exhausted.
pub struct OnDone<F> {
    /// The wrapped stream that actually produces the record batches.
    stream: SendableRecordBatchStream,
    /// The pending callback; `None` once it has been taken and invoked.
    callback: Option<F>,
}
136
137impl<F> OnDone<F> {
138 pub fn new(stream: SendableRecordBatchStream, callback: F) -> Self {
140 Self {
141 stream,
142 callback: Some(callback),
143 }
144 }
145}
146
147impl<F: FnOnce() + Unpin> RecordBatchStream for OnDone<F> {
148 fn name(&self) -> &str {
149 self.stream.name()
150 }
151
152 fn schema(&self) -> SchemaRef {
153 self.stream.schema()
154 }
155
156 fn output_ordering(&self) -> Option<&[OrderOption]> {
157 self.stream.output_ordering()
158 }
159
160 fn metrics(&self) -> Option<RecordBatchMetrics> {
161 self.stream.metrics()
162 }
163}
164
165impl<F: FnOnce() + Unpin> Stream for OnDone<F> {
166 type Item = common_recordbatch::error::Result<RecordBatch>;
167
168 fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
169 match ready!(Pin::new(&mut self.stream).poll_next(cx)) {
170 Some(rb) => Poll::Ready(Some(rb)),
171 None => {
172 if let Some(callback) = self.callback.take() {
173 callback();
174 }
175 Poll::Ready(None)
176 }
177 }
178 }
179
180 fn size_hint(&self) -> (usize, Option<usize>) {
181 self.stream.size_hint()
182 }
183}
184
/// A record batch stream wrapper that keeps the physical plan alongside the
/// stream it produced.
///
/// Its `metrics()` implementation in this file uses the plan to add region
/// watermark entries to the metrics reported by the wrapped stream.
pub struct RegionWatermarkMetricsStream {
    /// The wrapped stream that actually produces the record batches.
    stream: SendableRecordBatchStream,
    /// The plan that is walked to collect region watermarks.
    plan: Arc<dyn ExecutionPlan>,
}
189
190impl RegionWatermarkMetricsStream {
191 pub fn new(stream: SendableRecordBatchStream, plan: Arc<dyn ExecutionPlan>) -> Self {
192 Self { stream, plan }
193 }
194}
195
196impl RecordBatchStream for RegionWatermarkMetricsStream {
197 fn name(&self) -> &str {
198 self.stream.name()
199 }
200
201 fn schema(&self) -> SchemaRef {
202 self.stream.schema()
203 }
204
205 fn output_ordering(&self) -> Option<&[OrderOption]> {
206 self.stream.output_ordering()
207 }
208
209 fn metrics(&self) -> Option<RecordBatchMetrics> {
210 let mut metrics = self.stream.metrics()?;
211 let region_watermarks = collect_region_watermarks(self.plan.clone());
212 if !region_watermarks.is_empty() {
213 metrics.region_watermarks = region_watermarks;
214 }
215 Some(metrics)
216 }
217}
218
219impl Stream for RegionWatermarkMetricsStream {
220 type Item = common_recordbatch::error::Result<RecordBatch>;
221
222 fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
223 Pin::new(&mut self.stream).poll_next(cx)
224 }
225
226 fn size_hint(&self) -> (usize, Option<usize>) {
227 self.stream.size_hint()
228 }
229}
230
231pub fn should_collect_region_watermark_from_query_ctx(query_ctx: &QueryContextRef) -> Result<bool> {
233 Ok(
234 FlowQueryExtensions::parse_flow_extensions(&query_ctx.extensions())?
235 .is_some_and(|extensions| extensions.should_collect_region_watermark()),
236 )
237}
238
239pub fn maybe_attach_region_watermark_metrics(
241 stream: SendableRecordBatchStream,
242 plan: Arc<dyn ExecutionPlan>,
243 should_collect_region_watermark: bool,
244) -> SendableRecordBatchStream {
245 if should_collect_region_watermark {
246 Box::pin(RegionWatermarkMetricsStream::new(stream, plan))
247 } else {
248 stream
249 }
250}
251
252pub fn terminal_recordbatch_metrics_from_plan(
253 plan: Arc<dyn ExecutionPlan>,
254) -> Option<RecordBatchMetrics> {
255 let region_watermarks = collect_region_watermarks(plan);
256 if region_watermarks.is_empty() {
257 None
258 } else {
259 Some(RecordBatchMetrics {
260 region_watermarks,
261 ..Default::default()
262 })
263 }
264}
265
266pub fn terminal_recordbatch_metrics_from_plan_if_requested(
268 plan: Option<Arc<dyn ExecutionPlan>>,
269 should_collect_region_watermark: bool,
270) -> Option<RecordBatchMetrics> {
271 if should_collect_region_watermark {
272 plan.and_then(terminal_recordbatch_metrics_from_plan)
273 } else {
274 None
275 }
276}
277
278fn collect_region_watermarks(plan: Arc<dyn ExecutionPlan>) -> Vec<RegionWatermarkEntry> {
279 let mut merged = BTreeMap::<u64, MergeState>::new();
280 let mut stack = vec![plan];
281
282 while let Some(plan) = stack.pop() {
283 if let Some(merge_scan) = plan.as_any().downcast_ref::<MergeScanExec>()
284 && !merge_scan.is_flow_sink_scan()
285 {
286 merge_merge_scan_region_watermarks(
287 &mut merged,
288 merge_scan
289 .regions()
290 .iter()
291 .map(|region_id| region_id.as_u64()),
292 merge_scan.sub_stage_metrics(),
293 );
294 }
295 stack.extend(plan.children().into_iter().cloned());
296 }
297
298 finalize_region_watermarks(merged)
299}
300
301fn merge_region_watermark_entries(
319 merged: &mut BTreeMap<u64, MergeState>,
320 entries: impl IntoIterator<Item = RegionWatermarkEntry>,
321) {
322 for entry in entries {
323 merged
324 .entry(entry.region_id)
325 .and_modify(|existing| match entry.watermark {
326 None => match existing {
327 MergeState::Proved(_) => {
328 *existing = MergeState::Unproved;
329 }
330 MergeState::Unproved | MergeState::Conflict { .. } => {}
331 },
332 Some(seq) => match existing {
333 MergeState::Unproved => {}
334 MergeState::Proved(existing_seq) if *existing_seq == seq => {}
335 MergeState::Proved(existing_seq) => {
336 let old_seq = *existing_seq;
337 *existing = MergeState::Conflict {
338 watermarks: vec![old_seq, seq],
339 };
340 }
341 MergeState::Conflict { watermarks } => {
342 if !watermarks.contains(&seq) {
343 watermarks.push(seq);
344 }
345 }
346 },
347 })
348 .or_insert(match entry.watermark {
349 Some(seq) => MergeState::Proved(seq),
350 None => MergeState::Unproved,
351 });
352 }
353}
354
355fn merge_merge_scan_region_watermarks(
356 merged: &mut BTreeMap<u64, MergeState>,
357 regions: impl IntoIterator<Item = u64>,
358 sub_stage_metrics: impl IntoIterator<Item = RecordBatchMetrics>,
359) {
360 let regions = regions.into_iter().collect::<Vec<_>>();
361 let mut proved_or_unproved_regions = BTreeSet::new();
362 for metrics in sub_stage_metrics {
363 proved_or_unproved_regions.extend(
364 metrics
365 .region_watermarks
366 .iter()
367 .map(|entry| entry.region_id),
368 );
369 merge_region_watermark_entries(merged, metrics.region_watermarks);
370 }
371
372 merge_region_watermark_entries(
377 merged,
378 regions
379 .into_iter()
380 .filter(|region_id| !proved_or_unproved_regions.contains(region_id))
381 .map(|region_id| RegionWatermarkEntry {
382 region_id,
383 watermark: None,
384 }),
385 );
386}
387
388fn finalize_region_watermarks(merged: BTreeMap<u64, MergeState>) -> Vec<RegionWatermarkEntry> {
389 merged
390 .into_iter()
391 .map(|(region_id, state)| RegionWatermarkEntry {
392 region_id,
393 watermark: match state {
394 MergeState::Unproved => None,
395 MergeState::Proved(seq) => Some(seq),
396 MergeState::Conflict { watermarks } => {
397 warn!(
398 "Conflicting proved watermarks for region {}: {:?}; degrading to unproved",
399 region_id, watermarks
400 );
401 None
402 }
403 },
404 })
405 .collect()
406}
407
// Unit tests for terminal metrics collection and region watermark merging.
#[cfg(test)]
mod tests {
    use std::collections::{BTreeMap, BTreeSet};
    use std::sync::Arc;

    use api::v1::region::{RemoteDynFilterUnregister, RemoteDynFilterUpdate};
    use async_trait::async_trait;
    use datafusion::arrow::datatypes::Schema as ArrowSchema;
    use datafusion::execution::SessionStateBuilder;
    use datafusion::physical_plan::empty::EmptyExec;
    use datafusion_expr::LogicalPlanBuilder;
    use session::ReadPreference;
    use session::context::QueryContextBuilder;
    use store_api::storage::RegionId;
    use table::table_name::TableName;

    use super::*;
    use crate::dist_plan::RemoteDynFilterProducerId;
    use crate::options::{FLOW_RETURN_REGION_SEQ, FLOW_SINK_TABLE_ID};
    use crate::region_query::RegionQueryHandler;

    /// Handler stub whose methods must never be called: these tests only
    /// build plans and inspect metrics, they never execute remote queries.
    struct NoopRegionQueryHandler;

    #[async_trait]
    impl RegionQueryHandler for NoopRegionQueryHandler {
        async fn do_get(
            &self,
            _read_preference: ReadPreference,
            _request: common_query::request::QueryRequest,
        ) -> Result<SendableRecordBatchStream> {
            unreachable!("metrics tests should not execute remote queries")
        }

        async fn handle_remote_dyn_filter_update(
            &self,
            _region_id: RegionId,
            _query_id: String,
            _update: RemoteDynFilterUpdate,
        ) -> Result<()> {
            unreachable!("metrics tests should not send remote dyn filter updates")
        }

        async fn handle_remote_dyn_filter_unregister(
            &self,
            _region_id: RegionId,
            _query_id: String,
            _unregister: RemoteDynFilterUnregister,
        ) -> Result<()> {
            unreachable!("metrics tests should not send remote dyn filter unregisters")
        }
    }

    /// Builds metrics whose only non-default field is `region_watermarks`,
    /// with one entry per `(region_id, watermark)` pair.
    fn metrics_with_region_watermarks(entries: &[(u64, Option<u64>)]) -> RecordBatchMetrics {
        RecordBatchMetrics {
            region_watermarks: entries
                .iter()
                .map(|(region_id, watermark)| RegionWatermarkEntry {
                    region_id: *region_id,
                    watermark: *watermark,
                })
                .collect(),
            ..Default::default()
        }
    }

    /// Builds a `MergeScanExec` over the single region `RegionId::new(table_id, 0)`
    /// with an empty logical plan, an empty schema and the no-op handler.
    fn test_merge_scan_exec(table_id: u32, query_ctx: QueryContextRef) -> Arc<dyn ExecutionPlan> {
        let session_state = SessionStateBuilder::new().with_default_features().build();
        let plan = LogicalPlanBuilder::empty(false).build().unwrap();
        let schema = ArrowSchema::empty();

        Arc::new(
            MergeScanExec::new(
                &session_state,
                TableName::new("greptime", "public", "test"),
                vec![RegionId::new(table_id, 0)],
                plan,
                &schema,
                Arc::new(NoopRegionQueryHandler),
                query_ctx,
                1,
                BTreeMap::<String, BTreeSet<datafusion_common::Column>>::new(),
                Some(RemoteDynFilterProducerId::new(0)),
                false,
            )
            .unwrap(),
        )
    }

    /// Builds a query context with `FLOW_RETURN_REGION_SEQ` set to "true" and
    /// `FLOW_SINK_TABLE_ID` set to `sink_table_id`.
    fn flow_query_ctx_with_sink_table_id(sink_table_id: u32) -> QueryContextRef {
        Arc::new(
            QueryContextBuilder::default()
                .set_extension(FLOW_RETURN_REGION_SEQ.to_string(), "true".to_string())
                .set_extension(FLOW_SINK_TABLE_ID.to_string(), sink_table_id.to_string())
                .build(),
        )
    }

    // A plan without any merge scan yields no terminal metrics.
    #[test]
    fn terminal_metrics_returns_none_without_merge_scan() {
        let plan: Arc<dyn ExecutionPlan> = Arc::new(EmptyExec::new(Arc::new(ArrowSchema::empty())));
        assert!(terminal_recordbatch_metrics_from_plan(plan).is_none());
    }

    // The merge scan reads table 42, which is also the sink table id in the
    // query context, so its regions are expected to be skipped.
    #[test]
    fn terminal_metrics_skip_flow_sink_merge_scan_regions() {
        let query_ctx = flow_query_ctx_with_sink_table_id(42);
        let plan = test_merge_scan_exec(42, query_ctx);

        assert!(terminal_recordbatch_metrics_from_plan(plan).is_none());
    }

    // The merge scan reads table 43 while the sink table id is 42, so its
    // region is reported, without a watermark since nothing was executed.
    #[test]
    fn terminal_metrics_keep_source_merge_scan_regions_with_sink_extension() {
        let query_ctx = flow_query_ctx_with_sink_table_id(42);
        let plan = test_merge_scan_exec(43, query_ctx);

        assert_eq!(
            terminal_recordbatch_metrics_from_plan(plan)
                .unwrap()
                .region_watermarks,
            vec![RegionWatermarkEntry {
                region_id: RegionId::new(43, 0).as_u64(),
                watermark: None,
            }]
        );
    }

    // Regions with no sub-stage metrics at all end up without a watermark.
    #[test]
    fn merge_merge_scan_region_watermarks_marks_missing_watermarks_unproved() {
        let mut merged = BTreeMap::new();

        merge_merge_scan_region_watermarks(&mut merged, [1, 2], std::iter::empty());

        assert_eq!(
            finalize_region_watermarks(merged),
            vec![
                RegionWatermarkEntry {
                    region_id: 1,
                    watermark: None,
                },
                RegionWatermarkEntry {
                    region_id: 2,
                    watermark: None,
                },
            ]
        );
    }

    // Two reports agreeing on the same watermark keep that watermark.
    #[test]
    fn merge_merge_scan_region_watermarks_keeps_matching_proved_values() {
        let mut merged = BTreeMap::new();

        merge_merge_scan_region_watermarks(
            &mut merged,
            [42],
            [
                metrics_with_region_watermarks(&[(42, Some(7))]),
                metrics_with_region_watermarks(&[(42, Some(7))]),
            ],
        );

        assert_eq!(
            finalize_region_watermarks(merged),
            vec![RegionWatermarkEntry {
                region_id: 42,
                watermark: Some(7),
            }]
        );
    }

    // Two reports with different watermarks degrade the region to no watermark.
    #[test]
    fn merge_merge_scan_region_watermarks_degrades_conflicting_proved_values() {
        let mut merged = BTreeMap::new();

        merge_merge_scan_region_watermarks(
            &mut merged,
            [7],
            [
                metrics_with_region_watermarks(&[(7, Some(11))]),
                metrics_with_region_watermarks(&[(7, Some(13))]),
            ],
        );

        assert_eq!(
            finalize_region_watermarks(merged),
            vec![RegionWatermarkEntry {
                region_id: 7,
                watermark: None,
            }]
        );
    }

    // A later report without a watermark overrides an earlier proved value.
    #[test]
    fn merge_merge_scan_region_watermarks_none_vetoes_proved_value() {
        let mut merged = BTreeMap::new();

        merge_merge_scan_region_watermarks(
            &mut merged,
            [9],
            [
                metrics_with_region_watermarks(&[(9, Some(21))]),
                metrics_with_region_watermarks(&[(9, None)]),
            ],
        );

        assert_eq!(
            finalize_region_watermarks(merged),
            vec![RegionWatermarkEntry {
                region_id: 9,
                watermark: None,
            }]
        );
    }

    // Same as above with the reports swapped: the result must not depend on order.
    #[test]
    fn merge_merge_scan_region_watermarks_none_vetoes_proved_value_regardless_of_order() {
        let mut merged = BTreeMap::new();

        merge_merge_scan_region_watermarks(
            &mut merged,
            [9],
            [
                metrics_with_region_watermarks(&[(9, None)]),
                metrics_with_region_watermarks(&[(9, Some(21))]),
            ],
        );

        assert_eq!(
            finalize_region_watermarks(merged),
            vec![RegionWatermarkEntry {
                region_id: 9,
                watermark: None,
            }]
        );
    }

    // A second merge scan over the same region that reports nothing overrides
    // the watermark proved by the first one.
    #[test]
    fn merge_merge_scan_region_watermarks_missing_branch_vetoes_proved_value() {
        let mut merged = BTreeMap::new();

        merge_merge_scan_region_watermarks(
            &mut merged,
            [9],
            [metrics_with_region_watermarks(&[(9, Some(21))])],
        );
        merge_merge_scan_region_watermarks(&mut merged, [9], std::iter::empty());

        assert_eq!(
            finalize_region_watermarks(merged),
            vec![RegionWatermarkEntry {
                region_id: 9,
                watermark: None,
            }]
        );
    }

    // Same as above with the two merge scans swapped.
    #[test]
    fn merge_merge_scan_region_watermarks_missing_branch_vetoes_proved_value_regardless_of_order() {
        let mut merged = BTreeMap::new();

        merge_merge_scan_region_watermarks(&mut merged, [9], std::iter::empty());
        merge_merge_scan_region_watermarks(
            &mut merged,
            [9],
            [metrics_with_region_watermarks(&[(9, Some(21))])],
        );

        assert_eq!(
            finalize_region_watermarks(merged),
            vec![RegionWatermarkEntry {
                region_id: 9,
                watermark: None,
            }]
        );
    }
}