1use std::collections::{BTreeMap, BTreeSet};
16use std::pin::Pin;
17use std::sync::Arc;
18use std::task::{Context, Poll};
19
20use common_recordbatch::adapter::{RecordBatchMetrics, RegionWatermarkEntry};
21use common_recordbatch::{OrderOption, RecordBatch, RecordBatchStream, SendableRecordBatchStream};
22use common_telemetry::warn;
23use datafusion::physical_plan::ExecutionPlan;
24use datatypes::schema::SchemaRef;
25use futures::Stream;
26use futures_util::ready;
27use lazy_static::lazy_static;
28use prometheus::*;
29use session::context::QueryContextRef;
30
31use crate::dist_plan::MergeScanExec;
32use crate::error::Result;
33use crate::options::FlowQueryExtensions;
34
/// Per-region accumulator used while folding watermark reports from several sources.
///
/// `merge_region_watermark_entries` keeps one `MergeState` per region id and
/// `finalize_region_watermarks` turns each state back into an optional watermark.
///
/// The derives exist so the state can be debug-printed and compared in assertions; they do
/// not change how the merge functions use the type.
#[derive(Debug, Clone, PartialEq, Eq)]
enum MergeState {
    /// At least one report for the region carried no watermark.
    ///
    /// The merge step never leaves this state once entered, so a single missing watermark
    /// vetoes any proved value. Finalizes to `None`.
    Unproved,
    /// Every report seen so far carried this same watermark. Finalizes to `Some(value)`.
    Proved(u64),
    /// Reports for the region carried different watermarks.
    ///
    /// Finalizes to `None` after logging a warning.
    Conflict {
        /// The distinct watermark values reported so far, in the order they were first seen.
        watermarks: Vec<u64>,
    },
}
50
51lazy_static! {
52 pub static ref QUERY_STAGE_ELAPSED: HistogramVec = register_histogram_vec!(
54 "greptime_query_stage_elapsed",
55 "query engine time elapsed during each stage",
56 &["stage"],
57 vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
58 )
59 .unwrap();
60 pub static ref PARSE_SQL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
61 .with_label_values(&["parse_sql"]);
62 pub static ref PARSE_PROMQL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
63 .with_label_values(&["parse_promql"]);
64 pub static ref OPTIMIZE_LOGICAL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
65 .with_label_values(&["optimize_logicalplan"]);
66 pub static ref OPTIMIZE_PHYSICAL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
67 .with_label_values(&["optimize_physicalplan"]);
68 pub static ref CREATE_PHYSICAL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
69 .with_label_values(&["create_physicalplan"]);
70 pub static ref EXEC_PLAN_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
71 .with_label_values(&["execute_plan"]);
72 pub static ref MERGE_SCAN_POLL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
73 .with_label_values(&["merge_scan_poll"]);
74
75 pub static ref MERGE_SCAN_REGIONS: Histogram = register_histogram!(
76 "greptime_query_merge_scan_regions",
77 "query merge scan regions"
78 )
79 .unwrap();
80 pub static ref MERGE_SCAN_ERRORS_TOTAL: IntCounter = register_int_counter!(
81 "greptime_query_merge_scan_errors_total",
82 "query merge scan errors total"
83 )
84 .unwrap();
85 pub static ref PUSH_DOWN_FALLBACK_ERRORS_TOTAL: IntCounter = register_int_counter!(
86 "greptime_push_down_fallback_errors_total",
87 "query push down fallback errors total"
88 )
89 .unwrap();
90
91 pub static ref QUERY_MEMORY_POOL_USAGE_BYTES: IntGauge = register_int_gauge!(
92 "greptime_query_memory_pool_usage_bytes",
93 "current query memory pool usage in bytes"
94 )
95 .unwrap();
96
97 pub static ref QUERY_MEMORY_POOL_REJECTED_TOTAL: IntCounter = register_int_counter!(
98 "greptime_query_memory_pool_rejected_total",
99 "total number of query memory allocations rejected"
100 )
101 .unwrap();
102}
103
104pub struct OnDone<F> {
106 stream: SendableRecordBatchStream,
107 callback: Option<F>,
108}
109
110impl<F> OnDone<F> {
111 pub fn new(stream: SendableRecordBatchStream, callback: F) -> Self {
113 Self {
114 stream,
115 callback: Some(callback),
116 }
117 }
118}
119
120impl<F: FnOnce() + Unpin> RecordBatchStream for OnDone<F> {
121 fn name(&self) -> &str {
122 self.stream.name()
123 }
124
125 fn schema(&self) -> SchemaRef {
126 self.stream.schema()
127 }
128
129 fn output_ordering(&self) -> Option<&[OrderOption]> {
130 self.stream.output_ordering()
131 }
132
133 fn metrics(&self) -> Option<RecordBatchMetrics> {
134 self.stream.metrics()
135 }
136}
137
138impl<F: FnOnce() + Unpin> Stream for OnDone<F> {
139 type Item = common_recordbatch::error::Result<RecordBatch>;
140
141 fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
142 match ready!(Pin::new(&mut self.stream).poll_next(cx)) {
143 Some(rb) => Poll::Ready(Some(rb)),
144 None => {
145 if let Some(callback) = self.callback.take() {
146 callback();
147 }
148 Poll::Ready(None)
149 }
150 }
151 }
152
153 fn size_hint(&self) -> (usize, Option<usize>) {
154 self.stream.size_hint()
155 }
156}
157
158pub struct RegionWatermarkMetricsStream {
159 stream: SendableRecordBatchStream,
160 plan: Arc<dyn ExecutionPlan>,
161}
162
163impl RegionWatermarkMetricsStream {
164 pub fn new(stream: SendableRecordBatchStream, plan: Arc<dyn ExecutionPlan>) -> Self {
165 Self { stream, plan }
166 }
167}
168
169impl RecordBatchStream for RegionWatermarkMetricsStream {
170 fn name(&self) -> &str {
171 self.stream.name()
172 }
173
174 fn schema(&self) -> SchemaRef {
175 self.stream.schema()
176 }
177
178 fn output_ordering(&self) -> Option<&[OrderOption]> {
179 self.stream.output_ordering()
180 }
181
182 fn metrics(&self) -> Option<RecordBatchMetrics> {
183 let mut metrics = self.stream.metrics()?;
184 let region_watermarks = collect_region_watermarks(self.plan.clone());
185 if !region_watermarks.is_empty() {
186 metrics.region_watermarks = region_watermarks;
187 }
188 Some(metrics)
189 }
190}
191
192impl Stream for RegionWatermarkMetricsStream {
193 type Item = common_recordbatch::error::Result<RecordBatch>;
194
195 fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
196 Pin::new(&mut self.stream).poll_next(cx)
197 }
198
199 fn size_hint(&self) -> (usize, Option<usize>) {
200 self.stream.size_hint()
201 }
202}
203
204pub fn should_collect_region_watermark_from_query_ctx(query_ctx: &QueryContextRef) -> Result<bool> {
206 Ok(
207 FlowQueryExtensions::parse_flow_extensions(&query_ctx.extensions())?
208 .is_some_and(|extensions| extensions.should_collect_region_watermark()),
209 )
210}
211
212pub fn maybe_attach_region_watermark_metrics(
214 stream: SendableRecordBatchStream,
215 plan: Arc<dyn ExecutionPlan>,
216 should_collect_region_watermark: bool,
217) -> SendableRecordBatchStream {
218 if should_collect_region_watermark {
219 Box::pin(RegionWatermarkMetricsStream::new(stream, plan))
220 } else {
221 stream
222 }
223}
224
225pub fn terminal_recordbatch_metrics_from_plan(
226 plan: Arc<dyn ExecutionPlan>,
227) -> Option<RecordBatchMetrics> {
228 let region_watermarks = collect_region_watermarks(plan);
229 if region_watermarks.is_empty() {
230 None
231 } else {
232 Some(RecordBatchMetrics {
233 region_watermarks,
234 ..Default::default()
235 })
236 }
237}
238
239pub fn terminal_recordbatch_metrics_from_plan_if_requested(
241 plan: Option<Arc<dyn ExecutionPlan>>,
242 should_collect_region_watermark: bool,
243) -> Option<RecordBatchMetrics> {
244 if should_collect_region_watermark {
245 plan.and_then(terminal_recordbatch_metrics_from_plan)
246 } else {
247 None
248 }
249}
250
251fn collect_region_watermarks(plan: Arc<dyn ExecutionPlan>) -> Vec<RegionWatermarkEntry> {
252 let mut merged = BTreeMap::<u64, MergeState>::new();
253 let mut stack = vec![plan];
254
255 while let Some(plan) = stack.pop() {
256 if let Some(merge_scan) = plan.as_any().downcast_ref::<MergeScanExec>()
257 && !merge_scan.is_flow_sink_scan()
258 {
259 merge_merge_scan_region_watermarks(
260 &mut merged,
261 merge_scan
262 .regions()
263 .iter()
264 .map(|region_id| region_id.as_u64()),
265 merge_scan.sub_stage_metrics(),
266 );
267 }
268 stack.extend(plan.children().into_iter().cloned());
269 }
270
271 finalize_region_watermarks(merged)
272}
273
274fn merge_region_watermark_entries(
292 merged: &mut BTreeMap<u64, MergeState>,
293 entries: impl IntoIterator<Item = RegionWatermarkEntry>,
294) {
295 for entry in entries {
296 merged
297 .entry(entry.region_id)
298 .and_modify(|existing| match entry.watermark {
299 None => match existing {
300 MergeState::Proved(_) => {
301 *existing = MergeState::Unproved;
302 }
303 MergeState::Unproved | MergeState::Conflict { .. } => {}
304 },
305 Some(seq) => match existing {
306 MergeState::Unproved => {}
307 MergeState::Proved(existing_seq) if *existing_seq == seq => {}
308 MergeState::Proved(existing_seq) => {
309 let old_seq = *existing_seq;
310 *existing = MergeState::Conflict {
311 watermarks: vec![old_seq, seq],
312 };
313 }
314 MergeState::Conflict { watermarks } => {
315 if !watermarks.contains(&seq) {
316 watermarks.push(seq);
317 }
318 }
319 },
320 })
321 .or_insert(match entry.watermark {
322 Some(seq) => MergeState::Proved(seq),
323 None => MergeState::Unproved,
324 });
325 }
326}
327
328fn merge_merge_scan_region_watermarks(
329 merged: &mut BTreeMap<u64, MergeState>,
330 regions: impl IntoIterator<Item = u64>,
331 sub_stage_metrics: impl IntoIterator<Item = RecordBatchMetrics>,
332) {
333 let regions = regions.into_iter().collect::<Vec<_>>();
334 let mut proved_or_unproved_regions = BTreeSet::new();
335 for metrics in sub_stage_metrics {
336 proved_or_unproved_regions.extend(
337 metrics
338 .region_watermarks
339 .iter()
340 .map(|entry| entry.region_id),
341 );
342 merge_region_watermark_entries(merged, metrics.region_watermarks);
343 }
344
345 merge_region_watermark_entries(
350 merged,
351 regions
352 .into_iter()
353 .filter(|region_id| !proved_or_unproved_regions.contains(region_id))
354 .map(|region_id| RegionWatermarkEntry {
355 region_id,
356 watermark: None,
357 }),
358 );
359}
360
361fn finalize_region_watermarks(merged: BTreeMap<u64, MergeState>) -> Vec<RegionWatermarkEntry> {
362 merged
363 .into_iter()
364 .map(|(region_id, state)| RegionWatermarkEntry {
365 region_id,
366 watermark: match state {
367 MergeState::Unproved => None,
368 MergeState::Proved(seq) => Some(seq),
369 MergeState::Conflict { watermarks } => {
370 warn!(
371 "Conflicting proved watermarks for region {}: {:?}; degrading to unproved",
372 region_id, watermarks
373 );
374 None
375 }
376 },
377 })
378 .collect()
379}
380
#[cfg(test)]
mod tests {
    use std::collections::{BTreeMap, BTreeSet};
    use std::sync::Arc;

    use async_trait::async_trait;
    use datafusion::arrow::datatypes::Schema as ArrowSchema;
    use datafusion::execution::SessionStateBuilder;
    use datafusion::physical_plan::empty::EmptyExec;
    use datafusion_expr::LogicalPlanBuilder;
    use session::ReadPreference;
    use session::context::QueryContextBuilder;
    use store_api::storage::RegionId;
    use table::table_name::TableName;

    use super::*;
    use crate::options::{FLOW_RETURN_REGION_SEQ, FLOW_SINK_TABLE_ID};
    use crate::region_query::RegionQueryHandler;

    /// Handler stub: these tests only inspect plans and must never run a remote query.
    struct NoopRegionQueryHandler;

    #[async_trait]
    impl RegionQueryHandler for NoopRegionQueryHandler {
        async fn do_get(
            &self,
            _read_preference: ReadPreference,
            _request: common_query::request::QueryRequest,
        ) -> Result<SendableRecordBatchStream> {
            unreachable!("metrics tests should not execute remote queries")
        }
    }

    /// Shorthand for a single watermark entry.
    fn entry(region_id: u64, watermark: Option<u64>) -> RegionWatermarkEntry {
        RegionWatermarkEntry {
            region_id,
            watermark,
        }
    }

    /// Builds default metrics whose only populated field is `region_watermarks`.
    fn metrics_with_region_watermarks(entries: &[(u64, Option<u64>)]) -> RecordBatchMetrics {
        let region_watermarks = entries
            .iter()
            .map(|&(region_id, watermark)| entry(region_id, watermark))
            .collect();

        RecordBatchMetrics {
            region_watermarks,
            ..Default::default()
        }
    }

    /// Builds a `MergeScanExec` over region 0 of `table_id` with an empty plan and schema.
    fn test_merge_scan_exec(table_id: u32, query_ctx: QueryContextRef) -> Arc<dyn ExecutionPlan> {
        let session_state = SessionStateBuilder::new().with_default_features().build();
        let logical_plan = LogicalPlanBuilder::empty(false).build().unwrap();
        let arrow_schema = ArrowSchema::empty();

        let merge_scan = MergeScanExec::new(
            &session_state,
            TableName::new("greptime", "public", "test"),
            vec![RegionId::new(table_id, 0)],
            logical_plan,
            &arrow_schema,
            Arc::new(NoopRegionQueryHandler),
            query_ctx,
            1,
            BTreeMap::<String, BTreeSet<datafusion_common::Column>>::new(),
        )
        .unwrap();

        Arc::new(merge_scan)
    }

    /// Query context carrying the flow extensions, with the given sink table id.
    fn flow_query_ctx_with_sink_table_id(sink_table_id: u32) -> QueryContextRef {
        let query_ctx = QueryContextBuilder::default()
            .set_extension(FLOW_RETURN_REGION_SEQ.to_string(), "true".to_string())
            .set_extension(FLOW_SINK_TABLE_ID.to_string(), sink_table_id.to_string())
            .build();

        Arc::new(query_ctx)
    }

    #[test]
    fn terminal_metrics_returns_none_without_merge_scan() {
        let plan: Arc<dyn ExecutionPlan> = Arc::new(EmptyExec::new(Arc::new(ArrowSchema::empty())));

        let metrics = terminal_recordbatch_metrics_from_plan(plan);

        assert!(metrics.is_none());
    }

    #[test]
    fn terminal_metrics_skip_flow_sink_merge_scan_regions() {
        // The scanned table is the sink table itself, so its regions must be ignored.
        let plan = test_merge_scan_exec(42, flow_query_ctx_with_sink_table_id(42));

        let metrics = terminal_recordbatch_metrics_from_plan(plan);

        assert!(metrics.is_none());
    }

    #[test]
    fn terminal_metrics_keep_source_merge_scan_regions_with_sink_extension() {
        // The scanned table differs from the sink table, so its region is reported.
        let plan = test_merge_scan_exec(43, flow_query_ctx_with_sink_table_id(42));

        let metrics = terminal_recordbatch_metrics_from_plan(plan).unwrap();

        assert_eq!(
            metrics.region_watermarks,
            vec![entry(RegionId::new(43, 0).as_u64(), None)]
        );
    }

    #[test]
    fn merge_merge_scan_region_watermarks_marks_missing_watermarks_unproved() {
        let mut merged = BTreeMap::new();

        merge_merge_scan_region_watermarks(&mut merged, [1, 2], std::iter::empty());

        assert_eq!(
            finalize_region_watermarks(merged),
            vec![entry(1, None), entry(2, None)]
        );
    }

    #[test]
    fn merge_merge_scan_region_watermarks_keeps_matching_proved_values() {
        let mut merged = BTreeMap::new();
        let sub_stages = [
            metrics_with_region_watermarks(&[(42, Some(7))]),
            metrics_with_region_watermarks(&[(42, Some(7))]),
        ];

        merge_merge_scan_region_watermarks(&mut merged, [42], sub_stages);

        assert_eq!(
            finalize_region_watermarks(merged),
            vec![entry(42, Some(7))]
        );
    }

    #[test]
    fn merge_merge_scan_region_watermarks_degrades_conflicting_proved_values() {
        let mut merged = BTreeMap::new();
        let sub_stages = [
            metrics_with_region_watermarks(&[(7, Some(11))]),
            metrics_with_region_watermarks(&[(7, Some(13))]),
        ];

        merge_merge_scan_region_watermarks(&mut merged, [7], sub_stages);

        assert_eq!(finalize_region_watermarks(merged), vec![entry(7, None)]);
    }

    #[test]
    fn merge_merge_scan_region_watermarks_none_vetoes_proved_value() {
        let mut merged = BTreeMap::new();
        let sub_stages = [
            metrics_with_region_watermarks(&[(9, Some(21))]),
            metrics_with_region_watermarks(&[(9, None)]),
        ];

        merge_merge_scan_region_watermarks(&mut merged, [9], sub_stages);

        assert_eq!(finalize_region_watermarks(merged), vec![entry(9, None)]);
    }

    #[test]
    fn merge_merge_scan_region_watermarks_none_vetoes_proved_value_regardless_of_order() {
        let mut merged = BTreeMap::new();
        let sub_stages = [
            metrics_with_region_watermarks(&[(9, None)]),
            metrics_with_region_watermarks(&[(9, Some(21))]),
        ];

        merge_merge_scan_region_watermarks(&mut merged, [9], sub_stages);

        assert_eq!(finalize_region_watermarks(merged), vec![entry(9, None)]);
    }

    #[test]
    fn merge_merge_scan_region_watermarks_missing_branch_vetoes_proved_value() {
        let mut merged = BTreeMap::new();

        // First scan proves the region; the second lists it but reports nothing for it.
        merge_merge_scan_region_watermarks(
            &mut merged,
            [9],
            [metrics_with_region_watermarks(&[(9, Some(21))])],
        );
        merge_merge_scan_region_watermarks(&mut merged, [9], std::iter::empty());

        assert_eq!(finalize_region_watermarks(merged), vec![entry(9, None)]);
    }

    #[test]
    fn merge_merge_scan_region_watermarks_missing_branch_vetoes_proved_value_regardless_of_order() {
        let mut merged = BTreeMap::new();

        // Same as above with the two scans swapped.
        merge_merge_scan_region_watermarks(&mut merged, [9], std::iter::empty());
        merge_merge_scan_region_watermarks(
            &mut merged,
            [9],
            [metrics_with_region_watermarks(&[(9, Some(21))])],
        );

        assert_eq!(finalize_region_watermarks(merged), vec![entry(9, None)]);
    }
}