1use std::collections::BTreeMap;
16use std::pin::Pin;
17use std::sync::Arc;
18use std::task::{Context, Poll};
19
20use common_recordbatch::adapter::{RecordBatchMetrics, RegionWatermarkEntry};
21use common_recordbatch::{OrderOption, RecordBatch, RecordBatchStream, SendableRecordBatchStream};
22use common_telemetry::warn;
23use datafusion::physical_plan::ExecutionPlan;
24use datatypes::schema::SchemaRef;
25use futures::Stream;
26use futures_util::ready;
27use lazy_static::lazy_static;
28use prometheus::*;
29use session::context::QueryContextRef;
30
31use crate::dist_plan::MergeScanExec;
32use crate::error::Result;
33use crate::options::FlowQueryExtensions;
34
/// Per-region state accumulated while merging watermark reports.
///
/// The derives let the state be logged, cloned, and compared in assertions.
#[derive(Debug, Clone, PartialEq, Eq)]
enum MergeState {
    /// The region was listed by a merge scan, but no watermark entry has been
    /// merged for it yet.
    Participated,
    /// At least one entry for the region carried no watermark (`None`), so no
    /// watermark can be reported for it.
    Unproved,
    /// Every entry merged so far agreed on this watermark value.
    Proved(u64),
    /// Entries reported different watermark values for the same region.
    Conflict {
        /// The distinct watermark values seen, in first-seen order.
        watermarks: Vec<u64>,
    },
}
53
54lazy_static! {
55 pub static ref QUERY_STAGE_ELAPSED: HistogramVec = register_histogram_vec!(
57 "greptime_query_stage_elapsed",
58 "query engine time elapsed during each stage",
59 &["stage"],
60 vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
61 )
62 .unwrap();
63 pub static ref PARSE_SQL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
64 .with_label_values(&["parse_sql"]);
65 pub static ref PARSE_PROMQL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
66 .with_label_values(&["parse_promql"]);
67 pub static ref OPTIMIZE_LOGICAL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
68 .with_label_values(&["optimize_logicalplan"]);
69 pub static ref OPTIMIZE_PHYSICAL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
70 .with_label_values(&["optimize_physicalplan"]);
71 pub static ref CREATE_PHYSICAL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
72 .with_label_values(&["create_physicalplan"]);
73 pub static ref EXEC_PLAN_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
74 .with_label_values(&["execute_plan"]);
75 pub static ref MERGE_SCAN_POLL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
76 .with_label_values(&["merge_scan_poll"]);
77
78 pub static ref MERGE_SCAN_REGIONS: Histogram = register_histogram!(
79 "greptime_query_merge_scan_regions",
80 "query merge scan regions"
81 )
82 .unwrap();
83 pub static ref MERGE_SCAN_ERRORS_TOTAL: IntCounter = register_int_counter!(
84 "greptime_query_merge_scan_errors_total",
85 "query merge scan errors total"
86 )
87 .unwrap();
88 pub static ref PUSH_DOWN_FALLBACK_ERRORS_TOTAL: IntCounter = register_int_counter!(
89 "greptime_push_down_fallback_errors_total",
90 "query push down fallback errors total"
91 )
92 .unwrap();
93
94 pub static ref QUERY_MEMORY_POOL_USAGE_BYTES: IntGauge = register_int_gauge!(
95 "greptime_query_memory_pool_usage_bytes",
96 "current query memory pool usage in bytes"
97 )
98 .unwrap();
99
100 pub static ref QUERY_MEMORY_POOL_REJECTED_TOTAL: IntCounter = register_int_counter!(
101 "greptime_query_memory_pool_rejected_total",
102 "total number of query memory allocations rejected"
103 )
104 .unwrap();
105}
106
107pub struct OnDone<F> {
109 stream: SendableRecordBatchStream,
110 callback: Option<F>,
111}
112
113impl<F> OnDone<F> {
114 pub fn new(stream: SendableRecordBatchStream, callback: F) -> Self {
116 Self {
117 stream,
118 callback: Some(callback),
119 }
120 }
121}
122
123impl<F: FnOnce() + Unpin> RecordBatchStream for OnDone<F> {
124 fn name(&self) -> &str {
125 self.stream.name()
126 }
127
128 fn schema(&self) -> SchemaRef {
129 self.stream.schema()
130 }
131
132 fn output_ordering(&self) -> Option<&[OrderOption]> {
133 self.stream.output_ordering()
134 }
135
136 fn metrics(&self) -> Option<RecordBatchMetrics> {
137 self.stream.metrics()
138 }
139}
140
141impl<F: FnOnce() + Unpin> Stream for OnDone<F> {
142 type Item = common_recordbatch::error::Result<RecordBatch>;
143
144 fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
145 match ready!(Pin::new(&mut self.stream).poll_next(cx)) {
146 Some(rb) => Poll::Ready(Some(rb)),
147 None => {
148 if let Some(callback) = self.callback.take() {
149 callback();
150 }
151 Poll::Ready(None)
152 }
153 }
154 }
155
156 fn size_hint(&self) -> (usize, Option<usize>) {
157 self.stream.size_hint()
158 }
159}
160
161pub struct RegionWatermarkMetricsStream {
162 stream: SendableRecordBatchStream,
163 plan: Arc<dyn ExecutionPlan>,
164}
165
166impl RegionWatermarkMetricsStream {
167 pub fn new(stream: SendableRecordBatchStream, plan: Arc<dyn ExecutionPlan>) -> Self {
168 Self { stream, plan }
169 }
170}
171
172impl RecordBatchStream for RegionWatermarkMetricsStream {
173 fn name(&self) -> &str {
174 self.stream.name()
175 }
176
177 fn schema(&self) -> SchemaRef {
178 self.stream.schema()
179 }
180
181 fn output_ordering(&self) -> Option<&[OrderOption]> {
182 self.stream.output_ordering()
183 }
184
185 fn metrics(&self) -> Option<RecordBatchMetrics> {
186 let mut metrics = self.stream.metrics()?;
187 let region_watermarks = collect_region_watermarks(self.plan.clone());
188 if !region_watermarks.is_empty() {
189 metrics.region_watermarks = region_watermarks;
190 }
191 Some(metrics)
192 }
193}
194
195impl Stream for RegionWatermarkMetricsStream {
196 type Item = common_recordbatch::error::Result<RecordBatch>;
197
198 fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
199 Pin::new(&mut self.stream).poll_next(cx)
200 }
201
202 fn size_hint(&self) -> (usize, Option<usize>) {
203 self.stream.size_hint()
204 }
205}
206
207pub fn should_collect_region_watermark_from_query_ctx(query_ctx: &QueryContextRef) -> Result<bool> {
209 Ok(
210 FlowQueryExtensions::parse_flow_extensions(&query_ctx.extensions())?
211 .is_some_and(|extensions| extensions.should_collect_region_watermark()),
212 )
213}
214
215pub fn maybe_attach_region_watermark_metrics(
217 stream: SendableRecordBatchStream,
218 plan: Arc<dyn ExecutionPlan>,
219 should_collect_region_watermark: bool,
220) -> SendableRecordBatchStream {
221 if should_collect_region_watermark {
222 Box::pin(RegionWatermarkMetricsStream::new(stream, plan))
223 } else {
224 stream
225 }
226}
227
228pub fn terminal_recordbatch_metrics_from_plan(
229 plan: Arc<dyn ExecutionPlan>,
230) -> Option<RecordBatchMetrics> {
231 let region_watermarks = collect_region_watermarks(plan);
232 if region_watermarks.is_empty() {
233 None
234 } else {
235 Some(RecordBatchMetrics {
236 region_watermarks,
237 ..Default::default()
238 })
239 }
240}
241
242pub fn terminal_recordbatch_metrics_from_plan_if_requested(
244 plan: Option<Arc<dyn ExecutionPlan>>,
245 should_collect_region_watermark: bool,
246) -> Option<RecordBatchMetrics> {
247 if should_collect_region_watermark {
248 plan.and_then(terminal_recordbatch_metrics_from_plan)
249 } else {
250 None
251 }
252}
253
254fn collect_region_watermarks(plan: Arc<dyn ExecutionPlan>) -> Vec<RegionWatermarkEntry> {
255 let mut merged = BTreeMap::<u64, MergeState>::new();
256 let mut stack = vec![plan];
257
258 while let Some(plan) = stack.pop() {
259 if let Some(merge_scan) = plan.as_any().downcast_ref::<MergeScanExec>() {
260 merge_merge_scan_region_watermarks(
261 &mut merged,
262 merge_scan
263 .regions()
264 .iter()
265 .map(|region_id| region_id.as_u64()),
266 merge_scan.sub_stage_metrics(),
267 );
268 }
269 stack.extend(plan.children().into_iter().cloned());
270 }
271
272 finalize_region_watermarks(merged)
273}
274
275fn merge_region_watermark_entries(
295 merged: &mut BTreeMap<u64, MergeState>,
296 entries: impl IntoIterator<Item = RegionWatermarkEntry>,
297) {
298 for entry in entries {
299 merged
300 .entry(entry.region_id)
301 .and_modify(|existing| match entry.watermark {
302 None => match existing {
303 MergeState::Participated | MergeState::Proved(_) => {
304 *existing = MergeState::Unproved;
305 }
306 MergeState::Unproved | MergeState::Conflict { .. } => {}
307 },
308 Some(seq) => match existing {
309 MergeState::Participated => {
310 *existing = MergeState::Proved(seq);
311 }
312 MergeState::Unproved => {}
313 MergeState::Proved(existing_seq) if *existing_seq == seq => {}
314 MergeState::Proved(existing_seq) => {
315 let old_seq = *existing_seq;
316 *existing = MergeState::Conflict {
317 watermarks: vec![old_seq, seq],
318 };
319 }
320 MergeState::Conflict { watermarks } => {
321 if !watermarks.contains(&seq) {
322 watermarks.push(seq);
323 }
324 }
325 },
326 })
327 .or_insert(match entry.watermark {
328 Some(seq) => MergeState::Proved(seq),
329 None => MergeState::Unproved,
330 });
331 }
332}
333
334fn merge_merge_scan_region_watermarks(
335 merged: &mut BTreeMap<u64, MergeState>,
336 regions: impl IntoIterator<Item = u64>,
337 sub_stage_metrics: impl IntoIterator<Item = RecordBatchMetrics>,
338) {
339 for region_id in regions {
343 merged.entry(region_id).or_insert(MergeState::Participated);
344 }
345
346 for metrics in sub_stage_metrics {
347 merge_region_watermark_entries(merged, metrics.region_watermarks);
348 }
349}
350
351fn finalize_region_watermarks(merged: BTreeMap<u64, MergeState>) -> Vec<RegionWatermarkEntry> {
352 merged
353 .into_iter()
354 .map(|(region_id, state)| RegionWatermarkEntry {
355 region_id,
356 watermark: match state {
357 MergeState::Participated => None,
358 MergeState::Unproved => None,
359 MergeState::Proved(seq) => Some(seq),
360 MergeState::Conflict { watermarks } => {
361 warn!(
362 "Conflicting proved watermarks for region {}: {:?}; degrading to unproved",
363 region_id, watermarks
364 );
365 None
366 }
367 },
368 })
369 .collect()
370}
371
#[cfg(test)]
mod tests {
    use datafusion::arrow::datatypes::Schema as ArrowSchema;
    use datafusion::physical_plan::empty::EmptyExec;

    use super::*;

    /// Turns `(region_id, watermark)` pairs into watermark entries.
    fn watermark_entries(pairs: &[(u64, Option<u64>)]) -> Vec<RegionWatermarkEntry> {
        pairs
            .iter()
            .map(|&(region_id, watermark)| RegionWatermarkEntry {
                region_id,
                watermark,
            })
            .collect()
    }

    /// Builds metrics whose only non-default field is `region_watermarks`.
    fn metrics_with_region_watermarks(entries: &[(u64, Option<u64>)]) -> RecordBatchMetrics {
        RecordBatchMetrics {
            region_watermarks: watermark_entries(entries),
            ..Default::default()
        }
    }

    /// Merges one merge scan's contribution into an empty accumulator and
    /// finalizes the result.
    fn merge_and_finalize(
        regions: &[u64],
        sub_stage_metrics: Vec<RecordBatchMetrics>,
    ) -> Vec<RegionWatermarkEntry> {
        let mut merged = BTreeMap::new();
        merge_merge_scan_region_watermarks(
            &mut merged,
            regions.iter().copied(),
            sub_stage_metrics,
        );
        finalize_region_watermarks(merged)
    }

    #[test]
    fn terminal_metrics_returns_none_without_merge_scan() {
        let schema = Arc::new(ArrowSchema::empty());
        let plan: Arc<dyn ExecutionPlan> = Arc::new(EmptyExec::new(schema));

        assert!(terminal_recordbatch_metrics_from_plan(plan).is_none());
    }

    #[test]
    fn merge_merge_scan_region_watermarks_marks_missing_watermarks_unproved() {
        let finalized = merge_and_finalize(&[1, 2], Vec::new());

        assert_eq!(finalized, watermark_entries(&[(1, None), (2, None)]));
    }

    #[test]
    fn merge_merge_scan_region_watermarks_keeps_matching_proved_values() {
        let finalized = merge_and_finalize(
            &[42],
            vec![
                metrics_with_region_watermarks(&[(42, Some(7))]),
                metrics_with_region_watermarks(&[(42, Some(7))]),
            ],
        );

        assert_eq!(finalized, watermark_entries(&[(42, Some(7))]));
    }

    #[test]
    fn merge_merge_scan_region_watermarks_degrades_conflicting_proved_values() {
        let finalized = merge_and_finalize(
            &[7],
            vec![
                metrics_with_region_watermarks(&[(7, Some(11))]),
                metrics_with_region_watermarks(&[(7, Some(13))]),
            ],
        );

        assert_eq!(finalized, watermark_entries(&[(7, None)]));
    }

    #[test]
    fn merge_merge_scan_region_watermarks_none_vetoes_proved_value() {
        let finalized = merge_and_finalize(
            &[9],
            vec![
                metrics_with_region_watermarks(&[(9, Some(21))]),
                metrics_with_region_watermarks(&[(9, None)]),
            ],
        );

        assert_eq!(finalized, watermark_entries(&[(9, None)]));
    }

    #[test]
    fn merge_merge_scan_region_watermarks_none_vetoes_proved_value_regardless_of_order() {
        let finalized = merge_and_finalize(
            &[9],
            vec![
                metrics_with_region_watermarks(&[(9, None)]),
                metrics_with_region_watermarks(&[(9, Some(21))]),
            ],
        );

        assert_eq!(finalized, watermark_entries(&[(9, None)]));
    }
}