1use std::any::Any;
16use std::sync::{Arc, Mutex};
17use std::time::Duration;
18
19use ahash::{HashMap, HashSet};
20use arrow_schema::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, SortOptions};
21use async_stream::stream;
22use common_catalog::parse_catalog_and_schema_from_db_string;
23use common_plugins::GREPTIME_EXEC_READ_COST;
24use common_query::request::QueryRequest;
25use common_recordbatch::adapter::RecordBatchMetrics;
26use common_telemetry::tracing_context::TracingContext;
27use datafusion::execution::{SessionState, TaskContext};
28use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
29use datafusion::physical_plan::metrics::{
30 Count, ExecutionPlanMetricsSet, Gauge, MetricBuilder, MetricsSet, Time,
31};
32use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
33use datafusion::physical_plan::{
34 DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties,
35 SendableRecordBatchStream,
36};
37use datafusion_common::{Column as ColumnExpr, DataFusionError, Result};
38use datafusion_expr::{Expr, Extension, LogicalPlan, UserDefinedLogicalNodeCore};
39use datafusion_physical_expr::expressions::Column;
40use datafusion_physical_expr::{Distribution, EquivalenceProperties, PhysicalSortExpr};
41use futures_util::StreamExt;
42use greptime_proto::v1::region::RegionRequestHeader;
43use meter_core::data::ReadItem;
44use meter_macros::read_meter;
45use session::context::QueryContextRef;
46use store_api::storage::RegionId;
47use table::table_name::TableName;
48use tokio::time::Instant;
49use tracing::{Instrument, Span};
50
51use crate::dist_plan::analyzer::AliasMapping;
52use crate::dist_plan::analyzer::utils::patch_batch_timezone;
53use crate::metrics::{MERGE_SCAN_ERRORS_TOTAL, MERGE_SCAN_POLL_ELAPSED, MERGE_SCAN_REGIONS};
54use crate::options::FlowQueryExtensions;
55use crate::region_query::RegionQueryHandlerRef;
56
/// Logical plan node, reported under the name `MergeScan`, that wraps another logical plan.
///
/// The derived `Hash` and `PartialOrd` implementations visit the fields in declaration order,
/// so reordering the fields changes their results.
#[derive(Debug, Hash, PartialOrd, PartialEq, Eq, Clone)]
pub struct MergeScanLogicalPlan {
    /// The wrapped plan. It supplies this node's schema and is printed as `remote_input` in the
    /// explain output, but it is not returned from `inputs()`.
    input: LogicalPlan,
    /// Flag printed in the explain output and exposed through `is_placeholder()`.
    /// NOTE(review): its precise meaning is decided by the code that builds this node — confirm
    /// there.
    is_placeholder: bool,
    /// Alias mapping exposed through `partition_cols()`; presumably it maps each partition
    /// column to the aliases it is visible under — verify against `AliasMapping`.
    partition_cols: AliasMapping,
}
65
66impl UserDefinedLogicalNodeCore for MergeScanLogicalPlan {
67 fn name(&self) -> &str {
68 Self::name()
69 }
70
71 fn inputs(&self) -> Vec<&LogicalPlan> {
74 vec![]
75 }
76
77 fn schema(&self) -> &datafusion_common::DFSchemaRef {
78 self.input.schema()
79 }
80
81 fn expressions(&self) -> Vec<datafusion_expr::Expr> {
83 vec![]
84 }
85
86 fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
87 write!(
88 f,
89 "MergeScan [is_placeholder={}, remote_input=[\n{}\n]]",
90 self.is_placeholder, self.input
91 )
92 }
93
94 fn with_exprs_and_inputs(
95 &self,
96 _exprs: Vec<datafusion::prelude::Expr>,
97 _inputs: Vec<LogicalPlan>,
98 ) -> Result<Self> {
99 Ok(self.clone())
100 }
101}
102
103impl MergeScanLogicalPlan {
104 pub fn new(input: LogicalPlan, is_placeholder: bool, partition_cols: AliasMapping) -> Self {
105 Self {
106 input,
107 is_placeholder,
108 partition_cols,
109 }
110 }
111
112 pub fn name() -> &'static str {
113 "MergeScan"
114 }
115
116 pub fn into_logical_plan(self) -> LogicalPlan {
118 LogicalPlan::Extension(Extension {
119 node: Arc::new(self),
120 })
121 }
122
123 pub fn is_placeholder(&self) -> bool {
124 self.is_placeholder
125 }
126
127 pub fn input(&self) -> &LogicalPlan {
128 &self.input
129 }
130
131 pub fn partition_cols(&self) -> &AliasMapping {
132 &self.partition_cols
133 }
134}
135
/// Physical operator that queries a list of regions through a region query handler and streams
/// the returned record batches.
pub struct MergeScanExec {
    /// Table name; within this file it is only shown in the `Debug` output and copied into
    /// derived instances.
    table: TableName,
    /// Regions to query. Output partition `p` handles the regions at indices `p`,
    /// `p + target_partition`, ... of this list.
    regions: Vec<RegionId>,
    /// Logical plan sent with every region request.
    plan: LogicalPlan,
    /// Schema reported by this operator and applied to every outgoing batch.
    arrow_schema: ArrowSchemaRef,
    /// Handler whose `do_get` is called once per region.
    region_query_handler: RegionQueryHandlerRef,
    /// DataFusion metrics set backing the `MergeScanMetric` created for each stream.
    metric: ExecutionPlanMetricsSet,
    /// Plan properties (equivalence properties, partitioning, emission type, boundedness).
    properties: Arc<PlanProperties>,
    /// Most recent metrics reported by each region stream, keyed by region id.
    sub_stage_metrics: Arc<Mutex<HashMap<RegionId, RecordBatchMetrics>>>,
    /// Timing metrics collected per output partition, keyed by partition index.
    partition_metrics: Arc<Mutex<HashMap<usize, PartitionMetrics>>>,
    /// Query context; it provides the channel, the read preference, the verbose-explain flag
    /// and the extensions read by this operator.
    query_ctx: QueryContextRef,
    /// Number of output partitions; also the stride used when assigning regions to partitions.
    target_partition: usize,
    /// Alias mapping used to derive the hash partitioning expressions; presumably keyed by
    /// partition column — verify against `AliasMapping`.
    partition_cols: AliasMapping,
}
152
153impl std::fmt::Debug for MergeScanExec {
154 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
155 f.debug_struct("MergeScanExec")
156 .field("table", &self.table)
157 .field("regions", &self.regions)
158 .field("plan", &self.plan)
159 .finish()
160 }
161}
162
163impl MergeScanExec {
164 #[allow(clippy::too_many_arguments)]
165 pub fn new(
166 session_state: &SessionState,
167 table: TableName,
168 regions: Vec<RegionId>,
169 plan: LogicalPlan,
170 arrow_schema: &ArrowSchema,
171 region_query_handler: RegionQueryHandlerRef,
172 query_ctx: QueryContextRef,
173 target_partition: usize,
174 partition_cols: AliasMapping,
175 ) -> Result<Self> {
176 let arrow_schema = Arc::new(arrow_schema.clone());
180
181 let eq_properties = if let LogicalPlan::Sort(sort) = &plan
189 && target_partition >= regions.len()
190 {
191 let lex_ordering = sort
192 .expr
193 .iter()
194 .map(|sort_expr| {
195 let physical_expr = session_state
196 .create_physical_expr(sort_expr.expr.clone(), plan.schema())?;
197 Ok(PhysicalSortExpr::new(
198 physical_expr,
199 SortOptions {
200 descending: !sort_expr.asc,
201 nulls_first: sort_expr.nulls_first,
202 },
203 ))
204 })
205 .collect::<Result<Vec<_>>>()?;
206 EquivalenceProperties::new_with_orderings(arrow_schema.clone(), vec![lex_ordering])
207 } else {
208 EquivalenceProperties::new(arrow_schema.clone())
209 };
210
211 let partition_exprs = partition_cols
212 .iter()
213 .filter_map(|col| {
214 if let Some(first_alias) = col.1.first() {
215 session_state
216 .create_physical_expr(
217 Expr::Column(ColumnExpr::new_unqualified(
218 first_alias.name().to_string(),
219 )),
220 plan.schema(),
221 )
222 .ok()
223 } else {
224 None
225 }
226 })
227 .collect();
228 let partitioning = Partitioning::Hash(partition_exprs, target_partition);
229
230 let properties = Arc::new(PlanProperties::new(
231 eq_properties,
232 partitioning,
233 EmissionType::Incremental,
234 Boundedness::Bounded,
235 ));
236 Ok(Self {
237 table,
238 regions,
239 plan,
240 arrow_schema,
241 region_query_handler,
242 metric: ExecutionPlanMetricsSet::new(),
243 sub_stage_metrics: Arc::default(),
244 partition_metrics: Arc::default(),
245 properties,
246 query_ctx,
247 target_partition,
248 partition_cols,
249 })
250 }
251
    /// Builds the record batch stream for output partition `partition`.
    ///
    /// The partition handles the regions at indices `partition`, `partition + target_partition`,
    /// `partition + 2 * target_partition`, ... of `self.regions`, one after another. For each
    /// region it sends the logical plan through `region_query_handler.do_get`, forwards every
    /// returned batch after passing its columns through `patch_batch_timezone` together with
    /// `arrow_schema`, and records timing and sub-stage metrics along the way.
    ///
    /// # Errors
    ///
    /// The function itself only returns `Ok`. Failures from `do_get`, from polling a region
    /// stream or from `patch_batch_timezone` are raised with `?` inside the stream body.
    /// NOTE(review): presumably `stream!` turns those into an `Err` item followed by the end of
    /// the stream — confirm against `async_stream`.
    ///
    /// # Panics
    ///
    /// The stream body panics if `target_partition` is zero (`step_by` rejects a zero step) or
    /// if one of the metrics mutexes is poisoned.
    pub fn to_stream(
        &self,
        context: Arc<TaskContext>,
        partition: usize,
    ) -> Result<SendableRecordBatchStream> {
        // Gather everything the stream body uses into locals.
        let regions = self.regions.clone();
        let region_query_handler = self.region_query_handler.clone();
        let metric = MergeScanMetric::new(&self.metric);
        let arrow_schema = self.arrow_schema.clone();
        let query_ctx = self.query_ctx.clone();
        let sub_stage_metrics_moved = self.sub_stage_metrics.clone();
        let partition_metrics_moved = self.partition_metrics.clone();
        let plan = self.plan.clone();
        let target_partition = self.target_partition;
        // NOTE(review): the task id is used as the database name and the session id is parsed
        // as a JSON tracing context — confirm that callers populate them that way.
        let dbname = context.task_id().unwrap_or_default();
        let tracing_context = TracingContext::from_json(context.session_id().as_str());
        let current_channel = self.query_ctx.channel();
        let read_preference = self.query_ctx.read_preference();
        let explain_verbose = self.query_ctx.explain_verbose();

        let stream = Box::pin(stream!({
            // Only partition 0 observes the region count, so it is not observed once per
            // partition stream.
            if partition == 0 {
                MERGE_SCAN_REGIONS.observe(regions.len() as f64);
            }

            // `_finish_timer` stays alive for the whole stream body; presumably it records the
            // elapsed time when dropped — confirm against DataFusion's `Time::timer`.
            let _finish_timer = metric.finish_time().timer();
            let mut ready_timer = metric.ready_time().timer();
            // Wrapped in `Option` so it is stopped exactly once, on the first batch.
            let mut first_consume_timer = Some(metric.first_consume_time().timer());

            // Regions are assigned to partitions by index with stride `target_partition`.
            for region_id in regions
                .iter()
                .skip(partition)
                .step_by(target_partition)
                .copied()
            {
                let region_span = tracing_context.attach(tracing::info_span!(
                    parent: &Span::current(),
                    "merge_scan_region",
                    region_id = %region_id,
                    partition = partition
                ));
                let request = QueryRequest {
                    header: Some(RegionRequestHeader {
                        tracing_context: tracing_context.to_w3c(),
                        dbname: dbname.clone(),
                        query_context: Some(query_ctx.as_ref().into()),
                    }),
                    region_id,
                    plan: plan.clone(),
                };
                // `region_start` feeds `total_cost`; `do_get_start` only covers the `do_get` call.
                let region_start = Instant::now();
                let do_get_start = Instant::now();

                if explain_verbose {
                    common_telemetry::info!(
                        "Merge scan one region, partition: {}, region_id: {}",
                        partition,
                        region_id
                    );
                }

                // Only a failing `do_get` increments the error counter; errors raised later
                // while polling the region stream do not.
                let mut stream = region_query_handler
                    .do_get(read_preference, request)
                    .instrument(region_span.clone())
                    .await
                    .map_err(|e| {
                        MERGE_SCAN_ERRORS_TOTAL.inc();
                        DataFusionError::External(Box::new(e))
                    })?;
                let do_get_cost = do_get_start.elapsed();

                // Called once per region; presumably only the first call has an effect — confirm.
                ready_timer.stop();

                // `poll_duration` sums the time spent waiting in `stream.next()`. The timer is
                // restarted after every `yield`, so time spent by the consumer is not counted.
                let mut poll_duration = Duration::ZERO;
                let mut poll_timer = Instant::now();
                while let Some(batch) = stream.next().instrument(region_span.clone()).await {
                    let poll_elapsed = poll_timer.elapsed();
                    poll_duration += poll_elapsed;

                    let batch = batch.map_err(|e| DataFusionError::External(Box::new(e)))?;
                    let batch = patch_batch_timezone(
                        arrow_schema.clone(),
                        batch.into_df_record_batch().columns().to_vec(),
                    )?;
                    metric.record_output_batch_rows(batch.num_rows());
                    if let Some(mut first_consume_timer) = first_consume_timer.take() {
                        first_consume_timer.stop();
                    }

                    // Keep the latest metrics of this region available while the scan is still
                    // running. The guard is released at the end of this block, before `yield`.
                    if let Some(metrics) = stream.metrics() {
                        let mut sub_stage_metrics = sub_stage_metrics_moved.lock().unwrap();
                        sub_stage_metrics.insert(region_id, metrics);
                    }

                    yield Ok(batch);
                    poll_timer = Instant::now();
                }
                let total_cost = region_start.elapsed();

                let region_metrics = RegionMetrics {
                    region_id,
                    poll_duration,
                    do_get_cost,
                    total_cost,
                };

                // Publish this region's timings right away; the inner scope bounds how long the
                // lock is held.
                {
                    let mut partition_metrics_guard = partition_metrics_moved.lock().unwrap();
                    let partition_metrics = partition_metrics_guard
                        .entry(partition)
                        .or_insert_with(|| PartitionMetrics::new(partition, explain_verbose));
                    partition_metrics.add_region_metrics(region_metrics);
                }

                if explain_verbose {
                    common_telemetry::info!(
                        "Merge scan finish one region, partition: {}, region_id: {}, poll_duration: {:?}, first_consume: {}, do_get_cost: {:?}",
                        partition,
                        region_id,
                        poll_duration,
                        metric.first_consume_time(),
                        do_get_cost
                    );
                }

                // The region stream is drained: derive the read cost from its metrics and store
                // them as the final entry for this region.
                if let Some(metrics) = stream.metrics() {
                    let (c, s) = parse_catalog_and_schema_from_db_string(&dbname);
                    let value = read_meter!(
                        c,
                        s,
                        ReadItem {
                            cpu_time: metrics.elapsed_compute as u64,
                            table_scan: metrics.memory_usage as u64
                        },
                        current_channel as u8
                    );
                    metric.record_greptime_exec_cost(value as usize);

                    let mut sub_stage_metrics = sub_stage_metrics_moved.lock().unwrap();
                    sub_stage_metrics.insert(region_id, metrics);
                }

                MERGE_SCAN_POLL_ELAPSED.observe(poll_duration.as_secs_f64());
            }

            // Mark the partition as finished, which also logs its summary. No entry exists when
            // this partition was not assigned any region.
            {
                let mut partition_metrics_guard = partition_metrics_moved.lock().unwrap();
                if let Some(partition_metrics) = partition_metrics_guard.get_mut(&partition) {
                    partition_metrics.finish();
                }
            }
        }));

        Ok(Box::pin(RecordBatchStreamAdapter::new(
            self.arrow_schema.clone(),
            stream,
        )))
    }
418
419 pub fn try_with_new_distribution(&self, distribution: Distribution) -> Option<Self> {
420 let Distribution::HashPartitioned(hash_exprs) = distribution else {
421 return None;
423 };
424
425 if let Partitioning::Hash(curr_dist, _) = &self.properties.partitioning
426 && curr_dist == &hash_exprs
427 {
428 return None;
430 }
431
432 let all_partition_col_aliases: HashSet<_> = self
433 .partition_cols
434 .values()
435 .flat_map(|aliases| aliases.iter().map(|c| c.name()))
436 .collect();
437 let overlaps: Vec<_> = hash_exprs
438 .iter()
439 .filter(|expr| {
440 expr.as_any()
441 .downcast_ref::<Column>()
442 .is_some_and(|col_expr| all_partition_col_aliases.contains(col_expr.name()))
443 })
444 .cloned()
445 .collect();
446
447 if overlaps.is_empty() {
448 return None;
449 }
450
451 Some(Self {
452 table: self.table.clone(),
453 regions: self.regions.clone(),
454 plan: self.plan.clone(),
455 arrow_schema: self.arrow_schema.clone(),
456 region_query_handler: self.region_query_handler.clone(),
457 metric: self.metric.clone(),
458 properties: Arc::new(PlanProperties::new(
459 self.properties.eq_properties.clone(),
460 Partitioning::Hash(overlaps, self.target_partition),
461 self.properties.emission_type,
462 self.properties.boundedness,
463 )),
464 sub_stage_metrics: self.sub_stage_metrics.clone(),
465 partition_metrics: self.partition_metrics.clone(),
466 query_ctx: self.query_ctx.clone(),
467 target_partition: self.target_partition,
468 partition_cols: self.partition_cols.clone(),
469 })
470 }
471
472 pub fn sub_stage_metrics(&self) -> Vec<RecordBatchMetrics> {
473 self.sub_stage_metrics
474 .lock()
475 .unwrap()
476 .values()
477 .cloned()
478 .collect()
479 }
480
481 pub fn regions(&self) -> &[RegionId] {
482 &self.regions
483 }
484
485 pub fn is_flow_sink_scan(&self) -> bool {
486 let Some(sink_table_id) =
487 FlowQueryExtensions::parse_flow_extensions(&self.query_ctx.extensions())
488 .ok()
489 .flatten()
490 .and_then(|extensions| extensions.sink_table_id)
491 else {
492 return false;
493 };
494
495 !self.regions.is_empty()
496 && self
497 .regions
498 .iter()
499 .all(|region_id| region_id.table_id() == sink_table_id)
500 }
501
    /// Returns the number of output partitions this operator was created with
    /// (`target_partition`).
    pub fn partition_count(&self) -> usize {
        self.target_partition
    }
505
    /// Returns how many regions this operator queries.
    pub fn region_count(&self) -> usize {
        self.regions.len()
    }
509
510 fn partition_metrics(&self) -> Vec<PartitionMetrics> {
511 self.partition_metrics
512 .lock()
513 .unwrap()
514 .values()
515 .cloned()
516 .collect()
517 }
518}
519
/// Timings recorded for one region scanned by one partition of a `MergeScanExec`.
#[derive(Debug, Clone)]
struct RegionMetrics {
    /// Region these timings belong to.
    region_id: RegionId,
    /// Accumulated time spent waiting for the region stream to produce its next item.
    poll_duration: Duration,
    /// Time taken by the `do_get` call that opened the region stream.
    do_get_cost: Duration,
    /// Time from just before the request was issued until the region stream was drained.
    total_cost: Duration,
}
529
/// Timings collected by one output partition of a `MergeScanExec`.
///
/// The type logs its summary on drop unless `finished` is set, and that also applies to clones.
#[derive(Debug, Clone)]
struct PartitionMetrics {
    /// Index of the output partition.
    partition: usize,
    /// Metrics of every region added so far, in the order they were added.
    region_metrics: Vec<RegionMetrics>,
    /// Sum of `poll_duration` over all added regions.
    total_poll_duration: Duration,
    /// Sum of `do_get_cost` over all added regions.
    total_do_get_cost: Duration,
    /// Number of regions added so far.
    total_regions: usize,
    /// Selects the log level of the summary: `info` when set, `debug` otherwise.
    explain_verbose: bool,
    /// Set by `finish()`; suppresses a second summary log, including the one on drop.
    finished: bool,
}
541
542impl PartitionMetrics {
543 fn new(partition: usize, explain_verbose: bool) -> Self {
544 Self {
545 partition,
546 region_metrics: Vec::new(),
547 total_poll_duration: Duration::ZERO,
548 total_do_get_cost: Duration::ZERO,
549 total_regions: 0,
550 explain_verbose,
551 finished: false,
552 }
553 }
554
555 fn add_region_metrics(&mut self, region_metrics: RegionMetrics) {
556 self.total_poll_duration += region_metrics.poll_duration;
557 self.total_do_get_cost += region_metrics.do_get_cost;
558 self.total_regions += 1;
559 self.region_metrics.push(region_metrics);
560 }
561
562 fn finish(&mut self) {
564 if self.finished {
565 return;
566 }
567 self.finished = true;
568 self.log_metrics();
569 }
570
571 fn log_metrics(&self) {
573 if self.explain_verbose {
574 common_telemetry::info!(
575 "MergeScan partition {} finished: {} regions, total_poll_duration: {:?}, total_do_get_cost: {:?}",
576 self.partition,
577 self.total_regions,
578 self.total_poll_duration,
579 self.total_do_get_cost
580 );
581 } else {
582 common_telemetry::debug!(
583 "MergeScan partition {} finished: {} regions, total_poll_duration: {:?}, total_do_get_cost: {:?}",
584 self.partition,
585 self.total_regions,
586 self.total_poll_duration,
587 self.total_do_get_cost
588 );
589 }
590 }
591}
592
593impl Drop for PartitionMetrics {
594 fn drop(&mut self) {
595 if !self.finished {
596 self.log_metrics();
597 }
598 }
599}
600
601impl ExecutionPlan for MergeScanExec {
602 fn as_any(&self) -> &dyn Any {
603 self
604 }
605
606 fn schema(&self) -> ArrowSchemaRef {
607 self.arrow_schema.clone()
608 }
609
610 fn properties(&self) -> &Arc<PlanProperties> {
611 &self.properties
612 }
613
614 fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
615 vec![]
616 }
617
618 fn with_new_children(
621 self: Arc<Self>,
622 _children: Vec<Arc<dyn ExecutionPlan>>,
623 ) -> Result<Arc<dyn ExecutionPlan>> {
624 Ok(self.clone())
625 }
626
627 fn execute(
628 &self,
629 partition: usize,
630 context: Arc<TaskContext>,
631 ) -> Result<SendableRecordBatchStream> {
632 self.to_stream(context, partition)
633 }
634
635 fn metrics(&self) -> Option<MetricsSet> {
636 Some(self.metric.clone_inner())
637 }
638
639 fn name(&self) -> &str {
640 "MergeScanExec"
641 }
642}
643
644impl DisplayAs for MergeScanExec {
645 fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result {
646 write!(f, "MergeScanExec: peers=[")?;
647 for region_id in self.regions.iter() {
648 write!(f, "{}, ", region_id)?;
649 }
650 write!(f, "]")?;
651
652 if matches!(t, DisplayFormatType::Verbose) {
653 let partition_metrics = self.partition_metrics();
654 if !partition_metrics.is_empty() {
655 write!(f, ", metrics={{")?;
656 for (i, pm) in partition_metrics.iter().enumerate() {
657 if i > 0 {
658 write!(f, ", ")?;
659 }
660 write!(
661 f,
662 "\"partition_{}\":{{\"regions\":{},\"total_poll_duration\":\"{:?}\",\"total_do_get_cost\":\"{:?}\",\"region_metrics\":[",
663 pm.partition,
664 pm.total_regions,
665 pm.total_poll_duration,
666 pm.total_do_get_cost
667 )?;
668 for (j, rm) in pm.region_metrics.iter().enumerate() {
669 if j > 0 {
670 write!(f, ",")?;
671 }
672 write!(
673 f,
674 "{{\"region_id\":\"{}\",\"poll_duration\":\"{:?}\",\"do_get_cost\":\"{:?}\",\"total_cost\":\"{:?}\"}}",
675 rm.region_id, rm.poll_duration, rm.do_get_cost, rm.total_cost
676 )?;
677 }
678 write!(f, "]}}")?;
679 }
680 write!(f, "}}")?;
681 }
682 }
683
684 Ok(())
685 }
686}
687
/// Handles to the DataFusion metrics updated by one `MergeScanExec` stream.
#[derive(Debug, Clone)]
struct MergeScanMetric {
    /// Timer started when the stream body starts and stopped once a `do_get` call has returned.
    ready_time: Time,
    /// Timer started when the stream body starts and stopped when the first batch is handled.
    first_consume_time: Time,
    /// Timer whose guard is held for the whole stream body.
    finish_time: Time,
    /// Number of rows in the batches emitted by the stream.
    output_rows: Count,

    /// Gauge that accumulates the read cost computed from each drained region's metrics.
    greptime_exec_cost: Gauge,
}
702
703impl MergeScanMetric {
704 pub fn new(metric: &ExecutionPlanMetricsSet) -> Self {
705 Self {
706 ready_time: MetricBuilder::new(metric).subset_time("ready_time", 1),
707 first_consume_time: MetricBuilder::new(metric).subset_time("first_consume_time", 1),
708 finish_time: MetricBuilder::new(metric).subset_time("finish_time", 1),
709 output_rows: MetricBuilder::new(metric).output_rows(1),
710 greptime_exec_cost: MetricBuilder::new(metric).gauge(GREPTIME_EXEC_READ_COST, 1),
711 }
712 }
713
714 pub fn ready_time(&self) -> &Time {
715 &self.ready_time
716 }
717
718 pub fn first_consume_time(&self) -> &Time {
719 &self.first_consume_time
720 }
721
722 pub fn finish_time(&self) -> &Time {
723 &self.finish_time
724 }
725
726 pub fn record_output_batch_rows(&self, num_rows: usize) {
727 self.output_rows.add(num_rows);
728 }
729
730 pub fn record_greptime_exec_cost(&self, metrics: usize) {
731 self.greptime_exec_cost.add(metrics);
732 }
733}