1use std::sync::Arc;
16
17use arrow_schema::DataType;
18use datafusion::physical_optimizer::PhysicalOptimizerRule;
19use datafusion::physical_plan::ExecutionPlan;
20use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;
21use datafusion::physical_plan::coop::CooperativeExec;
22use datafusion::physical_plan::filter::FilterExec;
23use datafusion::physical_plan::projection::ProjectionExec;
24use datafusion::physical_plan::repartition::RepartitionExec;
25use datafusion::physical_plan::sorts::sort::SortExec;
26use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
27use datafusion_common::Result as DataFusionResult;
28use datafusion_common::tree_node::{Transformed, TreeNode};
29use datafusion_physical_expr::expressions::{CastExpr, Column as PhysicalColumn};
30use datafusion_physical_expr::{PhysicalExpr, ScalarFunctionExpr};
31use store_api::region_engine::PartitionRange;
32use table::table::scan::RegionScanExec;
33
34use crate::part_sort::PartSortExec;
35use crate::window_sort::WindowedSortExec;
36
/// Physical optimizer rule that rewrites an eligible `SortExec` on the time
/// index column into a windowed sort (`PartSortExec` + `WindowedSortExec`)
/// driven by the partition ranges reported by the region scanner.
#[derive(Debug)]
pub struct WindowedSortPhysicalRule;
48
impl PhysicalOptimizerRule for WindowedSortPhysicalRule {
    /// Entry point of the rule; delegates to [`WindowedSortPhysicalRule::do_optimize`].
    fn optimize(
        &self,
        plan: Arc<dyn ExecutionPlan>,
        config: &datafusion::config::ConfigOptions,
    ) -> DataFusionResult<Arc<dyn ExecutionPlan>> {
        Self::do_optimize(plan, config)
    }

    fn name(&self) -> &str {
        "WindowedSortRule"
    }

    // Skip the optimizer's schema re-validation for this rule.
    // NOTE(review): presumably the rewrite preserves the plan's output
    // schema, making the check unnecessary — confirm against the rewrite.
    fn schema_check(&self) -> bool {
        false
    }
}
66
impl WindowedSortPhysicalRule {
    /// Walks the plan top-down and rewrites the first eligible `SortExec`
    /// (single sort key that resolves to the time index, no fetch limit)
    /// into `PartSortExec` + `WindowedSortExec`.
    fn do_optimize(
        plan: Arc<dyn ExecutionPlan>,
        _config: &datafusion::config::ConfigOptions,
    ) -> DataFusionResult<Arc<dyn ExecutionPlan>> {
        let result = plan
            .transform_down(|plan| {
                if let Some(sort_exec) = plan.as_any().downcast_ref::<SortExec>() {
                    // Only single-column sorts are candidates for the
                    // time-window rewrite.
                    if sort_exec.expr().len() != 1 {
                        return Ok(Transformed::no(plan));
                    }

                    let preserve_partitioning = sort_exec.preserve_partitioning();

                    // Strip a Filter -> Repartition -> Scan repartition first so
                    // the scanner's partition ranges stay usable below the sort.
                    let sort_input = remove_repartition(sort_exec.input().clone())?.data;

                    // Without scanner info (partition ranges + tag columns) the
                    // rewrite cannot be applied.
                    let Some(scanner_info) = fetch_partition_range(sort_input.clone())? else {
                        return Ok(Transformed::no(plan));
                    };
                    let input_schema = sort_input.schema();

                    let first_sort_expr = sort_exec.expr().first();
                    // The sort key must be a timestamp column that resolves to
                    // the time index (possibly through aliases/casts/wrappers),
                    // and the sort must not carry a fetch limit.
                    if let Some(column_expr) = first_sort_expr
                        .expr
                        .as_any()
                        .downcast_ref::<PhysicalColumn>()
                        && matches!(
                            input_schema.field(column_expr.index()).data_type(),
                            DataType::Timestamp(_, _)
                        )
                        && is_time_index_expr(&sort_input, &first_sort_expr.expr)?
                        && sort_exec.fetch().is_none()
                    {
                        // Eligible: fall through to build the windowed sort.
                    } else {
                        return Ok(Transformed::no(plan));
                    }

                    // When the scan has no tag columns and the requested order
                    // is ascending, the per-range PartSort step is skipped.
                    // NOTE(review): presumably each partition range is already
                    // ascending-sorted by time in that case — confirm against
                    // the scanner's contract.
                    let new_input = if scanner_info.tag_columns.is_empty()
                        && !first_sort_expr.options.descending
                    {
                        sort_input
                    } else {
                        Arc::new(PartSortExec::try_new(
                            first_sort_expr.clone(),
                            // fetch() is always None here (guarded above);
                            // passed through for uniformity.
                            sort_exec.fetch(),
                            scanner_info.partition_ranges.clone(),
                            sort_input,
                        )?)
                    };

                    let windowed_sort_exec = WindowedSortExec::try_new(
                        first_sort_expr.clone(),
                        sort_exec.fetch(),
                        scanner_info.partition_ranges,
                        new_input,
                    )?;

                    if !preserve_partitioning {
                        // The original sort produced a single output stream;
                        // merge the windowed sort's partitions to match.
                        let order_preserving_merge = SortPreservingMergeExec::new(
                            sort_exec.expr().clone(),
                            Arc::new(windowed_sort_exec),
                        );
                        return Ok(Transformed {
                            data: Arc::new(order_preserving_merge),
                            transformed: true,
                            // Stop recursing: the subtree has been rewritten.
                            tnr: datafusion_common::tree_node::TreeNodeRecursion::Stop,
                        });
                    } else {
                        return Ok(Transformed {
                            data: Arc::new(windowed_sort_exec),
                            transformed: true,
                            tnr: datafusion_common::tree_node::TreeNodeRecursion::Stop,
                        });
                    }
                }

                Ok(Transformed::no(plan))
            })?
            .data;

        Ok(result)
    }
}
156
/// Information collected from the `RegionScanExec` beneath a sort, used to
/// decide whether and how the sort can become a windowed sort.
#[derive(Debug)]
struct ScannerInfo {
    /// Per-output-partition lists of time ranges reported by the scanner.
    partition_ranges: Vec<Vec<PartitionRange>>,
    /// Tag column names of the scanned region.
    tag_columns: Vec<String>,
}
162
163fn fetch_partition_range(input: Arc<dyn ExecutionPlan>) -> DataFusionResult<Option<ScannerInfo>> {
164 let mut partition_ranges = None;
165 let mut tag_columns = None;
166
167 input.transform_up(|plan| {
168 if plan.as_any().is::<CooperativeExec>() {
169 return Ok(Transformed::no(plan));
170 }
171
172 if plan.as_any().is::<RepartitionExec>()
174 || plan.as_any().is::<CoalescePartitionsExec>()
175 || plan.as_any().is::<SortExec>()
176 || plan.as_any().is::<WindowedSortExec>()
177 {
178 partition_ranges = None;
179 }
180
181 if !(plan.as_any().is::<ProjectionExec>() || plan.as_any().is::<FilterExec>()) {
184 partition_ranges = None;
185 }
186
187 if let Some(region_scan_exec) = plan.as_any().downcast_ref::<RegionScanExec>() {
188 if region_scan_exec.distribution()
190 == Some(store_api::storage::TimeSeriesDistribution::PerSeries)
191 {
192 partition_ranges = None;
193 return Ok(Transformed::no(plan));
194 }
195
196 partition_ranges = Some(region_scan_exec.get_uncollapsed_partition_ranges());
197 tag_columns = Some(region_scan_exec.tag_columns());
198
199 region_scan_exec.with_distinguish_partition_range(true);
200 }
201
202 Ok(Transformed::no(plan))
203 })?;
204
205 let result = try {
206 ScannerInfo {
207 partition_ranges: partition_ranges?,
208 tag_columns: tag_columns?,
209 }
210 };
211
212 Ok(result)
213}
214
215fn is_time_index_expr(
216 plan: &Arc<dyn ExecutionPlan>,
217 expr: &Arc<dyn PhysicalExpr>,
218) -> DataFusionResult<bool> {
219 if let Some(column_expr) = expr.as_any().downcast_ref::<PhysicalColumn>() {
220 return is_time_index_column(plan, column_expr);
221 }
222
223 if let Some(cast_expr) = expr.as_any().downcast_ref::<CastExpr>() {
224 return if matches!(cast_expr.cast_type(), DataType::Timestamp(_, _)) {
225 is_time_index_expr(plan, cast_expr.expr())
226 } else {
227 Ok(false)
228 };
229 }
230
231 if let Some(scalar_function_expr) = expr.as_any().downcast_ref::<ScalarFunctionExpr>() {
232 return if is_supported_time_index_wrapper(scalar_function_expr)
233 && scalar_function_expr.args().len() == 1
234 {
235 is_time_index_expr(plan, &scalar_function_expr.args()[0])
236 } else {
237 Ok(false)
238 };
239 }
240
241 Ok(false)
242}
243
244fn is_time_index_column(
245 plan: &Arc<dyn ExecutionPlan>,
246 column_expr: &PhysicalColumn,
247) -> DataFusionResult<bool> {
248 if let Some(projection) = plan.as_any().downcast_ref::<ProjectionExec>() {
249 let Some(projection_expr) = projection.expr().get(column_expr.index()) else {
250 return Ok(false);
251 };
252 return is_time_index_expr(projection.input(), &projection_expr.expr);
253 }
254
255 if let Some(filter) = plan.as_any().downcast_ref::<FilterExec>() {
256 let child_column_expr = filter
257 .projection()
258 .as_ref()
259 .and_then(|projection| projection.get(column_expr.index()).copied())
260 .map(|input_index| {
261 PhysicalColumn::new(
262 filter.input().schema().field(input_index).name(),
263 input_index,
264 )
265 })
266 .unwrap_or_else(|| column_expr.clone());
267 let child_expr = Arc::new(child_column_expr) as Arc<dyn PhysicalExpr>;
268 return is_time_index_expr(filter.input(), &child_expr);
269 }
270
271 if let Some(region_scan_exec) = plan.as_any().downcast_ref::<RegionScanExec>() {
272 let schema = plan.schema();
273 let column_field = schema.field(column_expr.index());
274 return Ok(
275 matches!(column_field.data_type(), DataType::Timestamp(_, _))
276 && *column_field.name() == region_scan_exec.time_index(),
277 );
278 }
279
280 let Some(child) = passthrough_child(plan.as_ref()) else {
281 return Ok(false);
282 };
283 let child_expr = Arc::new(column_expr.clone()) as Arc<dyn PhysicalExpr>;
284 is_time_index_expr(&child, &child_expr)
285}
286
287fn passthrough_child(plan: &dyn ExecutionPlan) -> Option<Arc<dyn ExecutionPlan>> {
288 if plan.as_any().is::<CoalescePartitionsExec>()
289 || plan.as_any().is::<RepartitionExec>()
290 || plan.as_any().is::<CooperativeExec>()
291 {
292 return schema_preserving_child(plan);
293 }
294
295 None
296}
297
298fn schema_preserving_child(plan: &dyn ExecutionPlan) -> Option<Arc<dyn ExecutionPlan>> {
299 let child = plan.children().first().cloned().cloned()?;
300 (plan.schema().as_ref() == child.schema().as_ref()).then_some(child)
301}
302
303fn is_supported_time_index_wrapper(expr: &ScalarFunctionExpr) -> bool {
304 (expr.name().eq_ignore_ascii_case("to_timestamp")
305 || expr.name().eq_ignore_ascii_case("to_timestamp_seconds")
306 || expr.name().eq_ignore_ascii_case("to_timestamp_millis")
307 || expr.name().eq_ignore_ascii_case("to_timestamp_micros")
308 || expr.name().eq_ignore_ascii_case("to_timestamp_nanos"))
309 && matches!(expr.return_type(), DataType::Timestamp(_, _))
310}
311
312fn remove_repartition(
314 plan: Arc<dyn ExecutionPlan>,
315) -> DataFusionResult<Transformed<Arc<dyn ExecutionPlan>>> {
316 plan.transform_down(|plan| {
317 if plan.as_any().is::<FilterExec>() {
318 let maybe_repartition = plan.children()[0];
320 if maybe_repartition.as_any().is::<RepartitionExec>() {
321 let maybe_scan = maybe_repartition.children()[0];
322 if maybe_scan.as_any().is::<RegionScanExec>() {
323 let new_filter = plan.clone().with_new_children(vec![maybe_scan.clone()])?;
324 return Ok(Transformed::yes(new_filter));
325 }
326 }
327 }
328
329 Ok(Transformed::no(plan))
330 })
331}
332
#[cfg(test)]
mod test {
    use std::sync::Arc;

    use api::v1::SemanticType;
    use arrow_schema::{Field, TimeUnit};
    use common_recordbatch::RecordBatches;
    use datafusion::config::ConfigOptions;
    use datafusion::physical_plan::filter::FilterExecBuilder;
    use datafusion_common::ScalarValue;
    use datafusion_functions::datetime::to_timestamp_millis;
    use datafusion_physical_expr::expressions::{CastExpr, Literal};
    use datatypes::data_type::ConcreteDataType;
    use datatypes::schema::{ColumnSchema, Schema};
    use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder};
    use store_api::region_engine::SinglePartitionScanner;
    use store_api::storage::{RegionId, ScanRequest};

    use super::*;

    // A projection that merely renames the time index ("ts" -> "alias_ts")
    // must still be recognized as the time index.
    #[test]
    fn test_is_time_index_expr_tracks_aliases_through_projection() {
        let scan = new_region_scan();
        let projection = Arc::new(
            ProjectionExec::try_new(
                vec![(
                    Arc::new(PhysicalColumn::new("ts", 1)) as Arc<dyn PhysicalExpr>,
                    "alias_ts".to_string(),
                )],
                scan,
            )
            .unwrap(),
        ) as Arc<dyn ExecutionPlan>;
        let expr = Arc::new(PhysicalColumn::new("alias_ts", 0)) as Arc<dyn PhysicalExpr>;

        assert!(is_time_index_expr(&projection, &expr).unwrap());
    }

    // Alias tracking must work through a stack of projections
    // ("ts" -> "alias_1" -> "alias_2").
    #[test]
    fn test_is_time_index_expr_tracks_multi_level_aliases() {
        let scan = new_region_scan();
        let first_projection = Arc::new(
            ProjectionExec::try_new(
                vec![(
                    Arc::new(PhysicalColumn::new("ts", 1)) as Arc<dyn PhysicalExpr>,
                    "alias_1".to_string(),
                )],
                scan,
            )
            .unwrap(),
        ) as Arc<dyn ExecutionPlan>;
        let second_projection = Arc::new(
            ProjectionExec::try_new(
                vec![(
                    Arc::new(PhysicalColumn::new("alias_1", 0)) as Arc<dyn PhysicalExpr>,
                    "alias_2".to_string(),
                )],
                first_projection,
            )
            .unwrap(),
        ) as Arc<dyn ExecutionPlan>;
        let expr = Arc::new(PhysicalColumn::new("alias_2", 0)) as Arc<dyn PhysicalExpr>;

        assert!(is_time_index_expr(&second_projection, &expr).unwrap());
    }

    // A whitelisted wrapper function (to_timestamp_millis) around the time
    // index must still be recognized through a projection.
    #[test]
    fn test_is_time_index_expr_tracks_wrapped_aliases_through_projection() {
        let scan = new_region_scan();
        let config = Arc::new(ConfigOptions::default());
        let return_field = Arc::new(Field::new(
            "ts",
            DataType::Timestamp(TimeUnit::Millisecond, None),
            false,
        ));
        let projection = Arc::new(
            ProjectionExec::try_new(
                vec![(
                    Arc::new(ScalarFunctionExpr::new(
                        "to_timestamp_millis",
                        to_timestamp_millis(config.as_ref()),
                        vec![Arc::new(PhysicalColumn::new("ts", 1))],
                        return_field,
                        config,
                    )) as Arc<dyn PhysicalExpr>,
                    "ts".to_string(),
                )],
                scan,
            )
            .unwrap(),
        ) as Arc<dyn ExecutionPlan>;
        let expr = Arc::new(PhysicalColumn::new("ts", 0)) as Arc<dyn PhysicalExpr>;

        assert!(is_time_index_expr(&projection, &expr).unwrap());
    }

    // A cast to a timestamp type must not hide the time index.
    #[test]
    fn test_is_time_index_expr_tracks_cast_aliases_through_projection() {
        let scan = new_region_scan();
        let projection = Arc::new(
            ProjectionExec::try_new(
                vec![(
                    Arc::new(CastExpr::new(
                        Arc::new(PhysicalColumn::new("ts", 1)),
                        DataType::Timestamp(TimeUnit::Millisecond, None),
                        None,
                    )) as Arc<dyn PhysicalExpr>,
                    "ts_ms".to_string(),
                )],
                scan,
            )
            .unwrap(),
        ) as Arc<dyn ExecutionPlan>;
        let expr = Arc::new(PhysicalColumn::new("ts_ms", 0)) as Arc<dyn PhysicalExpr>;

        assert!(is_time_index_expr(&projection, &expr).unwrap());
    }

    // A non-whitelisted function (date_trunc) must NOT be treated as a
    // transparent time-index wrapper, even if it returns a timestamp.
    #[test]
    fn test_is_time_index_expr_rejects_unsupported_wrappers() {
        let scan = new_region_scan();
        let config = Arc::new(ConfigOptions::default());
        let return_field = Arc::new(Field::new(
            "ts",
            DataType::Timestamp(TimeUnit::Millisecond, None),
            false,
        ));
        let projection = Arc::new(
            ProjectionExec::try_new(
                vec![(
                    Arc::new(ScalarFunctionExpr::new(
                        "date_trunc",
                        to_timestamp_millis(config.as_ref()),
                        vec![Arc::new(PhysicalColumn::new("ts", 1))],
                        return_field,
                        config,
                    )) as Arc<dyn PhysicalExpr>,
                    "ts".to_string(),
                )],
                scan,
            )
            .unwrap(),
        ) as Arc<dyn ExecutionPlan>;
        let expr = Arc::new(PhysicalColumn::new("ts", 0)) as Arc<dyn PhysicalExpr>;

        assert!(!is_time_index_expr(&projection, &expr).unwrap());
    }

    // Wrapper-function matching is case-insensitive.
    #[test]
    fn test_is_supported_time_index_wrapper_ignores_function_name_case() {
        let config = Arc::new(ConfigOptions::default());
        let return_field = Arc::new(Field::new(
            "ts",
            DataType::Timestamp(TimeUnit::Millisecond, None),
            false,
        ));
        let expr = ScalarFunctionExpr::new(
            "To_Timestamp_Millis",
            to_timestamp_millis(config.as_ref()),
            vec![Arc::new(PhysicalColumn::new("ts", 1))],
            return_field,
            config,
        );

        assert!(is_supported_time_index_wrapper(&expr));
    }

    // A cast to a timestamp type is accepted; a cast to Int64 is rejected.
    #[test]
    fn test_is_time_index_expr_rejects_non_timestamp_casts() {
        let scan = new_region_scan();
        let cast_expr = Arc::new(CastExpr::new(
            Arc::new(PhysicalColumn::new("ts", 1)),
            DataType::Timestamp(TimeUnit::Millisecond, None),
            None,
        )) as Arc<dyn PhysicalExpr>;
        assert!(is_time_index_expr(&scan, &cast_expr).unwrap());

        let non_timestamp_cast = Arc::new(CastExpr::new(
            Arc::new(PhysicalColumn::new("ts", 1)),
            DataType::Int64,
            None,
        )) as Arc<dyn PhysicalExpr>;
        assert!(!is_time_index_expr(&scan, &non_timestamp_cast).unwrap());
    }

    // A filter without an embedded projection keeps column indices intact.
    #[test]
    fn test_is_time_index_expr_tracks_time_index_through_filter() {
        let scan = new_region_scan();
        let filter = Arc::new(
            FilterExec::try_new(
                Arc::new(Literal::new(ScalarValue::Boolean(Some(true)))),
                scan,
            )
            .unwrap(),
        ) as Arc<dyn ExecutionPlan>;
        let expr = Arc::new(PhysicalColumn::new("ts", 1)) as Arc<dyn PhysicalExpr>;

        assert!(is_time_index_expr(&filter, &expr).unwrap());
    }

    // A filter with an embedded projection ([1] keeps only "ts") remaps
    // column index 0 back to input index 1; the CooperativeExec wrapper is
    // traversed via passthrough_child.
    #[test]
    fn test_is_time_index_expr_tracks_time_index_through_passthrough_wrapper_and_filter_projection()
    {
        let scan = new_region_scan();
        let projected_filter = Arc::new(
            FilterExecBuilder::new(
                Arc::new(Literal::new(ScalarValue::Boolean(Some(true)))),
                scan,
            )
            .apply_projection(Some(vec![1]))
            .unwrap()
            .build()
            .unwrap(),
        ) as Arc<dyn ExecutionPlan>;
        let cooperative =
            Arc::new(CooperativeExec::new(projected_filter)) as Arc<dyn ExecutionPlan>;
        let expr = Arc::new(PhysicalColumn::new("ts", 0)) as Arc<dyn PhysicalExpr>;

        assert!(is_time_index_expr(&cooperative, &expr).unwrap());
    }

    // A projection that narrows the schema (drops "value") is not a
    // schema-preserving child and must be rejected.
    #[test]
    fn test_schema_preserving_child_rejects_schema_changing_projection() {
        let scan = new_region_scan();
        let projection = ProjectionExec::try_new(
            vec![(
                Arc::new(PhysicalColumn::new("ts", 1)) as Arc<dyn PhysicalExpr>,
                "ts".to_string(),
            )],
            scan,
        )
        .unwrap();

        assert!(schema_preserving_child(&projection).is_none());
    }

    // The three wrapper node kinds accepted by passthrough_child must each
    // preserve their child's schema.
    #[test]
    fn test_cooperative_exec_satisfies_passthrough_schema_contract() {
        let child = new_region_scan();
        let plan = Arc::new(CooperativeExec::new(child.clone())) as Arc<dyn ExecutionPlan>;

        assert_passthrough_schema_contract(plan, child);
    }

    #[test]
    fn test_repartition_exec_satisfies_passthrough_schema_contract() {
        let child = new_region_scan();
        let plan = Arc::new(
            RepartitionExec::try_new(
                child.clone(),
                datafusion_physical_expr::Partitioning::RoundRobinBatch(2),
            )
            .unwrap(),
        ) as Arc<dyn ExecutionPlan>;

        assert_passthrough_schema_contract(plan, child);
    }

    #[test]
    fn test_coalesce_partitions_exec_satisfies_passthrough_schema_contract() {
        let child = new_region_scan();
        let plan = Arc::new(CoalescePartitionsExec::new(child.clone())) as Arc<dyn ExecutionPlan>;

        assert_passthrough_schema_contract(plan, child);
    }

    // Asserts that `plan` exposes the same schema as `child` and that
    // passthrough_child yields a schema-identical node.
    fn assert_passthrough_schema_contract(
        plan: Arc<dyn ExecutionPlan>,
        child: Arc<dyn ExecutionPlan>,
    ) {
        assert_eq!(plan.schema().as_ref(), child.schema().as_ref());

        let passthrough = passthrough_child(plan.as_ref()).expect("wrapper should preserve schema");
        assert_eq!(passthrough.schema().as_ref(), child.schema().as_ref());
    }

    // Builds a single-partition RegionScanExec over an empty stream with two
    // columns: "value" (Int32 field, index 0) and "ts" (nanosecond timestamp
    // time index, index 1).
    fn new_region_scan() -> Arc<dyn ExecutionPlan> {
        let schema = Arc::new(Schema::new(vec![
            ColumnSchema::new("value", ConcreteDataType::int32_datatype(), false),
            ColumnSchema::new(
                "ts",
                ConcreteDataType::timestamp_nanosecond_datatype(),
                false,
            ),
        ]));
        let recordbatches = RecordBatches::try_new(schema.clone(), vec![]).unwrap();
        let stream = recordbatches.as_stream();

        let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
        builder
            .push_column_metadata(ColumnMetadata {
                column_schema: ColumnSchema::new(
                    "value",
                    ConcreteDataType::int32_datatype(),
                    false,
                ),
                semantic_type: SemanticType::Field,
                column_id: 1,
            })
            .push_column_metadata(ColumnMetadata {
                column_schema: ColumnSchema::new(
                    "ts",
                    ConcreteDataType::timestamp_nanosecond_datatype(),
                    false,
                ),
                semantic_type: SemanticType::Timestamp,
                column_id: 2,
            });

        let scanner = Box::new(SinglePartitionScanner::new(
            stream,
            false,
            Arc::new(builder.build().unwrap()),
            None,
        ));
        Arc::new(RegionScanExec::new(scanner, ScanRequest::default(), None).unwrap())
    }
}