// query/optimizer/windowed_sort.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::sync::Arc;
16
17use arrow_schema::DataType;
18use datafusion::physical_optimizer::PhysicalOptimizerRule;
19use datafusion::physical_plan::ExecutionPlan;
20use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;
21use datafusion::physical_plan::coop::CooperativeExec;
22use datafusion::physical_plan::filter::FilterExec;
23use datafusion::physical_plan::projection::ProjectionExec;
24use datafusion::physical_plan::repartition::RepartitionExec;
25use datafusion::physical_plan::sorts::sort::SortExec;
26use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
27use datafusion_common::Result as DataFusionResult;
28use datafusion_common::tree_node::{Transformed, TreeNode};
29use datafusion_physical_expr::expressions::{CastExpr, Column as PhysicalColumn};
30use datafusion_physical_expr::{PhysicalExpr, ScalarFunctionExpr};
31use store_api::region_engine::PartitionRange;
32use table::table::scan::RegionScanExec;
33
34use crate::part_sort::PartSortExec;
35use crate::window_sort::WindowedSortExec;
36
/// Optimize rule for windowed sort.
///
/// Rewrites a qualifying `SortExec` on the time index column into a
/// [`PartSortExec`] / [`WindowedSortExec`] pipeline that exploits the
/// per-partition time ranges reported by the region scan.
///
/// This is expected to run after [`ScanHint`] and [`ParallelizeScan`].
/// It would change the original sort to a custom plan. To make sure
/// other rules are applied correctly, this rule can be run as late as
/// possible.
///
/// [`ScanHint`]: crate::optimizer::scan_hint::ScanHintRule
/// [`ParallelizeScan`]: crate::optimizer::parallelize_scan::ParallelizeScan
#[derive(Debug)]
pub struct WindowedSortPhysicalRule;
48
49impl PhysicalOptimizerRule for WindowedSortPhysicalRule {
50    fn optimize(
51        &self,
52        plan: Arc<dyn ExecutionPlan>,
53        config: &datafusion::config::ConfigOptions,
54    ) -> DataFusionResult<Arc<dyn ExecutionPlan>> {
55        Self::do_optimize(plan, config)
56    }
57
58    fn name(&self) -> &str {
59        "WindowedSortRule"
60    }
61
62    fn schema_check(&self) -> bool {
63        false
64    }
65}
66
impl WindowedSortPhysicalRule {
    /// Walks the plan top-down and rewrites the first qualifying `SortExec`
    /// into a windowed-sort pipeline, stopping traversal once a rewrite
    /// happens.
    ///
    /// A sort qualifies when all of the following hold:
    /// - it has exactly one sort expression;
    /// - that expression is a column of a timestamp type which resolves to
    ///   the region's time index column;
    /// - it has no fetch/limit.
    fn do_optimize(
        plan: Arc<dyn ExecutionPlan>,
        _config: &datafusion::config::ConfigOptions,
    ) -> DataFusionResult<Arc<dyn ExecutionPlan>> {
        let result = plan
            .transform_down(|plan| {
                if let Some(sort_exec) = plan.as_any().downcast_ref::<SortExec>() {
                    // TODO: support multiple expr in windowed sort
                    if sort_exec.expr().len() != 1 {
                        return Ok(Transformed::no(plan));
                    }

                    let preserve_partitioning = sort_exec.preserve_partitioning();

                    // Drop a `RepartitionExec` sandwiched between a filter and
                    // the region scan so the scan's partitioning is visible.
                    let sort_input = remove_repartition(sort_exec.input().clone())?.data;

                    // Gets scanner info from the input without repartition before filter.
                    let Some(scanner_info) = fetch_partition_range(sort_input.clone())? else {
                        return Ok(Transformed::no(plan));
                    };
                    let input_schema = sort_input.schema();

                    let first_sort_expr = sort_exec.expr().first();
                    // Empty body on purpose: the `if` only expresses the
                    // qualification checks; failing any of them bails out.
                    if let Some(column_expr) = first_sort_expr
                        .expr
                        .as_any()
                        .downcast_ref::<PhysicalColumn>()
                        && matches!(
                            input_schema.field(column_expr.index()).data_type(),
                            DataType::Timestamp(_, _)
                        )
                        && is_time_index_expr(&sort_input, &first_sort_expr.expr)?
                        && sort_exec.fetch().is_none()
                    // skip if there is a limit, as dyn filter along is good enough in this case
                    {
                    } else {
                        return Ok(Transformed::no(plan));
                    }

                    // PartSortExec is unnecessary if:
                    // - there is no tag column, and
                    // - the sort is ascending on the time index column
                    let new_input = if scanner_info.tag_columns.is_empty()
                        && !first_sort_expr.options.descending
                    {
                        sort_input
                    } else {
                        Arc::new(PartSortExec::try_new(
                            first_sort_expr.clone(),
                            sort_exec.fetch(),
                            scanner_info.partition_ranges.clone(),
                            sort_input,
                        )?)
                    };

                    let windowed_sort_exec = WindowedSortExec::try_new(
                        first_sort_expr.clone(),
                        sort_exec.fetch(),
                        scanner_info.partition_ranges,
                        new_input,
                    )?;

                    if !preserve_partitioning {
                        // Original sort produced a single sorted partition:
                        // merge the windowed sort's partitions to match.
                        let order_preserving_merge = SortPreservingMergeExec::new(
                            sort_exec.expr().clone(),
                            Arc::new(windowed_sort_exec),
                        );
                        return Ok(Transformed {
                            data: Arc::new(order_preserving_merge),
                            transformed: true,
                            tnr: datafusion_common::tree_node::TreeNodeRecursion::Stop,
                        });
                    } else {
                        return Ok(Transformed {
                            data: Arc::new(windowed_sort_exec),
                            transformed: true,
                            tnr: datafusion_common::tree_node::TreeNodeRecursion::Stop,
                        });
                    }
                }

                Ok(Transformed::no(plan))
            })?
            .data;

        Ok(result)
    }
}
156
/// Information gathered from the [`RegionScanExec`] found below the sort.
#[derive(Debug)]
struct ScannerInfo {
    /// Uncollapsed partition ranges of the scan, one `Vec` per output partition.
    partition_ranges: Vec<Vec<PartitionRange>>,
    /// Tag column names reported by the scan; non-empty tags force a `PartSortExec`.
    tag_columns: Vec<String>,
}
162
/// Walks `input` bottom-up looking for a [`RegionScanExec`] and collects its
/// partition ranges and tag columns.
///
/// Returns `Ok(None)` when no scan is found or when an intervening plan node
/// (repartition, coalesce, another sort, a windowed sort, or anything other
/// than projection/filter) invalidates the windowed-sort assumptions.
fn fetch_partition_range(input: Arc<dyn ExecutionPlan>) -> DataFusionResult<Option<ScannerInfo>> {
    let mut partition_ranges = None;
    let mut tag_columns = None;

    // Bottom-up walk: the scan (a leaf) is visited first, then each ancestor
    // may reset the collected state if it breaks the required plan shape.
    input.transform_up(|plan| {
        // `CooperativeExec` is a transparent wrapper; keep the state as-is.
        if plan.as_any().is::<CooperativeExec>() {
            return Ok(Transformed::no(plan));
        }

        // Unappliable case, reset the state.
        if plan.as_any().is::<RepartitionExec>()
            || plan.as_any().is::<CoalescePartitionsExec>()
            || plan.as_any().is::<SortExec>()
            || plan.as_any().is::<WindowedSortExec>()
        {
            partition_ranges = None;
        }

        // only a very limited set of plans can exist between region scan and sort exec
        // other plans might make this optimize wrong, so be safe here by limiting it
        // (the scan itself also fails this test, but its own branch below
        // re-populates the state right after)
        if !(plan.as_any().is::<ProjectionExec>() || plan.as_any().is::<FilterExec>()) {
            partition_ranges = None;
        }

        if let Some(region_scan_exec) = plan.as_any().downcast_ref::<RegionScanExec>() {
            // `PerSeries` distribution is not supported in windowed sort.
            if region_scan_exec.distribution()
                == Some(store_api::storage::TimeSeriesDistribution::PerSeries)
            {
                partition_ranges = None;
                return Ok(Transformed::no(plan));
            }

            partition_ranges = Some(region_scan_exec.get_uncollapsed_partition_ranges());
            tag_columns = Some(region_scan_exec.tag_columns());

            // Ask the scan to emit partition-range boundaries downstream.
            region_scan_exec.with_distinguish_partition_range(true);
        }

        Ok(Transformed::no(plan))
    })?;

    // Nightly `try_blocks`: yields `None` if either piece of state is missing.
    let result = try {
        ScannerInfo {
            partition_ranges: partition_ranges?,
            tag_columns: tag_columns?,
        }
    };

    Ok(result)
}
214
215fn is_time_index_expr(
216    plan: &Arc<dyn ExecutionPlan>,
217    expr: &Arc<dyn PhysicalExpr>,
218) -> DataFusionResult<bool> {
219    if let Some(column_expr) = expr.as_any().downcast_ref::<PhysicalColumn>() {
220        return is_time_index_column(plan, column_expr);
221    }
222
223    if let Some(cast_expr) = expr.as_any().downcast_ref::<CastExpr>() {
224        return if matches!(cast_expr.cast_type(), DataType::Timestamp(_, _)) {
225            is_time_index_expr(plan, cast_expr.expr())
226        } else {
227            Ok(false)
228        };
229    }
230
231    if let Some(scalar_function_expr) = expr.as_any().downcast_ref::<ScalarFunctionExpr>() {
232        return if is_supported_time_index_wrapper(scalar_function_expr)
233            && scalar_function_expr.args().len() == 1
234        {
235            is_time_index_expr(plan, &scalar_function_expr.args()[0])
236        } else {
237            Ok(false)
238        };
239    }
240
241    Ok(false)
242}
243
/// Returns whether `column_expr`, resolved against `plan`'s output schema,
/// refers to the region's time index column.
///
/// Recurses through projections (following aliases), filters (remapping the
/// column index through the filter's optional embedded projection), and
/// schema-preserving wrappers, until it reaches a [`RegionScanExec`].
fn is_time_index_column(
    plan: &Arc<dyn ExecutionPlan>,
    column_expr: &PhysicalColumn,
) -> DataFusionResult<bool> {
    if let Some(projection) = plan.as_any().downcast_ref::<ProjectionExec>() {
        // Follow the alias: the output column's index selects the projected
        // expression, which is then resolved against the projection's input.
        let Some(projection_expr) = projection.expr().get(column_expr.index()) else {
            return Ok(false);
        };
        return is_time_index_expr(projection.input(), &projection_expr.expr);
    }

    if let Some(filter) = plan.as_any().downcast_ref::<FilterExec>() {
        // A filter may carry its own projection; if so, map the output index
        // back to the input index (and input field name). Otherwise the
        // column passes through unchanged.
        let child_column_expr = filter
            .projection()
            .as_ref()
            .and_then(|projection| projection.get(column_expr.index()).copied())
            .map(|input_index| {
                PhysicalColumn::new(
                    filter.input().schema().field(input_index).name(),
                    input_index,
                )
            })
            .unwrap_or_else(|| column_expr.clone());
        let child_expr = Arc::new(child_column_expr) as Arc<dyn PhysicalExpr>;
        return is_time_index_expr(filter.input(), &child_expr);
    }

    if let Some(region_scan_exec) = plan.as_any().downcast_ref::<RegionScanExec>() {
        // Base case: the column must be timestamp-typed and named exactly
        // like the region's time index.
        let schema = plan.schema();
        let column_field = schema.field(column_expr.index());
        return Ok(
            matches!(column_field.data_type(), DataType::Timestamp(_, _))
                && *column_field.name() == region_scan_exec.time_index(),
        );
    }

    // Look through wrappers that keep the schema (coalesce/repartition/coop).
    let Some(child) = passthrough_child(plan.as_ref()) else {
        return Ok(false);
    };
    let child_expr = Arc::new(column_expr.clone()) as Arc<dyn PhysicalExpr>;
    is_time_index_expr(&child, &child_expr)
}
286
287fn passthrough_child(plan: &dyn ExecutionPlan) -> Option<Arc<dyn ExecutionPlan>> {
288    if plan.as_any().is::<CoalescePartitionsExec>()
289        || plan.as_any().is::<RepartitionExec>()
290        || plan.as_any().is::<CooperativeExec>()
291    {
292        return schema_preserving_child(plan);
293    }
294
295    None
296}
297
298fn schema_preserving_child(plan: &dyn ExecutionPlan) -> Option<Arc<dyn ExecutionPlan>> {
299    let child = plan.children().first().cloned().cloned()?;
300    (plan.schema().as_ref() == child.schema().as_ref()).then_some(child)
301}
302
303fn is_supported_time_index_wrapper(expr: &ScalarFunctionExpr) -> bool {
304    (expr.name().eq_ignore_ascii_case("to_timestamp")
305        || expr.name().eq_ignore_ascii_case("to_timestamp_seconds")
306        || expr.name().eq_ignore_ascii_case("to_timestamp_millis")
307        || expr.name().eq_ignore_ascii_case("to_timestamp_micros")
308        || expr.name().eq_ignore_ascii_case("to_timestamp_nanos"))
309        && matches!(expr.return_type(), DataType::Timestamp(_, _))
310}
311
312/// Removes the repartition plan between the filter and region scan.
313fn remove_repartition(
314    plan: Arc<dyn ExecutionPlan>,
315) -> DataFusionResult<Transformed<Arc<dyn ExecutionPlan>>> {
316    plan.transform_down(|plan| {
317        if plan.as_any().is::<FilterExec>() {
318            // Checks child.
319            let maybe_repartition = plan.children()[0];
320            if maybe_repartition.as_any().is::<RepartitionExec>() {
321                let maybe_scan = maybe_repartition.children()[0];
322                if maybe_scan.as_any().is::<RegionScanExec>() {
323                    let new_filter = plan.clone().with_new_children(vec![maybe_scan.clone()])?;
324                    return Ok(Transformed::yes(new_filter));
325                }
326            }
327        }
328
329        Ok(Transformed::no(plan))
330    })
331}
332
#[cfg(test)]
mod test {
    use std::sync::Arc;

    use api::v1::SemanticType;
    use arrow_schema::{Field, TimeUnit};
    use common_recordbatch::RecordBatches;
    use datafusion::config::ConfigOptions;
    use datafusion::physical_plan::filter::FilterExecBuilder;
    use datafusion_common::ScalarValue;
    use datafusion_functions::datetime::to_timestamp_millis;
    use datafusion_physical_expr::expressions::{CastExpr, Literal};
    use datatypes::data_type::ConcreteDataType;
    use datatypes::schema::{ColumnSchema, Schema};
    use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder};
    use store_api::region_engine::SinglePartitionScanner;
    use store_api::storage::{RegionId, ScanRequest};

    use super::*;

    // A single projection alias over the time index should still be detected.
    #[test]
    fn test_is_time_index_expr_tracks_aliases_through_projection() {
        let scan = new_region_scan();
        let projection = Arc::new(
            ProjectionExec::try_new(
                vec![(
                    Arc::new(PhysicalColumn::new("ts", 1)) as Arc<dyn PhysicalExpr>,
                    "alias_ts".to_string(),
                )],
                scan,
            )
            .unwrap(),
        ) as Arc<dyn ExecutionPlan>;
        let expr = Arc::new(PhysicalColumn::new("alias_ts", 0)) as Arc<dyn PhysicalExpr>;

        assert!(is_time_index_expr(&projection, &expr).unwrap());
    }

    // Aliases should be followed through multiple stacked projections.
    #[test]
    fn test_is_time_index_expr_tracks_multi_level_aliases() {
        let scan = new_region_scan();
        let first_projection = Arc::new(
            ProjectionExec::try_new(
                vec![(
                    Arc::new(PhysicalColumn::new("ts", 1)) as Arc<dyn PhysicalExpr>,
                    "alias_1".to_string(),
                )],
                scan,
            )
            .unwrap(),
        ) as Arc<dyn ExecutionPlan>;
        let second_projection = Arc::new(
            ProjectionExec::try_new(
                vec![(
                    Arc::new(PhysicalColumn::new("alias_1", 0)) as Arc<dyn PhysicalExpr>,
                    "alias_2".to_string(),
                )],
                first_projection,
            )
            .unwrap(),
        ) as Arc<dyn ExecutionPlan>;
        let expr = Arc::new(PhysicalColumn::new("alias_2", 0)) as Arc<dyn PhysicalExpr>;

        assert!(is_time_index_expr(&second_projection, &expr).unwrap());
    }

    // A supported `to_timestamp_millis(ts)` wrapper is transparent for
    // time-index detection.
    #[test]
    fn test_is_time_index_expr_tracks_wrapped_aliases_through_projection() {
        let scan = new_region_scan();
        let config = Arc::new(ConfigOptions::default());
        let return_field = Arc::new(Field::new(
            "ts",
            DataType::Timestamp(TimeUnit::Millisecond, None),
            false,
        ));
        let projection = Arc::new(
            ProjectionExec::try_new(
                vec![(
                    Arc::new(ScalarFunctionExpr::new(
                        "to_timestamp_millis",
                        to_timestamp_millis(config.as_ref()),
                        vec![Arc::new(PhysicalColumn::new("ts", 1))],
                        return_field,
                        config,
                    )) as Arc<dyn PhysicalExpr>,
                    "ts".to_string(),
                )],
                scan,
            )
            .unwrap(),
        ) as Arc<dyn ExecutionPlan>;
        let expr = Arc::new(PhysicalColumn::new("ts", 0)) as Arc<dyn PhysicalExpr>;

        assert!(is_time_index_expr(&projection, &expr).unwrap());
    }

    // A cast that keeps the timestamp type is transparent as well.
    #[test]
    fn test_is_time_index_expr_tracks_cast_aliases_through_projection() {
        let scan = new_region_scan();
        let projection = Arc::new(
            ProjectionExec::try_new(
                vec![(
                    Arc::new(CastExpr::new(
                        Arc::new(PhysicalColumn::new("ts", 1)),
                        DataType::Timestamp(TimeUnit::Millisecond, None),
                        None,
                    )) as Arc<dyn PhysicalExpr>,
                    "ts_ms".to_string(),
                )],
                scan,
            )
            .unwrap(),
        ) as Arc<dyn ExecutionPlan>;
        let expr = Arc::new(PhysicalColumn::new("ts_ms", 0)) as Arc<dyn PhysicalExpr>;

        assert!(is_time_index_expr(&projection, &expr).unwrap());
    }

    // The wrapper check is name-based: a function named `date_trunc` is
    // rejected even though it returns a timestamp here.
    #[test]
    fn test_is_time_index_expr_rejects_unsupported_wrappers() {
        let scan = new_region_scan();
        let config = Arc::new(ConfigOptions::default());
        let return_field = Arc::new(Field::new(
            "ts",
            DataType::Timestamp(TimeUnit::Millisecond, None),
            false,
        ));
        let projection = Arc::new(
            ProjectionExec::try_new(
                vec![(
                    Arc::new(ScalarFunctionExpr::new(
                        "date_trunc",
                        to_timestamp_millis(config.as_ref()),
                        vec![Arc::new(PhysicalColumn::new("ts", 1))],
                        return_field,
                        config,
                    )) as Arc<dyn PhysicalExpr>,
                    "ts".to_string(),
                )],
                scan,
            )
            .unwrap(),
        ) as Arc<dyn ExecutionPlan>;
        let expr = Arc::new(PhysicalColumn::new("ts", 0)) as Arc<dyn PhysicalExpr>;

        assert!(!is_time_index_expr(&projection, &expr).unwrap());
    }

    // Wrapper matching ignores the ASCII case of the function name.
    #[test]
    fn test_is_supported_time_index_wrapper_ignores_function_name_case() {
        let config = Arc::new(ConfigOptions::default());
        let return_field = Arc::new(Field::new(
            "ts",
            DataType::Timestamp(TimeUnit::Millisecond, None),
            false,
        ));
        let expr = ScalarFunctionExpr::new(
            "To_Timestamp_Millis",
            to_timestamp_millis(config.as_ref()),
            vec![Arc::new(PhysicalColumn::new("ts", 1))],
            return_field,
            config,
        );

        assert!(is_supported_time_index_wrapper(&expr));
    }

    // A cast to a non-timestamp type breaks the time-index chain.
    #[test]
    fn test_is_time_index_expr_rejects_non_timestamp_casts() {
        let scan = new_region_scan();
        let cast_expr = Arc::new(CastExpr::new(
            Arc::new(PhysicalColumn::new("ts", 1)),
            DataType::Timestamp(TimeUnit::Millisecond, None),
            None,
        )) as Arc<dyn PhysicalExpr>;
        assert!(is_time_index_expr(&scan, &cast_expr).unwrap());

        let non_timestamp_cast = Arc::new(CastExpr::new(
            Arc::new(PhysicalColumn::new("ts", 1)),
            DataType::Int64,
            None,
        )) as Arc<dyn PhysicalExpr>;
        assert!(!is_time_index_expr(&scan, &non_timestamp_cast).unwrap());
    }

    // A filter without an embedded projection passes the column through.
    #[test]
    fn test_is_time_index_expr_tracks_time_index_through_filter() {
        let scan = new_region_scan();
        let filter = Arc::new(
            FilterExec::try_new(
                Arc::new(Literal::new(ScalarValue::Boolean(Some(true)))),
                scan,
            )
            .unwrap(),
        ) as Arc<dyn ExecutionPlan>;
        let expr = Arc::new(PhysicalColumn::new("ts", 1)) as Arc<dyn PhysicalExpr>;

        assert!(is_time_index_expr(&filter, &expr).unwrap());
    }

    // A filter with an embedded projection remaps output index 0 back to the
    // input's `ts` column (index 1), even under a CooperativeExec wrapper.
    #[test]
    fn test_is_time_index_expr_tracks_time_index_through_passthrough_wrapper_and_filter_projection()
    {
        let scan = new_region_scan();
        let projected_filter = Arc::new(
            FilterExecBuilder::new(
                Arc::new(Literal::new(ScalarValue::Boolean(Some(true)))),
                scan,
            )
            .apply_projection(Some(vec![1]))
            .unwrap()
            .build()
            .unwrap(),
        ) as Arc<dyn ExecutionPlan>;
        let cooperative =
            Arc::new(CooperativeExec::new(projected_filter)) as Arc<dyn ExecutionPlan>;
        let expr = Arc::new(PhysicalColumn::new("ts", 0)) as Arc<dyn PhysicalExpr>;

        assert!(is_time_index_expr(&cooperative, &expr).unwrap());
    }

    // A projection changes the schema, so it is not a schema-preserving child.
    #[test]
    fn test_schema_preserving_child_rejects_schema_changing_projection() {
        let scan = new_region_scan();
        let projection = ProjectionExec::try_new(
            vec![(
                Arc::new(PhysicalColumn::new("ts", 1)) as Arc<dyn PhysicalExpr>,
                "ts".to_string(),
            )],
            scan,
        )
        .unwrap();

        assert!(schema_preserving_child(&projection).is_none());
    }

    #[test]
    fn test_cooperative_exec_satisfies_passthrough_schema_contract() {
        let child = new_region_scan();
        let plan = Arc::new(CooperativeExec::new(child.clone())) as Arc<dyn ExecutionPlan>;

        assert_passthrough_schema_contract(plan, child);
    }

    #[test]
    fn test_repartition_exec_satisfies_passthrough_schema_contract() {
        let child = new_region_scan();
        let plan = Arc::new(
            RepartitionExec::try_new(
                child.clone(),
                datafusion_physical_expr::Partitioning::RoundRobinBatch(2),
            )
            .unwrap(),
        ) as Arc<dyn ExecutionPlan>;

        assert_passthrough_schema_contract(plan, child);
    }

    #[test]
    fn test_coalesce_partitions_exec_satisfies_passthrough_schema_contract() {
        let child = new_region_scan();
        let plan = Arc::new(CoalescePartitionsExec::new(child.clone())) as Arc<dyn ExecutionPlan>;

        assert_passthrough_schema_contract(plan, child);
    }

    // Asserts that `plan` keeps `child`'s schema and that `passthrough_child`
    // looks through it.
    fn assert_passthrough_schema_contract(
        plan: Arc<dyn ExecutionPlan>,
        child: Arc<dyn ExecutionPlan>,
    ) {
        assert_eq!(plan.schema().as_ref(), child.schema().as_ref());

        let passthrough = passthrough_child(plan.as_ref()).expect("wrapper should preserve schema");
        assert_eq!(passthrough.schema().as_ref(), child.schema().as_ref());
    }

    // Builds an empty single-partition `RegionScanExec` whose schema is
    // (value: Int32, ts: TimestampNanosecond) with `ts` as the time index.
    fn new_region_scan() -> Arc<dyn ExecutionPlan> {
        let schema = Arc::new(Schema::new(vec![
            ColumnSchema::new("value", ConcreteDataType::int32_datatype(), false),
            ColumnSchema::new(
                "ts",
                ConcreteDataType::timestamp_nanosecond_datatype(),
                false,
            ),
        ]));
        let recordbatches = RecordBatches::try_new(schema.clone(), vec![]).unwrap();
        let stream = recordbatches.as_stream();

        let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
        builder
            .push_column_metadata(ColumnMetadata {
                column_schema: ColumnSchema::new(
                    "value",
                    ConcreteDataType::int32_datatype(),
                    false,
                ),
                semantic_type: SemanticType::Field,
                column_id: 1,
            })
            .push_column_metadata(ColumnMetadata {
                column_schema: ColumnSchema::new(
                    "ts",
                    ConcreteDataType::timestamp_nanosecond_datatype(),
                    false,
                ),
                semantic_type: SemanticType::Timestamp,
                column_id: 2,
            });

        let scanner = Box::new(SinglePartitionScanner::new(
            stream,
            false,
            Arc::new(builder.build().unwrap()),
            None,
        ));
        Arc::new(RegionScanExec::new(scanner, ScanRequest::default(), None).unwrap())
    }
}