feat: optimizer rule for windowed sort (#4874)

* basic impl

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* implement physical rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* feat: install windowed sort physical rule and optimize partition ranges

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add logs and sqlness test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* feat: introduce PartSortExec for partitioned sorting

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* tune exec nodes' properties and metrics

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* clean up

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix typo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* debug: add more info on very wrong results

* debug: also print overlapping ranges

* feat: add check when emit PartSort Stream

* dbg: info on overlapping working range

* feat: check batch range is inside part range

* set distinguish partition range param

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* chore: more logs

* update sqlness

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* tune optimizer

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* clean up

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix lints

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix windowed sort rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix: early terminate sort stream

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* chore: remove min/max check

* chore: remove unused windowed_sort module, uuid feature and refactor region_scanner to synchronous

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* chore: print more fuzz log

* chore: more log

* fix: part sort should skip empty part

* chore: remove insert logs

* tests: empty PartitionRange

* refactor: testcase

* docs: update comments & tests for the all-empty case

* ci: enlarge etcd cpu limit

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: discord9 <discord9@163.com>
Co-authored-by: evenyag <realevenyag@gmail.com>
Author: Ruihang Xia
Date: 2024-10-29 15:46:05 +08:00 (committed by GitHub)
Parent: 0ee455a980
Commit: 03f2fa219d
21 changed files with 930 additions and 230 deletions


@@ -18,7 +18,7 @@ runs:
--set replicaCount=${{ inputs.etcd-replicas }} \
--set resources.requests.cpu=50m \
--set resources.requests.memory=128Mi \
--set resources.limits.cpu=1000m \
--set resources.limits.cpu=1500m \
--set resources.limits.memory=2Gi \
--set auth.rbac.create=false \
--set auth.rbac.token.enabled=false \

Cargo.lock (generated)

@@ -9030,6 +9030,7 @@ dependencies = [
"table",
"tokio",
"tokio-stream",
"uuid",
]
[[package]]


@@ -163,13 +163,13 @@ impl MitoEngine {
}
/// Returns a region scanner to scan the region for `request`.
async fn region_scanner(
fn region_scanner(
&self,
region_id: RegionId,
request: ScanRequest,
) -> Result<RegionScannerRef> {
let scanner = self.scanner(region_id, request)?;
scanner.region_scanner().await
scanner.region_scanner()
}
/// Scans a region.
@@ -527,7 +527,6 @@ impl RegionEngine for MitoEngine {
request: ScanRequest,
) -> Result<RegionScannerRef, BoxedError> {
self.region_scanner(region_id, request)
.await
.map_err(BoxedError::new)
}


@@ -532,10 +532,24 @@ impl Batch {
#[derive(Default)]
pub(crate) struct BatchChecker {
last_batch: Option<Batch>,
start: Option<Timestamp>,
end: Option<Timestamp>,
}
#[cfg(debug_assertions)]
impl BatchChecker {
/// Attaches the given start timestamp to the checker.
pub(crate) fn with_start(mut self, start: Option<Timestamp>) -> Self {
self.start = start;
self
}
/// Attaches the given end timestamp to the checker.
pub(crate) fn with_end(mut self, end: Option<Timestamp>) -> Self {
self.end = end;
self
}
/// Returns true if the given batch is monotonic and behind
/// the last batch.
pub(crate) fn check_monotonic(&mut self, batch: &Batch) -> bool {
@@ -543,6 +557,17 @@ impl BatchChecker {
return false;
}
if let (Some(start), Some(first)) = (self.start, batch.first_timestamp()) {
if start > first {
return false;
}
}
if let (Some(end), Some(last)) = (self.end, batch.last_timestamp()) {
if end <= last {
return false;
}
}
// Checks the batch is behind the last batch.
// Then updates the last batch.
let is_behind = self

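The new `start`/`end` fields reduce the debug check to a half-open interval test. A minimal standalone sketch with plain i64 timestamps (`RangeChecker` is a hypothetical stand-in, not the crate's type):

struct RangeChecker {
    start: Option<i64>, // inclusive lower bound
    end: Option<i64>,   // exclusive upper bound
}

impl RangeChecker {
    /// Returns true if [first, last] lies inside [start, end).
    fn check(&self, first: i64, last: i64) -> bool {
        if let Some(start) = self.start {
            if first < start {
                return false;
            }
        }
        if let Some(end) = self.end {
            if last >= end {
                return false;
            }
        }
        true
    }
}

fn main() {
    let checker = RangeChecker { start: Some(0), end: Some(10) };
    assert!(checker.check(0, 9)); // inside [0, 10)
    assert!(!checker.check(0, 10)); // the end bound is exclusive
}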

@@ -74,7 +74,7 @@ impl Scanner {
/// Returns a [RegionScanner] to scan the region.
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all)]
pub(crate) async fn region_scanner(self) -> Result<RegionScannerRef> {
pub(crate) fn region_scanner(self) -> Result<RegionScannerRef> {
match self {
Scanner::Seq(seq_scan) => Ok(Box::new(seq_scan)),
Scanner::Unordered(unordered_scan) => Ok(Box::new(unordered_scan)),


@@ -232,7 +232,10 @@ impl SeqScan {
let mut metrics = ScannerMetrics::default();
let mut fetch_start = Instant::now();
#[cfg(debug_assertions)]
let mut checker = crate::read::BatchChecker::default();
let mut checker = crate::read::BatchChecker::default()
.with_start(Some(part_range.start))
.with_end(Some(part_range.end));
while let Some(batch) = reader
.next_batch()
.await
@@ -304,8 +307,13 @@ impl RegionScanner for SeqScan {
self.scan_partition_impl(partition)
}
fn prepare(&mut self, ranges: Vec<Vec<PartitionRange>>) -> Result<(), BoxedError> {
fn prepare(
&mut self,
ranges: Vec<Vec<PartitionRange>>,
distinguish_partition_range: bool,
) -> Result<(), BoxedError> {
self.properties.partitions = ranges;
self.properties.distinguish_partition_range = distinguish_partition_range;
Ok(())
}


@@ -140,7 +140,9 @@ impl UnorderedScan {
let mut metrics = ScannerMetrics::default();
let mut fetch_start = Instant::now();
#[cfg(debug_assertions)]
let mut checker = crate::read::BatchChecker::default();
let mut checker = crate::read::BatchChecker::default()
.with_start(Some(part_range.start))
.with_end(Some(part_range.end));
let stream = Self::scan_partition_range(
stream_ctx.clone(),
@@ -209,8 +211,13 @@ impl RegionScanner for UnorderedScan {
self.stream_ctx.input.mapper.output_schema()
}
fn prepare(&mut self, ranges: Vec<Vec<PartitionRange>>) -> Result<(), BoxedError> {
fn prepare(
&mut self,
ranges: Vec<Vec<PartitionRange>>,
distinguish_partition_range: bool,
) -> Result<(), BoxedError> {
self.properties.partitions = ranges;
self.properties.distinguish_partition_range = distinguish_partition_range;
Ok(())
}


@@ -64,6 +64,7 @@ store-api.workspace = true
substrait.workspace = true
table.workspace = true
tokio.workspace = true
uuid.workspace = true
[dev-dependencies]
approx_eq = "0.1"


@@ -15,6 +15,7 @@
#![feature(let_chains)]
#![feature(int_roundings)]
#![feature(trait_upcasting)]
#![feature(try_blocks)]
mod analyze;
pub mod dataframe;


@@ -20,6 +20,7 @@ pub mod string_normalization;
#[cfg(test)]
pub(crate) mod test_util;
pub mod type_conversion;
pub mod windowed_sort;
use datafusion_common::config::ConfigOptions;
use datafusion_common::Result;


@@ -51,16 +51,27 @@ impl ParallelizeScan {
let result = plan
.transform_down(|plan| {
if let Some(region_scan_exec) = plan.as_any().downcast_ref::<RegionScanExec>() {
if region_scan_exec.is_partition_set() {
return Ok(Transformed::no(plan));
}
let ranges = region_scan_exec.get_partition_ranges();
let total_range_num = ranges.len();
let expected_partition_num = config.execution.target_partitions;
// assign ranges to each partition
let partition_ranges =
let mut partition_ranges =
Self::assign_partition_range(ranges, expected_partition_num);
debug!(
"Assign {total_range_num} ranges to {expected_partition_num} partitions"
);
// sort the ranges in each partition
// TODO(ruihang): smart sort!
for ranges in partition_ranges.iter_mut() {
ranges.sort_by(|a, b| a.start.cmp(&b.start));
}
// update the partition ranges
let new_exec = region_scan_exec
.with_new_partitions(partition_ranges)

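The assign-then-sort step above, restated as a runnable sketch. `Range` stands in for `PartitionRange`, and the round-robin assignment is an illustrative assumption; only the per-partition sort key matches the rule:

#[derive(Debug, Clone, Copy)]
struct Range {
    start: i64,
    end: i64,
}

fn assign_and_sort(ranges: Vec<Range>, partitions: usize) -> Vec<Vec<Range>> {
    let n = partitions.max(1);
    let mut out = vec![Vec::new(); n];
    // assign ranges to partitions (round-robin here, for illustration)
    for (i, r) in ranges.into_iter().enumerate() {
        out[i % n].push(r);
    }
    // sort the ranges in each partition by their start key, as the rule does
    for p in out.iter_mut() {
        p.sort_by(|a, b| a.start.cmp(&b.start));
    }
    out
}

fn main() {
    let ranges = vec![
        Range { start: 10, end: 20 },
        Range { start: 0, end: 10 },
        Range { start: 20, end: 30 },
    ];
    for (i, p) in assign_and_sort(ranges, 2).iter().enumerate() {
        println!("partition {i}: {p:?}");
    }
}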

@@ -0,0 +1,158 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use datafusion::physical_optimizer::PhysicalOptimizerRule;
use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec;
use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;
use datafusion::physical_plan::repartition::RepartitionExec;
use datafusion::physical_plan::sorts::sort::SortExec;
use datafusion::physical_plan::ExecutionPlan;
use datafusion_common::tree_node::{Transformed, TreeNode};
use datafusion_common::Result as DataFusionResult;
use datafusion_physical_expr::expressions::Column as PhysicalColumn;
use store_api::region_engine::PartitionRange;
use table::table::scan::RegionScanExec;
use crate::part_sort::PartSortExec;
use crate::window_sort::WindowedSortExec;
/// Optimize rule for windowed sort.
///
/// This is expected to run after [`ScanHint`] and [`ParallelizeScan`].
/// It replaces the original sort with a custom plan. To make sure
/// other rules are applied correctly, this rule should run as late as
/// possible.
///
/// [`ScanHint`]: crate::optimizer::scan_hint::ScanHintRule
/// [`ParallelizeScan`]: crate::optimizer::parallelize_scan::ParallelizeScan
pub struct WindowedSortPhysicalRule;
impl PhysicalOptimizerRule for WindowedSortPhysicalRule {
fn optimize(
&self,
plan: Arc<dyn ExecutionPlan>,
config: &datafusion::config::ConfigOptions,
) -> DataFusionResult<Arc<dyn ExecutionPlan>> {
Self::do_optimize(plan, config)
}
fn name(&self) -> &str {
"WindowedSortRule"
}
fn schema_check(&self) -> bool {
false
}
}
impl WindowedSortPhysicalRule {
fn do_optimize(
plan: Arc<dyn ExecutionPlan>,
_config: &datafusion::config::ConfigOptions,
) -> DataFusionResult<Arc<dyn ExecutionPlan>> {
let result = plan
.transform_down(|plan| {
if let Some(sort_exec) = plan.as_any().downcast_ref::<SortExec>() {
// TODO: support multiple expr in windowed sort
if !sort_exec.preserve_partitioning() || sort_exec.expr().len() != 1 {
return Ok(Transformed::no(plan));
}
let Some(scanner_info) = fetch_partition_range(sort_exec.input().clone())?
else {
return Ok(Transformed::no(plan));
};
if let Some(first_sort_expr) = sort_exec.expr().first()
&& !first_sort_expr.options.descending
&& let Some(column_expr) = first_sort_expr
.expr
.as_any()
.downcast_ref::<PhysicalColumn>()
&& column_expr.name() == scanner_info.time_index
{
} else {
return Ok(Transformed::no(plan));
}
let first_sort_expr = sort_exec.expr().first().unwrap().clone();
let part_sort_exec = Arc::new(PartSortExec::new(
first_sort_expr.clone(),
scanner_info.partition_ranges.clone(),
sort_exec.input().clone(),
));
let windowed_sort_exec = WindowedSortExec::try_new(
first_sort_expr,
sort_exec.fetch(),
scanner_info.partition_ranges,
part_sort_exec,
)?;
return Ok(Transformed {
data: Arc::new(windowed_sort_exec),
transformed: true,
tnr: datafusion_common::tree_node::TreeNodeRecursion::Stop,
});
}
Ok(Transformed::no(plan))
})?
.data;
Ok(result)
}
}
struct ScannerInfo {
partition_ranges: Vec<Vec<PartitionRange>>,
time_index: String,
}
fn fetch_partition_range(input: Arc<dyn ExecutionPlan>) -> DataFusionResult<Option<ScannerInfo>> {
let mut partition_ranges = None;
let mut time_index = None;
input.transform_up(|plan| {
// Inapplicable case, reset the state.
if plan.as_any().is::<RepartitionExec>()
|| plan.as_any().is::<CoalesceBatchesExec>()
|| plan.as_any().is::<CoalescePartitionsExec>()
|| plan.as_any().is::<SortExec>()
|| plan.as_any().is::<WindowedSortExec>()
{
partition_ranges = None;
}
if let Some(region_scan_exec) = plan.as_any().downcast_ref::<RegionScanExec>() {
partition_ranges = Some(region_scan_exec.get_uncollapsed_partition_ranges());
time_index = region_scan_exec.time_index();
// set distinguish_partition_range to true; this is a workaround and not strictly correct
region_scan_exec.with_distinguish_partition_range(true);
}
Ok(Transformed::no(plan))
})?;
let result = try {
ScannerInfo {
partition_ranges: partition_ranges?,
time_index: time_index?,
}
};
Ok(result)
}
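The `try` block above uses the nightly `try_blocks` feature enabled earlier in this diff. A stable-Rust sketch of the equivalent Option-combining logic (field types simplified to stand-ins):

struct ScannerInfo {
    partition_ranges: Vec<Vec<(i64, i64)>>, // stand-in for Vec<Vec<PartitionRange>>
    time_index: String,
}

// Both pieces of state must be present for the rule to apply.
fn combine(
    partition_ranges: Option<Vec<Vec<(i64, i64)>>>,
    time_index: Option<String>,
) -> Option<ScannerInfo> {
    Some(ScannerInfo {
        partition_ranges: partition_ranges?,
        time_index: time_index?,
    })
}

fn main() {
    assert!(combine(Some(vec![]), None).is_none()); // no time index -> no rewrite
    assert!(combine(Some(vec![]), Some("ts".to_string())).is_some());
}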

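In plan terms, the rule replaces a qualifying ascending `SortExec` on the time index with the two new operators. An illustrative before/after shape, consistent with the sqlness `EXPLAIN ANALYZE` output near the end of this diff (surrounding nodes elided):

before:
    SortExec: expr=[t ASC], preserve_partitioning=true
      SeqScan: ...

after:
    WindowedSortExec
      PartSortExec: t ASC
        SeqScan: ... (with distinguish_partition_range enabled)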

@@ -17,6 +17,7 @@ use std::pin::Pin;
use std::sync::Arc;
use std::task::{Context, Poll};
use arrow::array::ArrayRef;
use arrow::compute::{concat, take_record_batch};
use arrow_schema::SchemaRef;
use common_recordbatch::{DfRecordBatch, DfSendableRecordBatchStream};
@@ -24,7 +25,7 @@ use datafusion::common::arrow::compute::sort_to_indices;
use datafusion::execution::memory_pool::{MemoryConsumer, MemoryReservation};
use datafusion::execution::{RecordBatchStream, TaskContext};
use datafusion::physical_plan::coalesce_batches::concat_batches;
use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet};
use datafusion::physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
use datafusion::physical_plan::{
DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties,
};
@@ -33,8 +34,9 @@ use datafusion_physical_expr::PhysicalSortExpr;
use futures::Stream;
use itertools::Itertools;
use snafu::location;
use store_api::region_engine::PartitionRange;
use crate::error::Result;
use crate::downcast_ts_array;
/// Sort input within given PartitionRange
///
@@ -48,11 +50,16 @@ pub struct PartSortExec {
input: Arc<dyn ExecutionPlan>,
/// Execution metrics
metrics: ExecutionPlanMetricsSet,
partition_ranges: Vec<Vec<PartitionRange>>,
properties: PlanProperties,
}
impl PartSortExec {
pub fn try_new(expression: PhysicalSortExpr, input: Arc<dyn ExecutionPlan>) -> Result<Self> {
pub fn new(
expression: PhysicalSortExpr,
partition_ranges: Vec<Vec<PartitionRange>>,
input: Arc<dyn ExecutionPlan>,
) -> Self {
let metrics = ExecutionPlanMetricsSet::new();
let properties = PlanProperties::new(
input.equivalence_properties().clone(),
@@ -60,12 +67,13 @@ impl PartSortExec {
input.execution_mode(),
);
Ok(Self {
Self {
expression,
input,
metrics,
partition_ranges,
properties,
})
}
}
pub fn to_stream(
@@ -76,7 +84,21 @@ impl PartSortExec {
let input_stream: DfSendableRecordBatchStream =
self.input.execute(partition, context.clone())?;
let df_stream = Box::pin(PartSortStream::new(context, self, input_stream)) as _;
if partition >= self.partition_ranges.len() {
internal_err!(
"Partition index out of range: {} >= {}",
partition,
self.partition_ranges.len()
)?;
}
let df_stream = Box::pin(PartSortStream::new(
context,
self,
input_stream,
self.partition_ranges[partition].clone(),
partition,
)) as _;
Ok(df_stream)
}
@@ -114,10 +136,11 @@ impl ExecutionPlan for PartSortExec {
} else {
internal_err!("No children found")?
};
Ok(Arc::new(Self::try_new(
Ok(Arc::new(Self::new(
self.expression.clone(),
self.partition_ranges.clone(),
new_input.clone(),
)?))
)))
}
fn execute(
@@ -131,6 +154,15 @@ impl ExecutionPlan for PartSortExec {
fn metrics(&self) -> Option<MetricsSet> {
Some(self.metrics.clone_inner())
}
/// # Explain
///
/// This plan needs to be executed on each partition independently,
/// and is expected to run directly on the storage engine's output
/// distribution / partition.
fn benefits_from_input_partitioning(&self) -> Vec<bool> {
vec![false]
}
}
struct PartSortStream {
@@ -142,6 +174,10 @@ struct PartSortStream {
input: DfSendableRecordBatchStream,
input_complete: bool,
schema: SchemaRef,
partition_ranges: Vec<PartitionRange>,
partition: usize,
cur_part_idx: usize,
metrics: BaselineMetrics,
}
impl PartSortStream {
@@ -149,6 +185,8 @@ impl PartSortStream {
context: Arc<TaskContext>,
sort: &PartSortExec,
input: DfSendableRecordBatchStream,
partition_ranges: Vec<PartitionRange>,
partition: usize,
) -> Self {
Self {
reservation: MemoryConsumer::new("PartSortStream".to_string())
@@ -159,17 +197,85 @@ impl PartSortStream {
input,
input_complete: false,
schema: sort.input.schema(),
partition_ranges,
partition,
cur_part_idx: 0,
metrics: BaselineMetrics::new(&sort.metrics, partition),
}
}
}
macro_rules! array_check_helper {
($t:ty, $unit:expr, $arr:expr, $cur_range:expr, $min_max_idx:expr) => {{
if $cur_range.start.unit().as_arrow_time_unit() != $unit
|| $cur_range.end.unit().as_arrow_time_unit() != $unit
{
internal_err!(
"PartitionRange unit mismatch, expect {:?}, found {:?}",
$cur_range.start.unit(),
$unit
)?;
}
let arr = $arr
.as_any()
.downcast_ref::<arrow::array::PrimitiveArray<$t>>()
.unwrap();
let min = arr.value($min_max_idx.0);
let max = arr.value($min_max_idx.1);
let (min, max) = if min < max {
(min, max)
} else {
(max, min)
};
let cur_min = $cur_range.start.value();
let cur_max = $cur_range.end.value();
// note that PartitionRange is left inclusive and right exclusive
if !(min >= cur_min && max < cur_max) {
internal_err!(
"Sort column min/max value out of partition range: sort_column.min_max=[{:?}, {:?}] not in PartitionRange=[{:?}, {:?}]",
min,
max,
cur_min,
cur_max
)?;
}
}};
}
impl PartSortStream {
/// Checks whether the sort column's min/max values are within the partition range
fn check_in_range(
&self,
sort_column: &ArrayRef,
min_max_idx: (usize, usize),
) -> datafusion_common::Result<()> {
if self.cur_part_idx >= self.partition_ranges.len() {
internal_err!(
"Partition index out of range: {} >= {}",
self.cur_part_idx,
self.partition_ranges.len()
)?;
}
let cur_range = self.partition_ranges[self.cur_part_idx];
downcast_ts_array!(
sort_column.data_type() => (array_check_helper, sort_column, cur_range, min_max_idx),
_ => internal_err!(
"Unsupported data type for sort column: {:?}",
sort_column.data_type()
)?,
);
Ok(())
}
/// Sort and clear the buffer and return the sorted record batch
///
/// this function should return None if RecordBatch is empty
fn sort_buffer(&mut self) -> datafusion_common::Result<Option<DfRecordBatch>> {
if self.buffer.iter().map(|r| r.num_rows()).sum::<usize>() == 0 {
return Ok(None);
/// this function should return an empty record batch if the buffer is empty
fn sort_buffer(&mut self) -> datafusion_common::Result<DfRecordBatch> {
if self.buffer.is_empty() {
return Ok(DfRecordBatch::new_empty(self.schema.clone()));
}
let mut sort_columns = Vec::with_capacity(self.buffer.len());
let mut opt = None;
@@ -194,6 +300,24 @@ impl PartSortStream {
)
})?;
self.check_in_range(
&sort_column,
(
indices.value(0) as usize,
indices.value(indices.len() - 1) as usize,
),
)
.inspect_err(|_e| {
#[cfg(debug_assertions)]
common_telemetry::error!(
"Fail to check sort column in range at {}, current_idx: {}, num_rows: {}, err: {}",
self.partition,
self.cur_part_idx,
sort_column.len(),
_e
);
})?;
// reserve memory for the concat input and sorted output
let total_mem: usize = self.buffer.iter().map(|r| r.get_array_memory_size()).sum();
self.reservation.try_grow(total_mem * 2)?;
@@ -229,7 +353,7 @@ impl PartSortStream {
drop(full_input);
// here remove both buffer and full_input memory
self.reservation.shrink(2 * total_mem);
Ok(Some(sorted))
Ok(sorted)
}
pub fn poll_next_inner(
@@ -241,7 +365,7 @@ impl PartSortStream {
if self.buffer.is_empty() {
return Poll::Ready(None);
} else {
return Poll::Ready(self.sort_buffer().transpose());
return Poll::Ready(Some(self.sort_buffer()));
}
}
let res = self.input.as_mut().poll_next(cx);
@@ -249,7 +373,13 @@ impl PartSortStream {
Poll::Ready(Some(Ok(batch))) => {
if batch.num_rows() == 0 {
// mark end of current PartitionRange
return Poll::Ready(self.sort_buffer().transpose());
let sorted_batch = self.sort_buffer()?;
self.cur_part_idx += 1;
if sorted_batch.num_rows() == 0 {
// Current part is empty, continue polling next part.
continue;
}
return Poll::Ready(Some(Ok(sorted_batch)));
}
self.buffer.push(batch);
// keep polling until a boundary (an empty RecordBatch) is reached
@@ -271,10 +401,11 @@ impl Stream for PartSortStream {
type Item = datafusion_common::Result<DfRecordBatch>;
fn poll_next(
self: Pin<&mut Self>,
mut self: Pin<&mut Self>,
cx: &mut Context<'_>,
) -> Poll<Option<datafusion_common::Result<DfRecordBatch>>> {
self.poll_next_inner(cx)
let result = self.as_mut().poll_next_inner(cx);
self.metrics.record_poll(result)
}
}
@@ -290,7 +421,6 @@ mod test {
use arrow::json::ArrayWriter;
use arrow_schema::{DataType, Field, Schema, SortOptions, TimeUnit};
use common_telemetry::error;
use common_time::Timestamp;
use datafusion_physical_expr::expressions::Column;
use futures::StreamExt;
@@ -311,6 +441,8 @@ mod test {
let mut rng = fastrand::Rng::new();
rng.seed(1337);
let mut test_cases = Vec::new();
for case_id in 0..test_cnt {
let mut bound_val: Option<i64> = None;
let descending = rng.bool();
@@ -359,21 +491,23 @@ mod test {
};
assert!(start < end);
let mut sort_data = vec![];
let mut per_part_sort_data = vec![];
let mut batches = vec![];
for _batch_idx in 0..rng.usize(1..batch_cnt_bound) {
let cnt = rng.usize(0..batch_size_bound) + 2;
let iter = 0..rng.usize(1..cnt);
let cnt = rng.usize(0..batch_size_bound) + 1;
let iter = 0..rng.usize(0..cnt);
let data_gen = iter
.map(|_| rng.i64(start.value()..end.value()))
.collect_vec();
sort_data.extend(data_gen.clone());
if data_gen.is_empty() {
// current batch is empty, skip
continue;
}
per_part_sort_data.extend(data_gen.clone());
let arr = new_ts_array(unit.clone(), data_gen.clone());
let batch = DfRecordBatch::try_new(schema.clone(), vec![arr]).unwrap();
batches.push(batch);
}
assert!(batches.iter().all(|i| i.num_rows() >= 1));
let range = PartitionRange {
start,
@@ -384,12 +518,14 @@ mod test {
input_ranged_data.push((range, batches));
if descending {
sort_data.sort_by(|a, b| b.cmp(a));
per_part_sort_data.sort_by(|a, b| b.cmp(a));
} else {
sort_data.sort();
per_part_sort_data.sort();
}
output_data.push(sort_data);
if per_part_sort_data.is_empty() {
continue;
}
output_data.push(per_part_sort_data);
}
let expected_output = output_data
@@ -399,8 +535,17 @@ mod test {
.unwrap()
})
.collect_vec();
test_cases.push((
case_id,
unit,
input_ranged_data,
schema,
opt,
expected_output,
));
}
assert!(!expected_output.is_empty());
for (case_id, _unit, input_ranged_data, schema, opt, expected_output) in test_cases {
run_test(case_id, input_ranged_data, schema, opt, expected_output).await;
}
}
@@ -412,25 +557,50 @@ mod test {
TimeUnit::Millisecond,
vec![
((0, 10), vec![vec![1, 2, 3], vec![4, 5, 6], vec![7, 8, 9]]),
((5, 10), vec![vec![1, 2, 3], vec![4, 5, 6], vec![7, 8]]),
((5, 10), vec![vec![5, 6], vec![7, 8]]),
],
false,
vec![
vec![1, 2, 3, 4, 5, 6, 7, 8, 9],
vec![1, 2, 3, 4, 5, 6, 7, 8],
],
vec![vec![1, 2, 3, 4, 5, 6, 7, 8, 9], vec![5, 6, 7, 8]],
),
(
TimeUnit::Millisecond,
vec![
((5, 10), vec![vec![1, 2, 3], vec![4, 5, 6], vec![7, 8, 9]]),
((5, 10), vec![vec![5, 6], vec![7, 8, 9]]),
((0, 10), vec![vec![1, 2, 3], vec![4, 5, 6], vec![7, 8]]),
],
true,
vec![vec![9, 8, 7, 6, 5], vec![8, 7, 6, 5, 4, 3, 2, 1]],
),
(
TimeUnit::Millisecond,
vec![
vec![9, 8, 7, 6, 5, 4, 3, 2, 1],
vec![8, 7, 6, 5, 4, 3, 2, 1],
((5, 10), vec![]),
((0, 10), vec![vec![1, 2, 3], vec![4, 5, 6], vec![7, 8]]),
],
true,
vec![vec![8, 7, 6, 5, 4, 3, 2, 1]],
),
(
TimeUnit::Millisecond,
vec![
((15, 20), vec![vec![17, 18, 19]]),
((10, 15), vec![]),
((5, 10), vec![]),
((0, 10), vec![vec![1, 2, 3], vec![4, 5, 6], vec![7, 8]]),
],
true,
vec![vec![19, 18, 17], vec![8, 7, 6, 5, 4, 3, 2, 1]],
),
(
TimeUnit::Millisecond,
vec![
((15, 20), vec![]),
((10, 15), vec![]),
((5, 10), vec![]),
((0, 10), vec![]),
],
true,
vec![],
),
];
@@ -487,7 +657,7 @@ mod test {
opt: SortOptions,
expected_output: Vec<DfRecordBatch>,
) {
let (_ranges, batches): (Vec<_>, Vec<_>) = input_ranged_data.clone().into_iter().unzip();
let (ranges, batches): (Vec<_>, Vec<_>) = input_ranged_data.clone().into_iter().unzip();
let batches = batches
.into_iter()
@@ -498,14 +668,14 @@ mod test {
.collect_vec();
let mock_input = MockInputExec::new(batches, schema.clone());
let exec = PartSortExec::try_new(
let exec = PartSortExec::new(
PhysicalSortExpr {
expr: Arc::new(Column::new("ts", 0)),
options: opt,
},
vec![ranges],
Arc::new(mock_input),
)
.unwrap();
);
let exec_stream = exec.execute(0, Arc::new(TaskContext::default())).unwrap();
@@ -513,6 +683,7 @@ mod test {
// a makeshift solution for comparing large data
if real_output != expected_output {
let mut full_msg = String::new();
{
let mut buf = Vec::with_capacity(10 * real_output.len());
for batch in &real_output {
@@ -523,8 +694,9 @@ mod test {
buf.append(&mut rb_json);
buf.push(b',');
}
let buf = String::from_utf8_lossy(&buf);
error!("case_id:{case_id}, real_output: [{buf}]");
// TODO(discord9): better ways to print buf
let _buf = String::from_utf8_lossy(&buf);
full_msg += &format!("case_id:{case_id}, real_output");
}
{
let mut buf = Vec::with_capacity(10 * real_output.len());
@@ -536,10 +708,13 @@ mod test {
buf.append(&mut rb_json);
buf.push(b',');
}
let buf = String::from_utf8_lossy(&buf);
error!("case_id:{case_id}, expected_output: [{buf}]");
let _buf = String::from_utf8_lossy(&buf);
full_msg += &format!("case_id:{case_id}, expected_output");
}
panic!("case_{} failed, opt: {:?}", case_id, opt);
panic!(
"case_{} failed, opt: {:?}, full msg: {}",
case_id, opt, full_msg
);
}
}
}

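The `poll_next_inner` change above relies on an input protocol: batches arrive in `PartitionRange` order and an empty RecordBatch marks the end of the current range; with this fix, empty parts are skipped instead of emitted. A toy model of that protocol, with plain `Vec<i64>` batches standing in for RecordBatches:

fn part_sort(input: Vec<Vec<i64>>) -> Vec<Vec<i64>> {
    let mut out = Vec::new();
    let mut buffer: Vec<i64> = Vec::new();
    for batch in input {
        if batch.is_empty() {
            // boundary: sort and emit the buffered part, unless it is empty
            if !buffer.is_empty() {
                buffer.sort();
                out.push(std::mem::take(&mut buffer));
            }
            continue; // empty part: keep consuming the next part
        }
        buffer.extend(batch);
    }
    if !buffer.is_empty() {
        buffer.sort();
        out.push(buffer);
    }
    out
}

fn main() {
    // two ranges; the second one is empty and produces no output
    let input = vec![vec![3, 1], vec![2], vec![], vec![]];
    assert_eq!(part_sort(input), vec![vec![1, 2, 3]]);
}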

@@ -49,6 +49,7 @@ use crate::optimizer::remove_duplicate::RemoveDuplicate;
use crate::optimizer::scan_hint::ScanHintRule;
use crate::optimizer::string_normalization::StringNormalizationRule;
use crate::optimizer::type_conversion::TypeConversionRule;
use crate::optimizer::windowed_sort::WindowedSortPhysicalRule;
use crate::optimizer::ExtensionAnalyzerRule;
use crate::query_engine::options::QueryOptions;
use crate::query_engine::DefaultSerializer;
@@ -120,6 +121,10 @@ impl QueryEngineState {
physical_optimizer
.rules
.insert(0, Arc::new(ParallelizeScan));
// Add rule for windowed sort
physical_optimizer
.rules
.push(Arc::new(WindowedSortPhysicalRule));
// Add rule to remove duplicate nodes generated by other rules. Run this last.
physical_optimizer.rules.push(Arc::new(RemoveDuplicate));


@@ -21,10 +21,6 @@ use std::pin::Pin;
use std::sync::Arc;
use std::task::{Context, Poll};
use arrow::array::types::{
TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
TimestampSecondType,
};
use arrow::array::{Array, ArrayRef, PrimitiveArray};
use arrow::compute::SortColumn;
use arrow_schema::{DataType, SchemaRef, SortOptions};
@@ -58,30 +54,57 @@ use crate::error::{QueryExecutionSnafu, Result};
///
/// Internally, it calls [`streaming_merge`] multiple times to merge multiple sorted "working ranges"
///
/// the input stream must be concated in the order of `PartitionRange` in `ranges`(and each `PartitionRange` is sorted within itself), or else the result will be incorrect
/// # Invariants Promised on the Input Stream
/// 1. The input stream must be sorted by timestamp and
/// 2. in the order of `PartitionRange` in `ranges`
/// 3. Each `PartitionRange` is sorted within itself (ascending or descending), but ranges need not be sorted across each other
/// 4. No RecordBatch in the input stream may cross multiple `PartitionRange`s
///
/// TODO(discord9): fix item 4, but since we only use `PartSort` as input, this might not be a problem
#[derive(Debug, Clone)]
pub struct WindowedSortExec {
/// Physical sort expressions(that is, sort by timestamp)
expression: PhysicalSortExpr,
/// Optional number of rows to fetch. Stops producing rows after this fetch
fetch: Option<usize>,
/// The input ranges indicate input stream will be composed of those ranges in given order
ranges: Vec<PartitionRange>,
/// The input ranges indicate input stream will be composed of those ranges in given order.
///
/// Each partition has one vector of `PartitionRange`.
ranges: Vec<Vec<PartitionRange>>,
/// All available working ranges and their corresponding working set
///
/// working ranges promise once input stream get a value out of current range, future values will never be in this range
all_avail_working_range: Vec<(TimeRange, BTreeSet<usize>)>,
/// working ranges promise that once the input stream gets a value out of the current range, future values
/// will never be in this range. Each partition has one vector of ranges.
all_avail_working_range: Vec<Vec<(TimeRange, BTreeSet<usize>)>>,
input: Arc<dyn ExecutionPlan>,
/// Execution metrics
metrics: ExecutionPlanMetricsSet,
properties: PlanProperties,
}
fn check_partition_range_monotonicity(ranges: &[PartitionRange], descending: bool) -> bool {
fn check_partition_range_monotonicity(
ranges: &[Vec<PartitionRange>],
descending: bool,
) -> Result<()> {
let is_valid = ranges.iter().all(|r| {
if descending {
ranges.windows(2).all(|w| w[0].end >= w[1].end)
r.windows(2).all(|w| w[0].end >= w[1].end)
} else {
ranges.windows(2).all(|w| w[0].start <= w[1].start)
r.windows(2).all(|w| w[0].start <= w[1].start)
}
});
if !is_valid {
let msg = if descending {
"Input `PartitionRange`s's upper bound is not monotonic non-increase"
} else {
"Input `PartitionRange`s's lower bound is not monotonic non-decrease"
};
let plain_error = PlainError::new(msg.to_string(), StatusCode::Unexpected);
Err(BoxedError::new(plain_error)).context(QueryExecutionSnafu {})
} else {
Ok(())
}
}
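A standalone restatement of the invariant `check_partition_range_monotonicity` enforces, with `(start, end)` tuples standing in for `PartitionRange`:

// For ascending sorts, lower bounds must be non-decreasing across ranges;
// for descending sorts, upper bounds must be non-increasing.
fn is_monotonic(ranges: &[(i64, i64)], descending: bool) -> bool {
    if descending {
        ranges.windows(2).all(|w| w[0].1 >= w[1].1)
    } else {
        ranges.windows(2).all(|w| w[0].0 <= w[1].0)
    }
}

fn main() {
    assert!(is_monotonic(&[(0, 10), (5, 15), (5, 20)], false));
    assert!(!is_monotonic(&[(5, 15), (0, 10)], false)); // lower bounds go backwards
}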
@@ -89,28 +112,28 @@ impl WindowedSortExec {
pub fn try_new(
expression: PhysicalSortExpr,
fetch: Option<usize>,
ranges: Vec<PartitionRange>,
ranges: Vec<Vec<PartitionRange>>,
input: Arc<dyn ExecutionPlan>,
) -> Result<Self> {
if !check_partition_range_monotonicity(&ranges, expression.options.descending) {
let msg = if expression.options.descending {
"Input `PartitionRange`s's upper bound is not monotonic non-increase"
} else {
"Input `PartitionRange`s's lower bound is not monotonic non-decrease"
};
let plain_error = PlainError::new(msg.to_string(), StatusCode::Unexpected);
return Err(BoxedError::new(plain_error)).context(QueryExecutionSnafu {});
}
check_partition_range_monotonicity(&ranges, expression.options.descending)?;
let properties = PlanProperties::new(
input.equivalence_properties().clone(),
input
.equivalence_properties()
.clone()
.with_reorder(vec![expression.clone()]),
input.output_partitioning().clone(),
input.execution_mode(),
);
let overlap_counts = split_overlapping_ranges(&ranges);
let all_avail_working_range =
let mut all_avail_working_range = Vec::with_capacity(ranges.len());
for r in &ranges {
let overlap_counts = split_overlapping_ranges(r);
let working_ranges =
compute_all_working_ranges(&overlap_counts, expression.options.descending);
all_avail_working_range.push(working_ranges);
}
Ok(Self {
expression,
fetch,
@@ -123,7 +146,8 @@ impl WindowedSortExec {
}
/// While receiving partially-sorted RecordBatches, we need to update the working set, which is the
/// `PartitionRange` we think those RecordBatch belongs to. And when we receive something outside of working set, we can merge results before whenever possible.
/// `PartitionRange` we think those RecordBatches belong to. When we receive something outside
/// of the working set, we can merge earlier results whenever possible.
pub fn to_stream(
&self,
context: Arc<TaskContext>,
@@ -132,7 +156,12 @@ impl WindowedSortExec {
let input_stream: DfSendableRecordBatchStream =
self.input.execute(partition, context.clone())?;
let df_stream = Box::pin(WindowedSortStream::new(context, self, input_stream)) as _;
let df_stream = Box::pin(WindowedSortStream::new(
context,
self,
input_stream,
partition,
)) as _;
Ok(df_stream)
}
@@ -190,6 +219,15 @@ impl ExecutionPlan for WindowedSortExec {
fn metrics(&self) -> Option<MetricsSet> {
Some(self.metrics.clone_inner())
}
/// # Explain
///
/// This plan needs to be executed on each partition independently,
/// and is expected to run directly on the storage engine's output
/// distribution / partition.
fn benefits_from_input_partitioning(&self) -> Vec<bool> {
vec![false; self.ranges.len()]
}
}
/// The core logic of merge-sorting multiple sorted ranges
@@ -215,8 +253,8 @@ pub struct WindowedSortStream {
working_idx: usize,
/// input stream, assumed to be read in the order of `PartitionRange`
input: DfSendableRecordBatchStream,
/// Whether the input stream is terminated, if it did, we should poll input again
is_input_terminated: bool,
/// Whether this stream is terminated, e.g. because the limit was reached or the input stream is done.
is_terminated: bool,
/// Output Schema, which is the same as input schema, since this is a sort plan
schema: SchemaRef,
/// Physical sort expressions(that is, sort by timestamp)
@@ -231,8 +269,10 @@ pub struct WindowedSortStream {
///
/// working ranges promise that once the input stream gets a value out of the current range, future values will never be in this range
all_avail_working_range: Vec<(TimeRange, BTreeSet<usize>)>,
/// The input partition ranges
ranges: Vec<PartitionRange>,
/// Execution metrics
metrics: ExecutionPlanMetricsSet,
metrics: BaselineMetrics,
}
impl WindowedSortStream {
@@ -240,6 +280,7 @@ impl WindowedSortStream {
context: Arc<TaskContext>,
exec: &WindowedSortExec,
input: DfSendableRecordBatchStream,
partition: usize,
) -> Self {
Self {
memory_pool: context.runtime_env().memory_pool.clone(),
@@ -251,18 +292,79 @@ impl WindowedSortStream {
working_idx: 0,
schema: input.schema(),
input,
is_input_terminated: false,
is_terminated: false,
expression: exec.expression.clone(),
fetch: exec.fetch,
produced: 0,
batch_size: context.session_config().batch_size(),
all_avail_working_range: exec.all_avail_working_range.clone(),
metrics: exec.metrics.clone(),
all_avail_working_range: exec.all_avail_working_range[partition].clone(),
ranges: exec.ranges[partition].clone(),
metrics: BaselineMetrics::new(&exec.metrics, partition),
}
}
}
impl WindowedSortStream {
#[cfg(debug_assertions)]
fn check_subset_ranges(&self, cur_range: &TimeRange) {
let cur_is_subset_to = self
.ranges
.iter()
.filter(|r| cur_range.is_subset(&TimeRange::from(*r)))
.collect_vec();
if cur_is_subset_to.is_empty() {
error!("Current range is not a subset of any PartitionRange");
// find all ranges that are subsets of the current range
let subset_ranges = self
.ranges
.iter()
.filter(|r| TimeRange::from(*r).is_subset(cur_range))
.collect_vec();
let only_overlap = self
.ranges
.iter()
.filter(|r| {
let r = TimeRange::from(*r);
r.is_overlapping(cur_range) && !r.is_subset(cur_range)
})
.collect_vec();
error!(
"Bad input, found {} ranges that are subset of current range, also found {} ranges that only overlap, subset ranges are: {:?}; overlap ranges are: {:?}",
subset_ranges.len(),
only_overlap.len(),
subset_ranges,
only_overlap
);
} else {
let only_overlap = self
.ranges
.iter()
.filter(|r| {
let r = TimeRange::from(*r);
r.is_overlapping(cur_range) && !cur_range.is_subset(&r)
})
.collect_vec();
error!(
"Found current range to be subset of {} ranges, also found {} ranges that only overlap, of subset ranges are:{:?}; overlap ranges are: {:?}",
cur_is_subset_to.len(),
only_overlap.len(),
cur_is_subset_to,
only_overlap
);
}
let all_overlap_working_range = self
.all_avail_working_range
.iter()
.filter(|(range, _)| range.is_overlapping(cur_range))
.map(|(range, _)| range)
.collect_vec();
error!(
"Found {} working ranges that overlap with current range: {:?}",
all_overlap_working_range.len(),
all_overlap_working_range
);
}
/// Poll the next RecordBatch from the merge-sort's output stream
fn poll_result_stream(
&mut self,
@@ -273,6 +375,7 @@ impl WindowedSortStream {
Poll::Ready(Some(Ok(batch))) => {
let ret = if let Some(remaining) = self.remaining_fetch() {
if remaining == 0 {
self.is_terminated = true;
None
} else if remaining < batch.num_rows() {
self.produced += remaining;
@@ -304,6 +407,7 @@ impl WindowedSortStream {
// if no output stream is available
Poll::Ready(None)
}
/// The core logic of merge-sorting multiple sorted ranges
///
/// We try to maximize the number of sorted runs we can merge in one go, while emitting the result as soon as possible.
@@ -314,7 +418,7 @@ impl WindowedSortStream {
// first check and send out the merge result
match self.poll_result_stream(cx) {
Poll::Ready(None) => {
if self.is_input_terminated {
if self.is_terminated {
return Poll::Ready(None);
}
}
@@ -322,7 +426,7 @@ impl WindowedSortStream {
};
// consume input stream
while !self.is_input_terminated {
while !self.is_terminated {
// then we get a new RecordBatch from input stream
let new_input_rbs = match self.input.as_mut().poll_next(cx) {
Poll::Ready(Some(Ok(batch))) => {
@@ -333,70 +437,66 @@ impl WindowedSortStream {
}
Poll::Ready(None) => {
// input stream is done, we need to merge sort the remaining working set
self.is_input_terminated = true;
self.is_terminated = true;
None
}
Poll::Pending => return Poll::Pending,
};
// The core logic to eargerly merge sort the working set
match new_input_rbs {
Some(SortedRunSet {
let Some(SortedRunSet {
runs_with_batch,
sort_column,
}) => {
}) = new_input_rbs
else {
// input stream is done, we need to merge sort the remaining working set
self.build_sorted_stream()?;
self.start_new_merge_sort()?;
continue;
};
// The core logic to eagerly merge-sort the working set
// compare with last_value to find the boundary, then merge runs if needed
// iterate over runs_with_batch to merge-sort; it might create zero or more streams to put into `sort_partition_rbs`
let mut last_remaining = None;
let mut run_iter = runs_with_batch.into_iter();
loop {
let (sorted_rb, run_info) = if let Some(r) = last_remaining.take() {
r
} else if let Some(r) = run_iter.next() {
r
} else {
let Some((sorted_rb, run_info)) = last_remaining.take().or_else(|| run_iter.next()) else {
break;
};
if sorted_rb.num_rows() == 0 {
continue;
}
// determine if this batch is in current working range
let cur_range = {
match run_info.get_time_range() {
Some(r) => r,
_ => internal_err!("Found NULL in time index column")?,
}
let Some(cur_range) = run_info.get_time_range() else {
internal_err!("Found NULL in time index column")?
};
let working_range = if let Some(r) = self.get_working_range() {
r
} else {
let Some(working_range) = self.get_working_range() else {
internal_err!("No working range found")?
};
// ensure the current batch is in the working range
if sort_column.options.unwrap_or_default().descending {
if cur_range.end > working_range.end {
error!("Invalid range: {:?} > {:?}", cur_range, working_range);
#[cfg(debug_assertions)]
self.check_subset_ranges(&cur_range);
internal_err!("Current batch have data on the right side of working range, something is very wrong")?;
}
} else if cur_range.start < working_range.start {
error!("Invalid range: {:?} < {:?}", cur_range, working_range);
#[cfg(debug_assertions)]
self.check_subset_ranges(&cur_range);
internal_err!("Current batch have data on the left side of working range, something is very wrong")?;
}
if cur_range.is_subset(&working_range) {
// data still in range, can't merge sort yet
// see if can concat entire sorted rb, merge sort need to wait
self.try_concat_batch(
sorted_rb.clone(),
&run_info,
sort_column.options,
)?;
self.try_concat_batch(sorted_rb.clone(), &run_info, sort_column.options)?;
} else if let Some(intersection) = cur_range.intersection(&working_range) {
// slice rb by intersection and concat it then merge sort
let cur_sort_column =
sort_column.values.slice(run_info.offset, run_info.len);
let cur_sort_column = sort_column.values.slice(run_info.offset, run_info.len);
let (offset, len) = find_slice_from_range(
&SortColumn {
values: cur_sort_column.clone(),
@@ -419,13 +519,11 @@ impl WindowedSortStream {
// since we are crossing working range, we need to merge sort the working set
self.start_new_merge_sort()?;
let (r_offset, r_len) =
(offset + len, sorted_rb.num_rows() - offset - len);
let (r_offset, r_len) = (offset + len, sorted_rb.num_rows() - offset - len);
if r_len != 0 {
// we have remaining data in this batch, put it back to input queue
let remaining_rb = sorted_rb.slice(r_offset, r_len);
let new_first_val =
get_timestamp_from_idx(&cur_sort_column, r_offset)?;
let new_first_val = get_timestamp_from_idx(&cur_sort_column, r_offset)?;
let new_run_info = SucRun {
offset: run_info.offset + r_offset,
len: r_len,
@@ -450,14 +548,6 @@ impl WindowedSortStream {
}
}
}
None => {
// input stream is done, we need to merge sort the remaining working set
// and emit the result
self.build_sorted_stream()?;
self.start_new_merge_sort()?;
}
}
}
// emit the merge result
self.poll_result_stream(cx)
}
@@ -540,7 +630,7 @@ impl WindowedSortStream {
streams,
self.schema(),
&[self.expression.clone()],
BaselineMetrics::new(&self.metrics, 0),
self.metrics.clone(),
self.batch_size,
fetch,
reservation,
@@ -562,10 +652,11 @@ impl Stream for WindowedSortStream {
type Item = datafusion_common::Result<DfRecordBatch>;
fn poll_next(
self: Pin<&mut Self>,
mut self: Pin<&mut Self>,
cx: &mut Context<'_>,
) -> Poll<Option<datafusion_common::Result<DfRecordBatch>>> {
self.poll_next_inner(cx)
let result = self.as_mut().poll_next_inner(cx);
self.metrics.record_poll(result)
}
}
@@ -624,21 +715,22 @@ fn split_batch_to_sorted_run(
/// Downcast a temporal array to a specific type
///
/// Usage is similar to `downcast_primitive!` in the `arrow-array` crate
#[macro_export]
macro_rules! downcast_ts_array {
($data_type:expr => ($m:path $(, $args:tt)*), $($p:pat => $fallback:expr $(,)*)*) =>
{
match $data_type {
arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Second, _) => {
$m!(TimestampSecondType $(, $args)*)
$m!(arrow::datatypes::TimestampSecondType, arrow_schema::TimeUnit::Second $(, $args)*)
}
arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _) => {
$m!(TimestampMillisecondType $(, $args)*)
$m!(arrow::datatypes::TimestampMillisecondType, arrow_schema::TimeUnit::Millisecond $(, $args)*)
}
arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, _) => {
$m!(TimestampMicrosecondType $(, $args)*)
$m!(arrow::datatypes::TimestampMicrosecondType, arrow_schema::TimeUnit::Microsecond $(, $args)*)
}
arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _) => {
$m!(TimestampNanosecondType $(, $args)*)
$m!(arrow::datatypes::TimestampNanosecondType, arrow_schema::TimeUnit::Nanosecond $(, $args)*)
}
$($p => $fallback,)*
}
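For context, the dispatch pattern `downcast_ts_array!` implements, reduced to a self-contained toy without the arrow crates; the real macro now forwards the fully qualified Arrow type together with the matching `TimeUnit`:

#[derive(Debug)]
enum TimeUnit {
    Second,
    Millisecond,
    Microsecond,
    Nanosecond,
}

// Dispatch on the unit and hand the helper macro both a concrete value type
// and the unit, mirroring the two-argument shape used above.
macro_rules! dispatch_ts {
    ($unit:expr => ($m:ident)) => {
        match $unit {
            TimeUnit::Second => $m!(i64, TimeUnit::Second),
            TimeUnit::Millisecond => $m!(i64, TimeUnit::Millisecond),
            TimeUnit::Microsecond => $m!(i64, TimeUnit::Microsecond),
            TimeUnit::Nanosecond => $m!(i64, TimeUnit::Nanosecond),
        }
    };
}

macro_rules! show {
    ($t:ty, $unit:expr) => {
        println!("value type {} at {:?}", stringify!($t), $unit)
    };
}

fn main() {
    dispatch_ts!(TimeUnit::Millisecond => (show));
}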
@@ -711,7 +803,7 @@ fn find_slice_from_range(
}
macro_rules! array_iter_helper {
($t:ty, $arr:expr) => {{
($t:ty, $unit:expr, $arr:expr) => {{
let typed = $arr.as_any().downcast_ref::<PrimitiveArray<$t>>().unwrap();
let iter = typed.iter().enumerate();
Box::new(iter) as Box<dyn Iterator<Item = (usize, Option<i64>)>>
@@ -993,7 +1085,8 @@ fn compute_all_working_ranges(
match &mut cur_range_set {
None => cur_range_set = Some((*range, BTreeSet::from_iter(set.iter().cloned()))),
Some((working_range, working_set)) => {
// if next overlap range have Partition that's is not last one in `working_set`(hence need to be read before merge sorting), and `working_set` have >1 count
// if the next overlapping range has a Partition that is not the last one in `working_set` (hence needs
// to be read before merge sorting), and `working_set` has more than one entry,
// we have to expand the current working range to cover it (and add its `set` to `working_set`)
// so that merge sort is possible
let need_expand = {
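To see why a working range may need to expand: with ranges [0, 10) and [5, 15), the segment [5, 10) is fed by both inputs, so it cannot be merge-sorted until both inputs have been read past it. A sketch that computes which inputs feed each boundary-delimited segment (`(start, end)` tuples stand in for `PartitionRange`):

fn overlap_segments(ranges: &[(i64, i64)]) -> Vec<((i64, i64), Vec<usize>)> {
    // collect and dedup all range boundaries
    let mut bounds: Vec<i64> = ranges.iter().flat_map(|&(s, e)| [s, e]).collect();
    bounds.sort();
    bounds.dedup();
    bounds
        .windows(2)
        .map(|w| {
            let seg = (w[0], w[1]);
            // every input whose range overlaps this segment must be read
            // before the segment can be merge-sorted
            let ids = ranges
                .iter()
                .enumerate()
                .filter(|(_, &(s, e))| s < seg.1 && seg.0 < e)
                .map(|(i, _)| i)
                .collect();
            (seg, ids)
        })
        .collect()
}

fn main() {
    for (seg, ids) in overlap_segments(&[(0, 10), (5, 15)]) {
        println!("segment [{}, {}): inputs {:?}", seg.0, seg.1, ids);
    }
    // segment [0, 5): inputs [0]
    // segment [5, 10): inputs [0, 1]
    // segment [10, 15): inputs [1]
}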
@@ -2410,7 +2503,7 @@ mod test {
let exec = WindowedSortExec::try_new(
self.expression.clone(),
self.fetch,
ranges,
vec![ranges],
Arc::new(mock_input),
)
.unwrap();
@@ -2441,6 +2534,78 @@ mod test {
vec![],
vec![],
),
TestStream::new(
Column::new("ts", 0),
SortOptions {
descending: false,
nulls_first: true,
},
None,
vec![Field::new(
"ts",
DataType::Timestamp(TimeUnit::Millisecond, None),
false,
)],
vec![
// test one empty
(
PartitionRange {
start: Timestamp::new_millisecond(1),
end: Timestamp::new_millisecond(2),
num_rows: 1,
identifier: 0,
},
vec![Arc::new(TimestampMillisecondArray::from_iter_values([]))],
),
(
PartitionRange {
start: Timestamp::new_millisecond(1),
end: Timestamp::new_millisecond(3),
num_rows: 1,
identifier: 0,
},
vec![Arc::new(TimestampMillisecondArray::from_iter_values([2]))],
),
],
vec![vec![Arc::new(TimestampMillisecondArray::from_iter_values(
[2],
))]],
),
TestStream::new(
Column::new("ts", 0),
SortOptions {
descending: false,
nulls_first: true,
},
None,
vec![Field::new(
"ts",
DataType::Timestamp(TimeUnit::Millisecond, None),
false,
)],
vec![
// test one empty
(
PartitionRange {
start: Timestamp::new_millisecond(1),
end: Timestamp::new_millisecond(2),
num_rows: 1,
identifier: 0,
},
vec![Arc::new(TimestampMillisecondArray::from_iter_values([]))],
),
(
PartitionRange {
start: Timestamp::new_millisecond(1),
end: Timestamp::new_millisecond(3),
num_rows: 1,
identifier: 0,
},
vec![Arc::new(TimestampMillisecondArray::from_iter_values([]))],
),
],
vec![],
),
TestStream::new(
Column::new("ts", 0),
SortOptions {


@@ -205,7 +205,7 @@ pub struct ScannerProperties {
total_rows: usize,
/// Whether to yield an empty batch to distinguish partition ranges.
distinguish_partition_range: bool,
pub distinguish_partition_range: bool,
}
impl ScannerProperties {
@@ -227,12 +227,6 @@ impl ScannerProperties {
self
}
/// Sets distinguish partition range for scanner.
pub fn with_distinguish_partition_range(mut self, distinguish_partition_range: bool) -> Self {
self.distinguish_partition_range = distinguish_partition_range;
self
}
/// Creates a new [`ScannerProperties`] with the given partitioning.
pub fn new(partitions: Vec<Vec<PartitionRange>>, append_mode: bool, total_rows: usize) -> Self {
Self {
@@ -274,7 +268,11 @@ pub trait RegionScanner: Debug + DisplayAs + Send {
/// Prepares the scanner with the given partition ranges.
///
/// This method is for the planner to adjust the scanner's behavior based on the partition ranges.
fn prepare(&mut self, ranges: Vec<Vec<PartitionRange>>) -> Result<(), BoxedError>;
fn prepare(
&mut self,
ranges: Vec<Vec<PartitionRange>>,
distinguish_partition_range: bool,
) -> Result<(), BoxedError>;
/// Scans the partition and returns a stream of record batches.
///
@@ -441,8 +439,13 @@ impl RegionScanner for SinglePartitionScanner {
self.schema.clone()
}
fn prepare(&mut self, ranges: Vec<Vec<PartitionRange>>) -> Result<(), BoxedError> {
fn prepare(
&mut self,
ranges: Vec<Vec<PartitionRange>>,
distinguish_partition_range: bool,
) -> Result<(), BoxedError> {
self.properties.partitions = ranges;
self.properties.distinguish_partition_range = distinguish_partition_range;
Ok(())
}

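A minimal sketch of what implementors of the changed `prepare` signature now do; `Properties`, `MyScanner`, and the error type are simplified stand-ins, not the crate's actual types:

struct Properties {
    partitions: Vec<Vec<(i64, i64)>>, // stand-in for Vec<Vec<PartitionRange>>
    distinguish_partition_range: bool,
}

struct MyScanner {
    properties: Properties,
}

impl MyScanner {
    // Mirrors the trait change: the planner now passes the flag alongside
    // the ranges, and implementors record both.
    fn prepare(
        &mut self,
        ranges: Vec<Vec<(i64, i64)>>,
        distinguish_partition_range: bool,
    ) -> Result<(), String> {
        self.properties.partitions = ranges;
        self.properties.distinguish_partition_range = distinguish_partition_range;
        Ok(())
    }
}

fn main() {
    let mut scanner = MyScanner {
        properties: Properties {
            partitions: vec![],
            distinguish_partition_range: false,
        },
    };
    scanner.prepare(vec![vec![(0, 10)]], true).unwrap();
    assert!(scanner.properties.distinguish_partition_range);
}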

@@ -22,6 +22,7 @@ use common_error::ext::BoxedError;
use common_recordbatch::{DfRecordBatch, DfSendableRecordBatchStream, SendableRecordBatchStream};
use common_telemetry::tracing::Span;
use common_telemetry::tracing_context::TracingContext;
use common_telemetry::warn;
use datafusion::error::Result as DfResult;
use datafusion::execution::context::TaskContext;
use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet};
@@ -49,6 +50,7 @@ pub struct RegionScanExec {
properties: PlanProperties,
append_mode: bool,
total_rows: usize,
is_partition_set: bool,
}
impl RegionScanExec {
@@ -77,15 +79,10 @@ impl RegionScanExec {
properties,
append_mode,
total_rows,
is_partition_set: false,
}
}
/// Set the expected output ordering for the plan.
pub fn with_output_ordering(mut self, output_ordering: Vec<PhysicalSortExpr>) -> Self {
self.output_ordering = Some(output_ordering);
self
}
/// Get the partition ranges of the scanner. This method will collapse the ranges into
/// a single vector.
pub fn get_partition_ranges(&self) -> Vec<PartitionRange> {
@@ -101,18 +98,33 @@ impl RegionScanExec {
ranges
}
/// Similar to [`Self::get_partition_ranges`] but doesn't collapse the ranges.
pub fn get_uncollapsed_partition_ranges(&self) -> Vec<Vec<PartitionRange>> {
let scanner = self.scanner.lock().unwrap();
scanner.properties().partitions.clone()
}
pub fn is_partition_set(&self) -> bool {
self.is_partition_set
}
/// Update the partition ranges of underlying scanner.
pub fn with_new_partitions(
&self,
partitions: Vec<Vec<PartitionRange>>,
) -> Result<Self, BoxedError> {
if self.is_partition_set {
warn!("Setting partition ranges more than once for RegionScanExec");
}
let num_partitions = partitions.len();
let mut properties = self.properties.clone();
properties.partitioning = Partitioning::UnknownPartitioning(num_partitions);
{
let mut scanner = self.scanner.lock().unwrap();
scanner.prepare(partitions)?;
let distinguish_partition_range = scanner.properties().distinguish_partition_range();
scanner.prepare(partitions, distinguish_partition_range)?;
}
Ok(Self {
@@ -123,8 +135,25 @@ impl RegionScanExec {
properties,
append_mode: self.append_mode,
total_rows: self.total_rows,
is_partition_set: true,
})
}
pub fn with_distinguish_partition_range(&self, distinguish_partition_range: bool) {
let mut scanner = self.scanner.lock().unwrap();
let partition_ranges = scanner.properties().partitions.clone();
// setting distinguish_partition_range won't fail
let _ = scanner.prepare(partition_ranges, distinguish_partition_range);
}
pub fn time_index(&self) -> Option<String> {
self.scanner
.lock()
.unwrap()
.schema()
.timestamp_column()
.map(|x| x.name.clone())
}
}
impl ExecutionPlan for RegionScanExec {


@@ -189,6 +189,7 @@ async fn validate_values(
"SELECT {} FROM {} ORDER BY {}",
column_list, logical_table_ctx.name, primary_keys_column_list
);
info!("Select SQL: {select_sql}");
let fetched_rows = validator::row::fetch_values(&ctx.greptime, select_sql.as_str()).await?;
let mut expected_rows =
replace_default(&insert_expr.values_list, &logical_table_ctx, insert_expr);
@@ -214,6 +215,8 @@ async fn insert_values<R: Rng + 'static>(
.await
.context(error::ExecuteQuerySnafu { sql: &sql })?;
info!("Insert values, result: {result:?}");
ensure!(
result.rows_affected() == rows as u64,
error::AssertSnafu {


@@ -0,0 +1,81 @@
CREATE TABLE test(i INTEGER, t TIMESTAMP TIME INDEX);
Affected Rows: 0
INSERT INTO test VALUES (1, 1), (NULL, 2), (1, 3);
Affected Rows: 3
ADMIN FLUSH_TABLE('test');
+---------------------------+
| ADMIN FLUSH_TABLE('test') |
+---------------------------+
| 0 |
+---------------------------+
INSERT INTO test VALUES (2, 4), (2, 5), (NULL, 6);
Affected Rows: 3
ADMIN FLUSH_TABLE('test');
+---------------------------+
| ADMIN FLUSH_TABLE('test') |
+---------------------------+
| 0 |
+---------------------------+
INSERT INTO test VALUES (3, 7), (3, 8), (3, 9);
Affected Rows: 3
ADMIN FLUSH_TABLE('test');
+---------------------------+
| ADMIN FLUSH_TABLE('test') |
+---------------------------+
| 0 |
+---------------------------+
INSERT INTO test VALUES (4, 10), (4, 11), (4, 12);
Affected Rows: 3
SELECT * FROM test ORDER BY t LIMIT 5;
+---+-------------------------+
| i | t |
+---+-------------------------+
| 1 | 1970-01-01T00:00:00.001 |
| | 1970-01-01T00:00:00.002 |
| 1 | 1970-01-01T00:00:00.003 |
| 2 | 1970-01-01T00:00:00.004 |
| 2 | 1970-01-01T00:00:00.005 |
+---+-------------------------+
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE SELECT * FROM test ORDER BY t LIMIT 5;
+-+-+-+
| stage | node | plan_|
+-+-+-+
| 0_| 0_|_MergeScanExec: REDACTED
|_|_|_|
| 1_| 0_|_GlobalLimitExec: skip=0, fetch=5 REDACTED
|_|_|_SortPreservingMergeExec: [t@1 ASC NULLS LAST] REDACTED
|_|_|_WindowedSortExec REDACTED
|_|_|_PartSortExec t@1 ASC NULLS LAST REDACTED
|_|_|_SeqScan: region=REDACTED, partition_count=2 (1 memtable ranges, 1 file 1 ranges) REDACTED
|_|_|_|
|_|_| Total rows: 5_|
+-+-+-+
DROP TABLE test;
Affected Rows: 0


@@ -0,0 +1,26 @@
CREATE TABLE test(i INTEGER, t TIMESTAMP TIME INDEX);
INSERT INTO test VALUES (1, 1), (NULL, 2), (1, 3);
ADMIN FLUSH_TABLE('test');
INSERT INTO test VALUES (2, 4), (2, 5), (NULL, 6);
ADMIN FLUSH_TABLE('test');
INSERT INTO test VALUES (3, 7), (3, 8), (3, 9);
ADMIN FLUSH_TABLE('test');
INSERT INTO test VALUES (4, 10), (4, 11), (4, 12);
SELECT * FROM test ORDER BY t LIMIT 5;
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE SELECT * FROM test ORDER BY t LIMIT 5;
DROP TABLE test;


@@ -183,6 +183,7 @@ TQL EXPLAIN VERBOSE (0, 10, '5s') test;
| physical_plan after LimitAggregation_| SAME TEXT AS ABOVE_|
| physical_plan after ProjectionPushdown_| SAME TEXT AS ABOVE_|
| physical_plan after PipelineChecker_| SAME TEXT AS ABOVE_|
| physical_plan after WindowedSortRule_| SAME TEXT AS ABOVE_|
| physical_plan after RemoveDuplicateRule_| SAME TEXT AS ABOVE_|
| physical_plan_| PromInstantManipulateExec: range=[0..0], lookback=[300000], interval=[300000], time index=[j]_|
|_|_PromSeriesNormalizeExec: offset=[0], time index=[j], filter NaN: [false]_|