mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-05-23 08:20:36 +00:00
3006 lines
105 KiB
Rust
3006 lines
105 KiB
Rust
// Copyright 2023 Greptime Team
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
//! Module for sorting input data within each [`PartitionRange`].
|
|
//!
|
|
//! This module defines the [`PartSortExec`] execution plan, which sorts each
|
|
//! partition ([`PartitionRange`]) independently based on the provided physical
|
|
//! sort expressions.
|
|
|
|
use std::any::Any;
|
|
use std::pin::Pin;
|
|
use std::sync::Arc;
|
|
use std::task::{Context, Poll};
|
|
|
|
use arrow::array::{
|
|
ArrayRef, AsArray, TimestampMicrosecondArray, TimestampMillisecondArray,
|
|
TimestampNanosecondArray, TimestampSecondArray,
|
|
};
|
|
use arrow::compute::{concat, concat_batches, take_record_batch};
|
|
use arrow_schema::{Schema, SchemaRef};
|
|
use common_recordbatch::{DfRecordBatch, DfSendableRecordBatchStream};
|
|
use common_telemetry::warn;
|
|
use common_time::Timestamp;
|
|
use common_time::timestamp::TimeUnit;
|
|
use datafusion::common::arrow::compute::sort_to_indices;
|
|
use datafusion::execution::memory_pool::{MemoryConsumer, MemoryReservation};
|
|
use datafusion::execution::{RecordBatchStream, TaskContext};
|
|
use datafusion::physical_plan::execution_plan::CardinalityEffect;
|
|
use datafusion::physical_plan::filter_pushdown::{
|
|
ChildFilterDescription, FilterDescription, FilterPushdownPhase,
|
|
};
|
|
use datafusion::physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
|
|
use datafusion::physical_plan::{
|
|
DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, TopK,
|
|
TopKDynamicFilters,
|
|
};
|
|
use datafusion_common::tree_node::{Transformed, TreeNode};
|
|
use datafusion_common::{DataFusionError, internal_err};
|
|
use datafusion_physical_expr::expressions::{Column, DynamicFilterPhysicalExpr, lit};
|
|
use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr};
|
|
use futures::{Stream, StreamExt};
|
|
use itertools::Itertools;
|
|
use parking_lot::RwLock;
|
|
use snafu::location;
|
|
use store_api::region_engine::PartitionRange;
|
|
|
|
use crate::error::Result;
|
|
use crate::window_sort::{check_partition_range_monotonicity, project_partition_range_for_sort};
|
|
use crate::{array_iter_helper, downcast_ts_array};
|
|
|
|
/// Get the primary end of a `PartitionRange` based on sort direction.
|
|
///
|
|
/// - Descending: primary end is `end` (we process highest values first)
|
|
/// - Ascending: primary end is `start` (we process lowest values first)
|
|
fn get_primary_end(range: &PartitionRange, descending: bool) -> Timestamp {
|
|
if descending { range.end } else { range.start }
|
|
}
|
|
|
|
/// Group consecutive ranges by their primary end value.
|
|
///
|
|
/// Returns a vector of (primary_end, start_idx_inclusive, end_idx_exclusive) tuples.
|
|
/// Ranges with the same primary end MUST be processed together because they may
|
|
/// overlap and contain values that belong to the same "top-k" result.
|
|
fn group_ranges_by_primary_end(
|
|
ranges: &[PartitionRange],
|
|
descending: bool,
|
|
) -> Vec<(Timestamp, usize, usize)> {
|
|
if ranges.is_empty() {
|
|
return vec![];
|
|
}
|
|
|
|
let mut groups = Vec::new();
|
|
let mut group_start = 0;
|
|
let mut current_primary_end = get_primary_end(&ranges[0], descending);
|
|
|
|
for (idx, range) in ranges.iter().enumerate().skip(1) {
|
|
let primary_end = get_primary_end(range, descending);
|
|
if primary_end != current_primary_end {
|
|
// End current group
|
|
groups.push((current_primary_end, group_start, idx));
|
|
// Start new group
|
|
group_start = idx;
|
|
current_primary_end = primary_end;
|
|
}
|
|
}
|
|
// Push the last group
|
|
groups.push((current_primary_end, group_start, ranges.len()));
|
|
|
|
groups
|
|
}
|
|
|
|
/// Sort input within given PartitionRange
|
|
///
|
|
/// Input is assumed to be segmented by empty RecordBatch, which indicates a new `PartitionRange` is starting
|
|
///
|
|
/// and this operator will sort each partition independently within the partition.
|
|
#[derive(Debug, Clone)]
|
|
pub struct PartSortExec {
|
|
/// Physical sort expressions(that is, sort by timestamp)
|
|
expression: PhysicalSortExpr,
|
|
limit: Option<usize>,
|
|
input: Arc<dyn ExecutionPlan>,
|
|
/// Execution metrics
|
|
metrics: ExecutionPlanMetricsSet,
|
|
partition_ranges: Vec<Vec<PartitionRange>>,
|
|
properties: Arc<PlanProperties>,
|
|
/// Filter matching the state of the sort for dynamic filter pushdown.
|
|
/// If `limit` is `Some`, this will also be set and a TopK operator may be used.
|
|
/// If `limit` is `None`, this will be `None`.
|
|
filter: Option<Arc<RwLock<TopKDynamicFilters>>>,
|
|
}
|
|
|
|
impl PartSortExec {
|
|
pub fn try_new(
|
|
expression: PhysicalSortExpr,
|
|
limit: Option<usize>,
|
|
partition_ranges: Vec<Vec<PartitionRange>>,
|
|
input: Arc<dyn ExecutionPlan>,
|
|
) -> Result<Self> {
|
|
check_partition_range_monotonicity(&partition_ranges, expression.options.descending)?;
|
|
|
|
let metrics = ExecutionPlanMetricsSet::new();
|
|
let properties = input.properties();
|
|
let properties = Arc::new(PlanProperties::new(
|
|
input.equivalence_properties().clone(),
|
|
input.output_partitioning().clone(),
|
|
properties.emission_type,
|
|
properties.boundedness,
|
|
));
|
|
|
|
let filter = limit
|
|
.is_some()
|
|
.then(|| Self::create_filter(expression.expr.clone()));
|
|
|
|
Ok(Self {
|
|
expression,
|
|
limit,
|
|
input,
|
|
metrics,
|
|
partition_ranges,
|
|
properties,
|
|
filter,
|
|
})
|
|
}
|
|
|
|
/// Add or reset `self.filter` to a new `TopKDynamicFilters`.
|
|
fn create_filter(expr: Arc<dyn PhysicalExpr>) -> Arc<RwLock<TopKDynamicFilters>> {
|
|
Arc::new(RwLock::new(TopKDynamicFilters::new(Arc::new(
|
|
DynamicFilterPhysicalExpr::new(vec![expr], lit(true)),
|
|
))))
|
|
}
|
|
|
|
pub fn to_stream(
|
|
&self,
|
|
context: Arc<TaskContext>,
|
|
partition: usize,
|
|
) -> datafusion_common::Result<DfSendableRecordBatchStream> {
|
|
let input_stream: DfSendableRecordBatchStream =
|
|
self.input.execute(partition, context.clone())?;
|
|
|
|
if partition >= self.partition_ranges.len() {
|
|
internal_err!(
|
|
"Partition index out of range: {} >= {} at {}",
|
|
partition,
|
|
self.partition_ranges.len(),
|
|
snafu::location!()
|
|
)?;
|
|
}
|
|
|
|
let df_stream = Box::pin(PartSortStream::new(
|
|
context,
|
|
self,
|
|
self.limit,
|
|
input_stream,
|
|
self.partition_ranges[partition].clone(),
|
|
partition,
|
|
self.filter.clone(),
|
|
)?) as _;
|
|
|
|
Ok(df_stream)
|
|
}
|
|
}
|
|
|
|
impl DisplayAs for PartSortExec {
|
|
fn fmt_as(&self, _t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
write!(
|
|
f,
|
|
"PartSortExec: expr={} num_ranges={}",
|
|
self.expression,
|
|
self.partition_ranges.len(),
|
|
)?;
|
|
if let Some(limit) = self.limit {
|
|
write!(f, " limit={}", limit)?;
|
|
}
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
impl ExecutionPlan for PartSortExec {
|
|
fn name(&self) -> &str {
|
|
"PartSortExec"
|
|
}
|
|
|
|
fn as_any(&self) -> &dyn Any {
|
|
self
|
|
}
|
|
|
|
fn schema(&self) -> SchemaRef {
|
|
self.input.schema()
|
|
}
|
|
|
|
fn properties(&self) -> &Arc<PlanProperties> {
|
|
&self.properties
|
|
}
|
|
|
|
fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
|
|
vec![&self.input]
|
|
}
|
|
|
|
fn with_new_children(
|
|
self: Arc<Self>,
|
|
children: Vec<Arc<dyn ExecutionPlan>>,
|
|
) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
|
|
let new_input = if let Some(first) = children.first() {
|
|
first
|
|
} else {
|
|
internal_err!("No children found")?
|
|
};
|
|
let mut new_exec = self.as_ref().clone();
|
|
new_exec.input = new_input.clone();
|
|
new_exec.properties = new_input.properties().clone();
|
|
Ok(Arc::new(new_exec))
|
|
}
|
|
|
|
fn execute(
|
|
&self,
|
|
partition: usize,
|
|
context: Arc<TaskContext>,
|
|
) -> datafusion_common::Result<DfSendableRecordBatchStream> {
|
|
self.to_stream(context, partition)
|
|
}
|
|
|
|
fn metrics(&self) -> Option<MetricsSet> {
|
|
Some(self.metrics.clone_inner())
|
|
}
|
|
|
|
/// # Explain
|
|
///
|
|
/// This plan needs to be executed on each partition independently,
|
|
/// and is expected to run directly on storage engine's output
|
|
/// distribution / partition.
|
|
fn benefits_from_input_partitioning(&self) -> Vec<bool> {
|
|
vec![false]
|
|
}
|
|
|
|
fn cardinality_effect(&self) -> CardinalityEffect {
|
|
if self.limit.is_none() {
|
|
CardinalityEffect::Equal
|
|
} else {
|
|
CardinalityEffect::LowerEqual
|
|
}
|
|
}
|
|
|
|
fn gather_filters_for_pushdown(
|
|
&self,
|
|
phase: FilterPushdownPhase,
|
|
parent_filters: Vec<Arc<dyn PhysicalExpr>>,
|
|
_config: &datafusion::config::ConfigOptions,
|
|
) -> datafusion_common::Result<FilterDescription> {
|
|
if !matches!(phase, FilterPushdownPhase::Post) {
|
|
return FilterDescription::from_children(parent_filters, &self.children());
|
|
}
|
|
|
|
let mut child = ChildFilterDescription::from_child(&parent_filters, &self.input)?;
|
|
|
|
if let Some(filter) = &self.filter {
|
|
child = child.with_self_filter(filter.read().expr());
|
|
}
|
|
|
|
Ok(FilterDescription::new().with_child(child))
|
|
}
|
|
|
|
fn reset_state(self: Arc<Self>) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
|
|
// shared dynamic filter needs to be reset
|
|
let new_filter = self
|
|
.limit
|
|
.is_some()
|
|
.then(|| Self::create_filter(self.expression.expr.clone()));
|
|
|
|
Ok(Arc::new(Self {
|
|
expression: self.expression.clone(),
|
|
limit: self.limit,
|
|
input: self.input.clone(),
|
|
metrics: self.metrics.clone(),
|
|
partition_ranges: self.partition_ranges.clone(),
|
|
properties: self.properties.clone(),
|
|
filter: new_filter,
|
|
}))
|
|
}
|
|
}
|
|
|
|
enum PartSortBuffer {
|
|
All(Vec<DfRecordBatch>),
|
|
/// TopK buffer with row count.
|
|
///
|
|
/// Given this heap only keeps k element, the capacity of this buffer
|
|
/// is not accurate, and is only used for empty check.
|
|
Top(TopK, usize),
|
|
}
|
|
|
|
impl PartSortBuffer {
|
|
pub fn is_empty(&self) -> bool {
|
|
match self {
|
|
PartSortBuffer::All(v) => v.is_empty(),
|
|
PartSortBuffer::Top(_, cnt) => *cnt == 0,
|
|
}
|
|
}
|
|
}
|
|
|
|
struct PartSortStream {
|
|
/// Memory pool for this stream
|
|
reservation: MemoryReservation,
|
|
buffer: PartSortBuffer,
|
|
expression: PhysicalSortExpr,
|
|
limit: Option<usize>,
|
|
input: DfSendableRecordBatchStream,
|
|
input_complete: bool,
|
|
schema: SchemaRef,
|
|
partition_ranges: Vec<PartitionRange>,
|
|
#[allow(dead_code)] // this is used under #[debug_assertions]
|
|
partition: usize,
|
|
cur_part_idx: usize,
|
|
evaluating_batch: Option<DfRecordBatch>,
|
|
metrics: BaselineMetrics,
|
|
context: Arc<TaskContext>,
|
|
root_metrics: ExecutionPlanMetricsSet,
|
|
/// Groups of ranges by primary end: (primary_end, start_idx_inclusive, end_idx_exclusive).
|
|
/// Ranges in the same group must be processed together before outputting results.
|
|
range_groups: Vec<(Timestamp, usize, usize)>,
|
|
/// Current group being processed (index into range_groups).
|
|
cur_group_idx: usize,
|
|
/// Dynamic Filter for all TopK instance, notice the `PartSortExec`/`PartSortStream`/`TopK` must share the same filter
|
|
/// so that updates from each `TopK` can be seen by others(and by the table scan operator).
|
|
filter: Option<Arc<RwLock<TopKDynamicFilters>>>,
|
|
}
|
|
|
|
impl PartSortStream {
|
|
fn new(
|
|
context: Arc<TaskContext>,
|
|
sort: &PartSortExec,
|
|
limit: Option<usize>,
|
|
input: DfSendableRecordBatchStream,
|
|
partition_ranges: Vec<PartitionRange>,
|
|
partition: usize,
|
|
filter: Option<Arc<RwLock<TopKDynamicFilters>>>,
|
|
) -> datafusion_common::Result<Self> {
|
|
let buffer = if let Some(limit) = limit {
|
|
let Some(filter) = filter.clone() else {
|
|
return internal_err!(
|
|
"TopKDynamicFilters must be provided when limit is set at {}",
|
|
snafu::location!()
|
|
);
|
|
};
|
|
|
|
PartSortBuffer::Top(
|
|
TopK::try_new(
|
|
partition,
|
|
sort.schema().clone(),
|
|
vec![],
|
|
[sort.expression.clone()].into(),
|
|
limit,
|
|
context.session_config().batch_size(),
|
|
context.runtime_env(),
|
|
&sort.metrics,
|
|
filter.clone(),
|
|
)?,
|
|
0,
|
|
)
|
|
} else {
|
|
PartSortBuffer::All(Vec::new())
|
|
};
|
|
|
|
// Compute range groups by primary end
|
|
let descending = sort.expression.options.descending;
|
|
let range_groups = group_ranges_by_primary_end(&partition_ranges, descending);
|
|
|
|
Ok(Self {
|
|
reservation: MemoryConsumer::new("PartSortStream".to_string())
|
|
.register(&context.runtime_env().memory_pool),
|
|
buffer,
|
|
expression: sort.expression.clone(),
|
|
limit,
|
|
input,
|
|
input_complete: false,
|
|
schema: sort.input.schema(),
|
|
partition_ranges,
|
|
partition,
|
|
cur_part_idx: 0,
|
|
evaluating_batch: None,
|
|
metrics: BaselineMetrics::new(&sort.metrics, partition),
|
|
context,
|
|
root_metrics: sort.metrics.clone(),
|
|
range_groups,
|
|
cur_group_idx: 0,
|
|
filter,
|
|
})
|
|
}
|
|
}
|
|
|
|
/// Checks that two values of a timestamp array lie inside a `PartitionRange`.
///
/// Arguments: the arrow timestamp type `$t`, its arrow time unit `$unit`, the
/// array `$arr`, the range `$cur_range`, and `$min_max_idx`, a pair of indices
/// into the array pointing at the smallest and largest value (in either order).
///
/// Expands to statements that return early (via `?`) with an internal error when
/// the range's units differ from `$unit`, or when the values are not within
/// `[start, end)`.
macro_rules! array_check_helper {
    ($t:ty, $unit:expr, $arr:expr, $cur_range:expr, $min_max_idx:expr) => {{
        let start_unit = $cur_range.start.unit();
        let end_unit = $cur_range.end.unit();
        if start_unit.as_arrow_time_unit() != $unit || end_unit.as_arrow_time_unit() != $unit {
            // The array's unit is what is expected; report both range units so
            // that a mismatch on `end` alone is visible too.
            internal_err!(
                "PartitionRange unit mismatch, expect {:?}, found start={:?}, end={:?}",
                $unit,
                start_unit,
                end_unit
            )?;
        }
        let arr = $arr
            .as_any()
            .downcast_ref::<arrow::array::PrimitiveArray<$t>>()
            .unwrap();

        let min = arr.value($min_max_idx.0);
        let max = arr.value($min_max_idx.1);
        // The indices may come from a descending sort, so normalize the order.
        let (min, max) = if min < max {
            (min, max)
        } else {
            (max, min)
        };
        let cur_min = $cur_range.start.value();
        let cur_max = $cur_range.end.value();
        // note that PartitionRange is left inclusive and right exclusive
        if !(min >= cur_min && max < cur_max) {
            internal_err!(
                "Sort column min/max value out of partition range: sort_column.min_max=[{:?}, {:?}] not in PartitionRange=[{:?}, {:?}]",
                min,
                max,
                cur_min,
                cur_max
            )?;
        }
    }};
}
|
|
|
|
impl PartSortStream {
|
|
/// check whether the sort column's min/max value is within the current group's effective range.
|
|
/// For group-based processing, data from multiple ranges with the same primary end
|
|
/// is accumulated together, so we check against the union of all ranges in the group.
|
|
fn check_in_range(
|
|
&self,
|
|
sort_column: &ArrayRef,
|
|
min_max_idx: (usize, usize),
|
|
) -> datafusion_common::Result<()> {
|
|
// Use the group's effective range instead of the current partition range
|
|
let Some(cur_range) = self.get_current_group_effective_range() else {
|
|
internal_err!(
|
|
"No effective range for current group {} at {}",
|
|
self.cur_group_idx,
|
|
snafu::location!()
|
|
)?
|
|
};
|
|
let cur_range = project_partition_range_for_sort(cur_range, sort_column.data_type())?;
|
|
|
|
downcast_ts_array!(
|
|
sort_column.data_type() => (array_check_helper, sort_column, cur_range, min_max_idx),
|
|
_ => internal_err!(
|
|
"Unsupported data type for sort column: {:?}",
|
|
sort_column.data_type()
|
|
)?,
|
|
);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Try find data whose value exceeds the current partition range.
|
|
///
|
|
/// Returns `None` if no such data is found, and `Some(idx)` where idx points to
|
|
/// the first data that exceeds the current partition range.
|
|
fn try_find_next_range(
|
|
&self,
|
|
sort_column: &ArrayRef,
|
|
) -> datafusion_common::Result<Option<usize>> {
|
|
if sort_column.is_empty() {
|
|
return Ok(None);
|
|
}
|
|
|
|
// check if the current partition index is out of range
|
|
if self.cur_part_idx >= self.partition_ranges.len() {
|
|
internal_err!(
|
|
"Partition index out of range: {} >= {} at {}",
|
|
self.cur_part_idx,
|
|
self.partition_ranges.len(),
|
|
snafu::location!()
|
|
)?;
|
|
}
|
|
let cur_range = project_partition_range_for_sort(
|
|
self.partition_ranges[self.cur_part_idx],
|
|
sort_column.data_type(),
|
|
)?;
|
|
|
|
let sort_column_iter = downcast_ts_array!(
|
|
sort_column.data_type() => (array_iter_helper, sort_column),
|
|
_ => internal_err!(
|
|
"Unsupported data type for sort column: {:?}",
|
|
sort_column.data_type()
|
|
)?,
|
|
);
|
|
|
|
for (idx, val) in sort_column_iter {
|
|
// ignore vacant time index data
|
|
if let Some(val) = val
|
|
&& (val >= cur_range.end.value() || val < cur_range.start.value())
|
|
{
|
|
return Ok(Some(idx));
|
|
}
|
|
}
|
|
|
|
Ok(None)
|
|
}
|
|
|
|
fn push_buffer(&mut self, batch: DfRecordBatch) -> datafusion_common::Result<()> {
|
|
match &mut self.buffer {
|
|
PartSortBuffer::All(v) => v.push(batch),
|
|
PartSortBuffer::Top(top, cnt) => {
|
|
*cnt += batch.num_rows();
|
|
top.insert_batch(batch)?;
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Stop read earlier when current group do not overlap with any of those next group
|
|
/// If not overlap, we can stop read further input as current top k is final
|
|
/// Use dynamic filter to evaluate the next group's primary end
|
|
fn can_stop_early(&mut self, schema: &Arc<Schema>) -> datafusion_common::Result<bool> {
|
|
let topk_cnt = match &self.buffer {
|
|
PartSortBuffer::Top(_, cnt) => *cnt,
|
|
_ => return Ok(false),
|
|
};
|
|
// not fulfill topk yet
|
|
if Some(topk_cnt) < self.limit {
|
|
return Ok(false);
|
|
}
|
|
let next_group_primary_end = if self.cur_group_idx + 1 < self.range_groups.len() {
|
|
self.range_groups[self.cur_group_idx + 1].0
|
|
} else {
|
|
// no next group
|
|
return Ok(false);
|
|
};
|
|
|
|
// dyn filter is updated based on the last value of topk heap("threshold")
|
|
// it's a max-heap for a ASC TopK operator
|
|
// so can use dyn filter to prune data range
|
|
let filter = self
|
|
.filter
|
|
.as_ref()
|
|
.expect("TopKDynamicFilters must be provided when limit is set");
|
|
let filter = filter.read().expr().current()?;
|
|
let mut ts_index = None;
|
|
// invariant: the filter must contain only the same column expr that's time index column
|
|
let filter = filter
|
|
.transform_down(|c| {
|
|
// rewrite all column's index as 0
|
|
if let Some(column) = c.as_any().downcast_ref::<Column>() {
|
|
ts_index = Some(column.index());
|
|
Ok(Transformed::yes(
|
|
Arc::new(Column::new(column.name(), 0)) as Arc<dyn PhysicalExpr>
|
|
))
|
|
} else {
|
|
Ok(Transformed::no(c))
|
|
}
|
|
})?
|
|
.data;
|
|
let Some(ts_index) = ts_index else {
|
|
return Ok(false); // dyn filter is still true, cannot decide, continue read
|
|
};
|
|
let field = if schema.fields().len() <= ts_index {
|
|
warn!(
|
|
"Schema mismatch when evaluating dynamic filter for PartSortExec at {}, schema: {:?}, ts_index: {}",
|
|
self.partition, schema, ts_index
|
|
);
|
|
return Ok(false); // schema mismatch, cannot decide, continue read
|
|
} else {
|
|
schema.field(ts_index)
|
|
};
|
|
let schema = Arc::new(Schema::new(vec![field.clone()]));
|
|
// convert next_group_primary_end to array&filter, if eval to false, means no overlap, can stop early
|
|
let primary_end_array = match next_group_primary_end.unit() {
|
|
TimeUnit::Second => Arc::new(TimestampSecondArray::from(vec![
|
|
next_group_primary_end.value(),
|
|
])) as ArrayRef,
|
|
TimeUnit::Millisecond => Arc::new(TimestampMillisecondArray::from(vec![
|
|
next_group_primary_end.value(),
|
|
])) as ArrayRef,
|
|
TimeUnit::Microsecond => Arc::new(TimestampMicrosecondArray::from(vec![
|
|
next_group_primary_end.value(),
|
|
])) as ArrayRef,
|
|
TimeUnit::Nanosecond => Arc::new(TimestampNanosecondArray::from(vec![
|
|
next_group_primary_end.value(),
|
|
])) as ArrayRef,
|
|
};
|
|
let primary_end_batch = DfRecordBatch::try_new(schema, vec![primary_end_array])?;
|
|
let res = filter.evaluate(&primary_end_batch)?;
|
|
let array = res.into_array(primary_end_batch.num_rows())?;
|
|
let filter = array.as_boolean().clone();
|
|
let overlap = filter.iter().next().flatten();
|
|
if let Some(false) = overlap {
|
|
Ok(true)
|
|
} else {
|
|
Ok(false)
|
|
}
|
|
}
|
|
|
|
/// Check if the given partition index is within the current group.
|
|
fn is_in_current_group(&self, part_idx: usize) -> bool {
|
|
if self.cur_group_idx >= self.range_groups.len() {
|
|
return false;
|
|
}
|
|
let (_, start, end) = self.range_groups[self.cur_group_idx];
|
|
part_idx >= start && part_idx < end
|
|
}
|
|
|
|
/// Advance to the next group. Returns true if there is a next group.
|
|
fn advance_to_next_group(&mut self) -> bool {
|
|
self.cur_group_idx += 1;
|
|
self.cur_group_idx < self.range_groups.len()
|
|
}
|
|
|
|
/// Get the effective range for the current group.
|
|
/// For a group of ranges with the same primary end, the effective range is
|
|
/// the union of all ranges in the group.
|
|
fn get_current_group_effective_range(&self) -> Option<PartitionRange> {
|
|
if self.cur_group_idx >= self.range_groups.len() {
|
|
return None;
|
|
}
|
|
let (_, start_idx, end_idx) = self.range_groups[self.cur_group_idx];
|
|
if start_idx >= end_idx || start_idx >= self.partition_ranges.len() {
|
|
return None;
|
|
}
|
|
|
|
let ranges_in_group =
|
|
&self.partition_ranges[start_idx..end_idx.min(self.partition_ranges.len())];
|
|
if ranges_in_group.is_empty() {
|
|
return None;
|
|
}
|
|
|
|
// Compute union of all ranges in the group
|
|
let mut min_start = ranges_in_group[0].start;
|
|
let mut max_end = ranges_in_group[0].end;
|
|
for range in ranges_in_group.iter().skip(1) {
|
|
if range.start < min_start {
|
|
min_start = range.start;
|
|
}
|
|
if range.end > max_end {
|
|
max_end = range.end;
|
|
}
|
|
}
|
|
|
|
Some(PartitionRange {
|
|
start: min_start,
|
|
end: max_end,
|
|
num_rows: 0, // Not used for validation
|
|
identifier: 0, // Not used for validation
|
|
})
|
|
}
|
|
|
|
/// Sort and clear the buffer and return the sorted record batch
|
|
///
|
|
/// this function will return a empty record batch if the buffer is empty
|
|
fn sort_buffer(&mut self) -> datafusion_common::Result<DfRecordBatch> {
|
|
match &mut self.buffer {
|
|
PartSortBuffer::All(_) => self.sort_all_buffer(),
|
|
PartSortBuffer::Top(_, _) => self.sort_top_buffer(),
|
|
}
|
|
}
|
|
|
|
/// Internal method for sorting `All` buffer (without limit).
|
|
fn sort_all_buffer(&mut self) -> datafusion_common::Result<DfRecordBatch> {
|
|
let PartSortBuffer::All(buffer) =
|
|
std::mem::replace(&mut self.buffer, PartSortBuffer::All(Vec::new()))
|
|
else {
|
|
unreachable!("buffer type is checked before and should be All variant")
|
|
};
|
|
|
|
if buffer.is_empty() {
|
|
return Ok(DfRecordBatch::new_empty(self.schema.clone()));
|
|
}
|
|
let mut sort_columns = Vec::with_capacity(buffer.len());
|
|
let mut opt = None;
|
|
for batch in buffer.iter() {
|
|
let sort_column = self.expression.evaluate_to_sort_column(batch)?;
|
|
opt = opt.or(sort_column.options);
|
|
sort_columns.push(sort_column.values);
|
|
}
|
|
|
|
let sort_column =
|
|
concat(&sort_columns.iter().map(|a| a.as_ref()).collect_vec()).map_err(|e| {
|
|
DataFusionError::ArrowError(
|
|
Box::new(e),
|
|
Some(format!("Fail to concat sort columns at {}", location!())),
|
|
)
|
|
})?;
|
|
|
|
let indices = sort_to_indices(&sort_column, opt, self.limit).map_err(|e| {
|
|
DataFusionError::ArrowError(
|
|
Box::new(e),
|
|
Some(format!("Fail to sort to indices at {}", location!())),
|
|
)
|
|
})?;
|
|
if indices.is_empty() {
|
|
return Ok(DfRecordBatch::new_empty(self.schema.clone()));
|
|
}
|
|
|
|
self.check_in_range(
|
|
&sort_column,
|
|
(
|
|
indices.value(0) as usize,
|
|
indices.value(indices.len() - 1) as usize,
|
|
),
|
|
)
|
|
.inspect_err(|_e| {
|
|
#[cfg(debug_assertions)]
|
|
common_telemetry::error!(
|
|
"Fail to check sort column in range at {}, current_idx: {}, num_rows: {}, err: {}",
|
|
self.partition,
|
|
self.cur_part_idx,
|
|
sort_column.len(),
|
|
_e
|
|
);
|
|
})?;
|
|
|
|
// reserve memory for the concat input and sorted output
|
|
let total_mem: usize = buffer.iter().map(|r| r.get_array_memory_size()).sum();
|
|
self.reservation.try_grow(total_mem * 2)?;
|
|
|
|
let full_input = concat_batches(&self.schema, &buffer).map_err(|e| {
|
|
DataFusionError::ArrowError(
|
|
Box::new(e),
|
|
Some(format!(
|
|
"Fail to concat input batches when sorting at {}",
|
|
location!()
|
|
)),
|
|
)
|
|
})?;
|
|
|
|
let sorted = take_record_batch(&full_input, &indices).map_err(|e| {
|
|
DataFusionError::ArrowError(
|
|
Box::new(e),
|
|
Some(format!(
|
|
"Fail to take result record batch when sorting at {}",
|
|
location!()
|
|
)),
|
|
)
|
|
})?;
|
|
|
|
drop(full_input);
|
|
// here remove both buffer and full_input memory
|
|
self.reservation.shrink(2 * total_mem);
|
|
Ok(sorted)
|
|
}
|
|
|
|
/// Internal method for sorting `Top` buffer (with limit).
|
|
fn sort_top_buffer(&mut self) -> datafusion_common::Result<DfRecordBatch> {
|
|
let Some(filter) = self.filter.clone() else {
|
|
return internal_err!(
|
|
"TopKDynamicFilters must be provided when sorting with limit at {}",
|
|
snafu::location!()
|
|
);
|
|
};
|
|
|
|
let new_top_buffer = TopK::try_new(
|
|
self.partition,
|
|
self.schema().clone(),
|
|
vec![],
|
|
[self.expression.clone()].into(),
|
|
self.limit.unwrap(),
|
|
self.context.session_config().batch_size(),
|
|
self.context.runtime_env(),
|
|
&self.root_metrics,
|
|
filter,
|
|
)?;
|
|
let PartSortBuffer::Top(top_k, _) =
|
|
std::mem::replace(&mut self.buffer, PartSortBuffer::Top(new_top_buffer, 0))
|
|
else {
|
|
unreachable!("buffer type is checked before and should be Top variant")
|
|
};
|
|
|
|
let mut result_stream = top_k.emit()?;
|
|
let mut placeholder_ctx = std::task::Context::from_waker(futures::task::noop_waker_ref());
|
|
let mut results = vec![];
|
|
// according to the current implementation of `TopK`, the result stream will always be ready
|
|
loop {
|
|
match result_stream.poll_next_unpin(&mut placeholder_ctx) {
|
|
Poll::Ready(Some(batch)) => {
|
|
let batch = batch?;
|
|
results.push(batch);
|
|
}
|
|
Poll::Pending => {
|
|
#[cfg(debug_assertions)]
|
|
unreachable!("TopK result stream should always be ready")
|
|
}
|
|
Poll::Ready(None) => {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
let concat_batch = concat_batches(&self.schema, &results).map_err(|e| {
|
|
DataFusionError::ArrowError(
|
|
Box::new(e),
|
|
Some(format!(
|
|
"Fail to concat top k result record batch when sorting at {}",
|
|
location!()
|
|
)),
|
|
)
|
|
})?;
|
|
|
|
Ok(concat_batch)
|
|
}
|
|
|
|
/// Sorts current buffer and returns `None` when there is nothing to emit.
|
|
fn sorted_buffer_if_non_empty(&mut self) -> datafusion_common::Result<Option<DfRecordBatch>> {
|
|
if self.buffer.is_empty() {
|
|
return Ok(None);
|
|
}
|
|
|
|
let sorted = self.sort_buffer()?;
|
|
if sorted.num_rows() == 0 {
|
|
Ok(None)
|
|
} else {
|
|
Ok(Some(sorted))
|
|
}
|
|
}
|
|
|
|
/// Try to split the input batch if it contains data that exceeds the current partition range.
|
|
///
|
|
/// When the input batch contains data that exceeds the current partition range, this function
|
|
/// will split the input batch into two parts, the first part is within the current partition
|
|
/// range will be merged and sorted with previous buffer, and the second part will be registered
|
|
/// to `evaluating_batch` for next polling.
|
|
///
|
|
/// **Group-based processing**: Ranges with the same primary end are grouped together.
|
|
/// We only sort and output when transitioning to a NEW group, not when moving between
|
|
/// ranges within the same group.
|
|
///
|
|
/// Returns `None` if the input batch is empty or fully within the current partition range
|
|
/// (or we're still collecting data within the same group), and `Some(batch)` when we've
|
|
/// completed a group and have sorted output. When operating in TopK (limit) mode, this
|
|
/// function will not emit intermediate batches; it only prepares state for a single final
|
|
/// output.
|
|
fn split_batch(
|
|
&mut self,
|
|
batch: DfRecordBatch,
|
|
) -> datafusion_common::Result<Option<DfRecordBatch>> {
|
|
if matches!(self.buffer, PartSortBuffer::Top(_, _)) {
|
|
self.split_batch_topk(batch)?;
|
|
return Ok(None);
|
|
}
|
|
|
|
self.split_batch_all(batch)
|
|
}
|
|
|
|
/// Specialized splitting logic for TopK (limit) mode.
|
|
///
|
|
/// We only emit once when the TopK buffer is fulfilled or when input is fully consumed.
|
|
/// When the buffer is fulfilled and we are about to enter a new group, we stop consuming
|
|
/// further ranges.
|
|
fn split_batch_topk(&mut self, batch: DfRecordBatch) -> datafusion_common::Result<()> {
|
|
if batch.num_rows() == 0 {
|
|
return Ok(());
|
|
}
|
|
|
|
let sort_column = self
|
|
.expression
|
|
.expr
|
|
.evaluate(&batch)?
|
|
.into_array(batch.num_rows())?;
|
|
|
|
let next_range_idx = self.try_find_next_range(&sort_column)?;
|
|
let Some(idx) = next_range_idx else {
|
|
self.push_buffer(batch)?;
|
|
// keep polling input for next batch
|
|
return Ok(());
|
|
};
|
|
|
|
let this_range = batch.slice(0, idx);
|
|
let remaining_range = batch.slice(idx, batch.num_rows() - idx);
|
|
if this_range.num_rows() != 0 {
|
|
self.push_buffer(this_range)?;
|
|
}
|
|
|
|
// Step to next proper PartitionRange
|
|
self.cur_part_idx += 1;
|
|
|
|
// If we've processed all partitions, mark completion.
|
|
if self.cur_part_idx >= self.partition_ranges.len() {
|
|
debug_assert!(remaining_range.num_rows() == 0);
|
|
self.input_complete = true;
|
|
return Ok(());
|
|
}
|
|
|
|
// Check if we're still in the same group
|
|
let in_same_group = self.is_in_current_group(self.cur_part_idx);
|
|
|
|
// When TopK is fulfilled and we are switching to a new group, stop consuming further ranges if possible.
|
|
// read from topk heap and determine whether we can stop earlier.
|
|
if !in_same_group && self.can_stop_early(&batch.schema())? {
|
|
self.input_complete = true;
|
|
self.evaluating_batch = None;
|
|
return Ok(());
|
|
}
|
|
|
|
// Transition to a new group if needed
|
|
if !in_same_group {
|
|
self.advance_to_next_group();
|
|
}
|
|
|
|
let next_sort_column = sort_column.slice(idx, batch.num_rows() - idx);
|
|
if self.try_find_next_range(&next_sort_column)?.is_some() {
|
|
// remaining batch still contains data that exceeds the current partition range
|
|
// register the remaining batch for next polling
|
|
self.evaluating_batch = Some(remaining_range);
|
|
} else if remaining_range.num_rows() != 0 {
|
|
// remaining batch is within the current partition range
|
|
// push to the buffer and continue polling
|
|
self.push_buffer(remaining_range)?;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
fn split_batch_all(
|
|
&mut self,
|
|
batch: DfRecordBatch,
|
|
) -> datafusion_common::Result<Option<DfRecordBatch>> {
|
|
if batch.num_rows() == 0 {
|
|
return Ok(None);
|
|
}
|
|
|
|
let sort_column = self
|
|
.expression
|
|
.expr
|
|
.evaluate(&batch)?
|
|
.into_array(batch.num_rows())?;
|
|
|
|
let next_range_idx = self.try_find_next_range(&sort_column)?;
|
|
let Some(idx) = next_range_idx else {
|
|
self.push_buffer(batch)?;
|
|
// keep polling input for next batch
|
|
return Ok(None);
|
|
};
|
|
|
|
let this_range = batch.slice(0, idx);
|
|
let remaining_range = batch.slice(idx, batch.num_rows() - idx);
|
|
if this_range.num_rows() != 0 {
|
|
self.push_buffer(this_range)?;
|
|
}
|
|
|
|
// Step to next proper PartitionRange
|
|
self.cur_part_idx += 1;
|
|
|
|
// If we've processed all partitions, sort and output
|
|
if self.cur_part_idx >= self.partition_ranges.len() {
|
|
// assert there is no data beyond the last partition range (remaining is empty).
|
|
debug_assert!(remaining_range.num_rows() == 0);
|
|
|
|
// Sort and output the final group
|
|
return self.sorted_buffer_if_non_empty();
|
|
}
|
|
|
|
// Check if we're still in the same group
|
|
if self.is_in_current_group(self.cur_part_idx) {
|
|
// Same group - don't sort yet, keep collecting
|
|
let next_sort_column = sort_column.slice(idx, batch.num_rows() - idx);
|
|
if self.try_find_next_range(&next_sort_column)?.is_some() {
|
|
// remaining batch still contains data that exceeds the current partition range
|
|
self.evaluating_batch = Some(remaining_range);
|
|
} else {
|
|
// remaining batch is within the current partition range
|
|
if remaining_range.num_rows() != 0 {
|
|
self.push_buffer(remaining_range)?;
|
|
}
|
|
}
|
|
// Return None to continue collecting within the same group
|
|
return Ok(None);
|
|
}
|
|
|
|
// Transitioning to a new group - sort current group and output
|
|
let sorted_batch = self.sorted_buffer_if_non_empty()?;
|
|
self.advance_to_next_group();
|
|
|
|
let next_sort_column = sort_column.slice(idx, batch.num_rows() - idx);
|
|
if self.try_find_next_range(&next_sort_column)?.is_some() {
|
|
// remaining batch still contains data that exceeds the current partition range
|
|
// register the remaining batch for next polling
|
|
self.evaluating_batch = Some(remaining_range);
|
|
} else {
|
|
// remaining batch is within the current partition range
|
|
// push to the buffer and continue polling
|
|
if remaining_range.num_rows() != 0 {
|
|
self.push_buffer(remaining_range)?;
|
|
}
|
|
}
|
|
|
|
Ok(sorted_batch)
|
|
}
|
|
|
|
pub fn poll_next_inner(
|
|
mut self: Pin<&mut Self>,
|
|
cx: &mut Context<'_>,
|
|
) -> Poll<Option<datafusion_common::Result<DfRecordBatch>>> {
|
|
loop {
|
|
if self.input_complete {
|
|
if let Some(sorted_batch) = self.sorted_buffer_if_non_empty()? {
|
|
return Poll::Ready(Some(Ok(sorted_batch)));
|
|
}
|
|
return Poll::Ready(None);
|
|
}
|
|
|
|
// if there is a remaining batch being evaluated from last run,
|
|
// split on it instead of fetching new batch
|
|
if let Some(evaluating_batch) = self.evaluating_batch.take()
|
|
&& evaluating_batch.num_rows() != 0
|
|
{
|
|
// Check if we've already processed all partitions
|
|
if self.cur_part_idx >= self.partition_ranges.len() {
|
|
// All partitions processed, discard remaining data
|
|
if let Some(sorted_batch) = self.sorted_buffer_if_non_empty()? {
|
|
return Poll::Ready(Some(Ok(sorted_batch)));
|
|
}
|
|
return Poll::Ready(None);
|
|
}
|
|
|
|
if let Some(sorted_batch) = self.split_batch(evaluating_batch)? {
|
|
return Poll::Ready(Some(Ok(sorted_batch)));
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// fetch next batch from input
|
|
let res = self.input.as_mut().poll_next(cx);
|
|
match res {
|
|
Poll::Ready(Some(Ok(batch))) => {
|
|
if let Some(sorted_batch) = self.split_batch(batch)? {
|
|
return Poll::Ready(Some(Ok(sorted_batch)));
|
|
}
|
|
}
|
|
// input stream end, mark and continue
|
|
Poll::Ready(None) => {
|
|
self.input_complete = true;
|
|
}
|
|
Poll::Ready(Some(Err(e))) => return Poll::Ready(Some(Err(e))),
|
|
Poll::Pending => return Poll::Pending,
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Stream for PartSortStream {
|
|
type Item = datafusion_common::Result<DfRecordBatch>;
|
|
|
|
fn poll_next(
|
|
mut self: Pin<&mut Self>,
|
|
cx: &mut Context<'_>,
|
|
) -> Poll<Option<datafusion_common::Result<DfRecordBatch>>> {
|
|
let result = self.as_mut().poll_next_inner(cx);
|
|
self.metrics.record_poll(result)
|
|
}
|
|
}
|
|
|
|
impl RecordBatchStream for PartSortStream {
|
|
fn schema(&self) -> SchemaRef {
|
|
self.schema.clone()
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod test {
|
|
use std::sync::Arc;
|
|
|
|
use arrow::array::{
|
|
TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
|
|
TimestampSecondArray,
|
|
};
|
|
use arrow::json::ArrayWriter;
|
|
use arrow_schema::{DataType, Field, Schema, SortOptions, TimeUnit};
|
|
use common_time::Timestamp;
|
|
use datafusion_physical_expr::expressions::Column;
|
|
use futures::StreamExt;
|
|
use store_api::region_engine::PartitionRange;
|
|
|
|
use super::*;
|
|
use crate::test_util::{MockInputExec, new_ts_array};
|
|
|
|
#[tokio::test]
|
|
async fn test_can_stop_early_with_empty_topk_buffer() {
|
|
let unit = TimeUnit::Millisecond;
|
|
let schema = Arc::new(Schema::new(vec![Field::new(
|
|
"ts",
|
|
DataType::Timestamp(unit, None),
|
|
false,
|
|
)]));
|
|
|
|
// Build a minimal PartSortExec and stream, but inject a dynamic filter that
|
|
// always evaluates to false so TopK will filter out all rows internally.
|
|
let mock_input = Arc::new(MockInputExec::new(vec![vec![]], schema.clone()));
|
|
let exec = PartSortExec::try_new(
|
|
PhysicalSortExpr {
|
|
expr: Arc::new(Column::new("ts", 0)),
|
|
options: SortOptions {
|
|
descending: true,
|
|
..Default::default()
|
|
},
|
|
},
|
|
Some(3),
|
|
vec![vec![]],
|
|
mock_input.clone(),
|
|
)
|
|
.unwrap();
|
|
|
|
let filter = Arc::new(RwLock::new(TopKDynamicFilters::new(Arc::new(
|
|
DynamicFilterPhysicalExpr::new(vec![], lit(false)),
|
|
))));
|
|
|
|
let input_stream = mock_input
|
|
.execute(0, Arc::new(TaskContext::default()))
|
|
.unwrap();
|
|
let mut stream = PartSortStream::new(
|
|
Arc::new(TaskContext::default()),
|
|
&exec,
|
|
Some(3),
|
|
input_stream,
|
|
vec![],
|
|
0,
|
|
Some(filter),
|
|
)
|
|
.unwrap();
|
|
|
|
// Push 3 rows so the external counter reaches `limit`, while TopK keeps no rows.
|
|
let batch = DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![1, 2, 3])])
|
|
.unwrap();
|
|
stream.push_buffer(batch).unwrap();
|
|
|
|
// The TopK result buffer is empty, so we cannot determine early-stop.
|
|
// Ensure this path returns `Ok(false)` (and, importantly, does not panic).
|
|
assert!(!stream.can_stop_early(&schema).unwrap());
|
|
}
|
|
|
|
#[ignore = "hard to gen expected data correctly here, TODO(discord9): fix it later"]
|
|
#[tokio::test]
|
|
async fn fuzzy_test() {
|
|
let test_cnt = 100;
|
|
// bound for total count of PartitionRange
|
|
let part_cnt_bound = 100;
|
|
// bound for timestamp range size and offset for each PartitionRange
|
|
let range_size_bound = 100;
|
|
let range_offset_bound = 100;
|
|
// bound for batch count and size within each PartitionRange
|
|
let batch_cnt_bound = 20;
|
|
let batch_size_bound = 100;
|
|
|
|
let mut rng = fastrand::Rng::new();
|
|
rng.seed(1337);
|
|
|
|
let mut test_cases = Vec::new();
|
|
|
|
for case_id in 0..test_cnt {
|
|
let mut bound_val: Option<i64> = None;
|
|
let descending = rng.bool();
|
|
let nulls_first = rng.bool();
|
|
let opt = SortOptions {
|
|
descending,
|
|
nulls_first,
|
|
};
|
|
let limit = if rng.bool() {
|
|
Some(rng.usize(1..batch_cnt_bound * batch_size_bound))
|
|
} else {
|
|
None
|
|
};
|
|
let unit = match rng.u8(0..3) {
|
|
0 => TimeUnit::Second,
|
|
1 => TimeUnit::Millisecond,
|
|
2 => TimeUnit::Microsecond,
|
|
_ => TimeUnit::Nanosecond,
|
|
};
|
|
|
|
let schema = Schema::new(vec![Field::new(
|
|
"ts",
|
|
DataType::Timestamp(unit, None),
|
|
false,
|
|
)]);
|
|
let schema = Arc::new(schema);
|
|
|
|
let mut input_ranged_data = vec![];
|
|
let mut output_ranges = vec![];
|
|
let mut output_data = vec![];
|
|
// generate each input `PartitionRange`
|
|
for part_id in 0..rng.usize(0..part_cnt_bound) {
|
|
// generate each `PartitionRange`'s timestamp range
|
|
let (start, end) = if descending {
|
|
// Use 1..=range_offset_bound to ensure strictly decreasing end values
|
|
let end = bound_val
|
|
.map(
|
|
|i| i
|
|
.checked_sub(rng.i64(1..=range_offset_bound))
|
|
.expect("Bad luck, fuzzy test generate data that will overflow, change seed and try again")
|
|
)
|
|
.unwrap_or_else(|| rng.i64(-100000000..100000000));
|
|
bound_val = Some(end);
|
|
let start = end - rng.i64(1..range_size_bound);
|
|
let start = Timestamp::new(start, unit.into());
|
|
let end = Timestamp::new(end, unit.into());
|
|
(start, end)
|
|
} else {
|
|
// Use 1..=range_offset_bound to ensure strictly increasing start values
|
|
let start = bound_val
|
|
.map(|i| i + rng.i64(1..=range_offset_bound))
|
|
.unwrap_or_else(|| rng.i64(..));
|
|
bound_val = Some(start);
|
|
let end = start + rng.i64(1..range_size_bound);
|
|
let start = Timestamp::new(start, unit.into());
|
|
let end = Timestamp::new(end, unit.into());
|
|
(start, end)
|
|
};
|
|
assert!(start < end);
|
|
|
|
let mut per_part_sort_data = vec![];
|
|
let mut batches = vec![];
|
|
for _batch_idx in 0..rng.usize(1..batch_cnt_bound) {
|
|
let cnt = rng.usize(0..batch_size_bound) + 1;
|
|
let iter = 0..rng.usize(0..cnt);
|
|
let mut data_gen = iter
|
|
.map(|_| rng.i64(start.value()..end.value()))
|
|
.collect_vec();
|
|
if data_gen.is_empty() {
|
|
// current batch is empty, skip
|
|
continue;
|
|
}
|
|
// mito always sort on ASC order
|
|
data_gen.sort();
|
|
per_part_sort_data.extend(data_gen.clone());
|
|
let arr = new_ts_array(unit, data_gen.clone());
|
|
let batch = DfRecordBatch::try_new(schema.clone(), vec![arr]).unwrap();
|
|
batches.push(batch);
|
|
}
|
|
|
|
let range = PartitionRange {
|
|
start,
|
|
end,
|
|
num_rows: batches.iter().map(|b| b.num_rows()).sum(),
|
|
identifier: part_id,
|
|
};
|
|
input_ranged_data.push((range, batches));
|
|
|
|
output_ranges.push(range);
|
|
if per_part_sort_data.is_empty() {
|
|
continue;
|
|
}
|
|
output_data.extend_from_slice(&per_part_sort_data);
|
|
}
|
|
|
|
// adjust output data with adjacent PartitionRanges
|
|
let mut output_data_iter = output_data.iter().peekable();
|
|
let mut output_data = vec![];
|
|
for range in output_ranges.clone() {
|
|
let mut cur_data = vec![];
|
|
while let Some(val) = output_data_iter.peek() {
|
|
if **val < range.start.value() || **val >= range.end.value() {
|
|
break;
|
|
}
|
|
cur_data.push(*output_data_iter.next().unwrap());
|
|
}
|
|
|
|
if cur_data.is_empty() {
|
|
continue;
|
|
}
|
|
|
|
if descending {
|
|
cur_data.sort_by(|a, b| b.cmp(a));
|
|
} else {
|
|
cur_data.sort();
|
|
}
|
|
output_data.push(cur_data);
|
|
}
|
|
|
|
let expected_output = if let Some(limit) = limit {
|
|
let mut accumulated = Vec::new();
|
|
let mut seen = 0usize;
|
|
for mut range_values in output_data {
|
|
seen += range_values.len();
|
|
accumulated.append(&mut range_values);
|
|
if seen >= limit {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if accumulated.is_empty() {
|
|
None
|
|
} else {
|
|
if descending {
|
|
accumulated.sort_by(|a, b| b.cmp(a));
|
|
} else {
|
|
accumulated.sort();
|
|
}
|
|
accumulated.truncate(limit.min(accumulated.len()));
|
|
|
|
Some(
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, accumulated)],
|
|
)
|
|
.unwrap(),
|
|
)
|
|
}
|
|
} else {
|
|
let batches = output_data
|
|
.into_iter()
|
|
.map(|a| {
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, a)]).unwrap()
|
|
})
|
|
.collect_vec();
|
|
if batches.is_empty() {
|
|
None
|
|
} else {
|
|
Some(concat_batches(&schema, &batches).unwrap())
|
|
}
|
|
};
|
|
|
|
test_cases.push((
|
|
case_id,
|
|
unit,
|
|
input_ranged_data,
|
|
schema,
|
|
opt,
|
|
limit,
|
|
expected_output,
|
|
));
|
|
}
|
|
|
|
for (case_id, _unit, input_ranged_data, schema, opt, limit, expected_output) in test_cases {
|
|
run_test(
|
|
case_id,
|
|
input_ranged_data,
|
|
schema,
|
|
opt,
|
|
limit,
|
|
expected_output,
|
|
None,
|
|
)
|
|
.await;
|
|
}
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn simple_cases() {
|
|
let testcases = vec![
|
|
(
|
|
TimeUnit::Millisecond,
|
|
vec![
|
|
((0, 10), vec![vec![1, 2, 3], vec![4, 5, 6], vec![7, 8, 9]]),
|
|
((5, 10), vec![vec![5, 6], vec![7, 8]]),
|
|
],
|
|
false,
|
|
None,
|
|
vec![vec![1, 2, 3, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9]],
|
|
),
|
|
// Case 1: Descending sort with overlapping ranges that have the same primary end (end=10).
|
|
// Ranges [5,10) and [0,10) are grouped together, so their data is merged before sorting.
|
|
(
|
|
TimeUnit::Millisecond,
|
|
vec![
|
|
((5, 10), vec![vec![5, 6], vec![7, 8, 9]]),
|
|
((0, 10), vec![vec![1, 2, 3], vec![4, 5, 6], vec![7, 8]]),
|
|
],
|
|
true,
|
|
None,
|
|
vec![vec![9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 3, 2, 1]],
|
|
),
|
|
(
|
|
TimeUnit::Millisecond,
|
|
vec![
|
|
((5, 10), vec![]),
|
|
((0, 10), vec![vec![1, 2, 3], vec![4, 5, 6], vec![7, 8]]),
|
|
],
|
|
true,
|
|
None,
|
|
vec![vec![8, 7, 6, 5, 4, 3, 2, 1]],
|
|
),
|
|
(
|
|
TimeUnit::Millisecond,
|
|
vec![
|
|
((15, 20), vec![vec![17, 18, 19]]),
|
|
((10, 15), vec![]),
|
|
((5, 10), vec![]),
|
|
((0, 10), vec![vec![1, 2, 3], vec![4, 5, 6], vec![7, 8]]),
|
|
],
|
|
true,
|
|
None,
|
|
vec![vec![19, 18, 17], vec![8, 7, 6, 5, 4, 3, 2, 1]],
|
|
),
|
|
(
|
|
TimeUnit::Millisecond,
|
|
vec![
|
|
((15, 20), vec![]),
|
|
((10, 15), vec![]),
|
|
((5, 10), vec![]),
|
|
((0, 10), vec![]),
|
|
],
|
|
true,
|
|
None,
|
|
vec![],
|
|
),
|
|
// Case 5: Data from one batch spans multiple ranges. Ranges with same end are grouped.
|
|
// Ranges: [15,20) end=20, [10,15) end=15, [5,10) end=10, [0,10) end=10
|
|
// Groups: {[15,20)}, {[10,15)}, {[5,10), [0,10)}
|
|
// The last two ranges are merged because they share end=10.
|
|
(
|
|
TimeUnit::Millisecond,
|
|
vec![
|
|
(
|
|
(15, 20),
|
|
vec![vec![15, 17, 19, 10, 11, 12, 5, 6, 7, 8, 9, 1, 2, 3, 4]],
|
|
),
|
|
((10, 15), vec![]),
|
|
((5, 10), vec![]),
|
|
((0, 10), vec![]),
|
|
],
|
|
true,
|
|
None,
|
|
vec![
|
|
vec![19, 17, 15],
|
|
vec![12, 11, 10],
|
|
vec![9, 8, 7, 6, 5, 4, 3, 2, 1],
|
|
],
|
|
),
|
|
(
|
|
TimeUnit::Millisecond,
|
|
vec![
|
|
(
|
|
(15, 20),
|
|
vec![vec![15, 17, 19, 10, 11, 12, 5, 6, 7, 8, 9, 1, 2, 3, 4]],
|
|
),
|
|
((10, 15), vec![]),
|
|
((5, 10), vec![]),
|
|
((0, 10), vec![]),
|
|
],
|
|
true,
|
|
Some(2),
|
|
vec![vec![19, 17]],
|
|
),
|
|
];
|
|
|
|
for (identifier, (unit, input_ranged_data, descending, limit, expected_output)) in
|
|
testcases.into_iter().enumerate()
|
|
{
|
|
let schema = Schema::new(vec![Field::new(
|
|
"ts",
|
|
DataType::Timestamp(unit, None),
|
|
false,
|
|
)]);
|
|
let schema = Arc::new(schema);
|
|
let opt = SortOptions {
|
|
descending,
|
|
..Default::default()
|
|
};
|
|
|
|
let input_ranged_data = input_ranged_data
|
|
.into_iter()
|
|
.map(|(range, data)| {
|
|
let part = PartitionRange {
|
|
start: Timestamp::new(range.0, unit.into()),
|
|
end: Timestamp::new(range.1, unit.into()),
|
|
num_rows: data.iter().map(|b| b.len()).sum(),
|
|
identifier,
|
|
};
|
|
|
|
let batches = data
|
|
.into_iter()
|
|
.map(|b| {
|
|
let arr = new_ts_array(unit, b);
|
|
DfRecordBatch::try_new(schema.clone(), vec![arr]).unwrap()
|
|
})
|
|
.collect_vec();
|
|
(part, batches)
|
|
})
|
|
.collect_vec();
|
|
|
|
let expected_output = expected_output
|
|
.into_iter()
|
|
.map(|a| {
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, a)]).unwrap()
|
|
})
|
|
.collect_vec();
|
|
let expected_output = if expected_output.is_empty() {
|
|
None
|
|
} else {
|
|
Some(concat_batches(&schema, &expected_output).unwrap())
|
|
};
|
|
|
|
run_test(
|
|
identifier,
|
|
input_ranged_data,
|
|
schema.clone(),
|
|
opt,
|
|
limit,
|
|
expected_output,
|
|
None,
|
|
)
|
|
.await;
|
|
}
|
|
}
|
|
|
|
#[allow(clippy::print_stdout)]
|
|
async fn run_test(
|
|
case_id: usize,
|
|
input_ranged_data: Vec<(PartitionRange, Vec<DfRecordBatch>)>,
|
|
schema: SchemaRef,
|
|
opt: SortOptions,
|
|
limit: Option<usize>,
|
|
expected_output: Option<DfRecordBatch>,
|
|
expected_polled_rows: Option<usize>,
|
|
) {
|
|
if let (Some(limit), Some(rb)) = (limit, &expected_output) {
|
|
assert!(
|
|
rb.num_rows() <= limit,
|
|
"Expect row count in expected output({}) <= limit({})",
|
|
rb.num_rows(),
|
|
limit
|
|
);
|
|
}
|
|
|
|
let mut data_partition = Vec::with_capacity(input_ranged_data.len());
|
|
let mut ranges = Vec::with_capacity(input_ranged_data.len());
|
|
for (part_range, batches) in input_ranged_data {
|
|
data_partition.push(batches);
|
|
ranges.push(part_range);
|
|
}
|
|
|
|
let mock_input = Arc::new(MockInputExec::new(data_partition, schema.clone()));
|
|
|
|
let exec = PartSortExec::try_new(
|
|
PhysicalSortExpr {
|
|
expr: Arc::new(Column::new("ts", 0)),
|
|
options: opt,
|
|
},
|
|
limit,
|
|
vec![ranges.clone()],
|
|
mock_input.clone(),
|
|
)
|
|
.unwrap();
|
|
|
|
let exec_stream = exec.execute(0, Arc::new(TaskContext::default())).unwrap();
|
|
|
|
let real_output = exec_stream.map(|r| r.unwrap()).collect::<Vec<_>>().await;
|
|
if limit.is_some() {
|
|
assert!(
|
|
real_output.len() <= 1,
|
|
"case_{case_id} expects a single output batch when limit is set, got {}",
|
|
real_output.len()
|
|
);
|
|
}
|
|
|
|
let actual_output = if real_output.is_empty() {
|
|
None
|
|
} else {
|
|
Some(concat_batches(&schema, &real_output).unwrap())
|
|
};
|
|
|
|
if let Some(expected_polled_rows) = expected_polled_rows {
|
|
let input_pulled_rows = mock_input.metrics().unwrap().output_rows().unwrap();
|
|
assert_eq!(input_pulled_rows, expected_polled_rows);
|
|
}
|
|
|
|
match (actual_output, expected_output) {
|
|
(None, None) => {}
|
|
(Some(actual), Some(expected)) => {
|
|
if actual != expected {
|
|
let mut actual_json: Vec<u8> = Vec::new();
|
|
let mut writer = ArrayWriter::new(&mut actual_json);
|
|
writer.write(&actual).unwrap();
|
|
writer.finish().unwrap();
|
|
|
|
let mut expected_json: Vec<u8> = Vec::new();
|
|
let mut writer = ArrayWriter::new(&mut expected_json);
|
|
writer.write(&expected).unwrap();
|
|
writer.finish().unwrap();
|
|
|
|
panic!(
|
|
"case_{} failed (limit {limit:?}), opt: {:?},\nreal_output: {}\nexpected: {}",
|
|
case_id,
|
|
opt,
|
|
String::from_utf8_lossy(&actual_json),
|
|
String::from_utf8_lossy(&expected_json),
|
|
);
|
|
}
|
|
}
|
|
(None, Some(expected)) => panic!(
|
|
"case_{} failed (limit {limit:?}), opt: {:?},\nreal output is empty, expected {} rows",
|
|
case_id,
|
|
opt,
|
|
expected.num_rows()
|
|
),
|
|
(Some(actual), None) => panic!(
|
|
"case_{} failed (limit {limit:?}), opt: {:?},\nreal output has {} rows, expected empty",
|
|
case_id,
|
|
opt,
|
|
actual.num_rows()
|
|
),
|
|
}
|
|
}
|
|
|
|
/// Test that verifies the limit is correctly applied per partition when
|
|
/// multiple batches are received for the same partition.
|
|
#[tokio::test]
|
|
async fn test_limit_with_multiple_batches_per_partition() {
|
|
let unit = TimeUnit::Millisecond;
|
|
let schema = Arc::new(Schema::new(vec![Field::new(
|
|
"ts",
|
|
DataType::Timestamp(unit, None),
|
|
false,
|
|
)]));
|
|
|
|
// Test case: Multiple batches in a single partition with limit=3
|
|
// Input: 3 batches with [1,2,3], [4,5,6], [7,8,9] all in partition (0,10)
|
|
// Expected: Only top 3 values [9,8,7] for descending sort
|
|
let input_ranged_data = vec![(
|
|
PartitionRange {
|
|
start: Timestamp::new(0, unit.into()),
|
|
end: Timestamp::new(10, unit.into()),
|
|
num_rows: 9,
|
|
identifier: 0,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![1, 2, 3])])
|
|
.unwrap(),
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![4, 5, 6])])
|
|
.unwrap(),
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![7, 8, 9])])
|
|
.unwrap(),
|
|
],
|
|
)];
|
|
|
|
let expected_output = Some(
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![9, 8, 7])])
|
|
.unwrap(),
|
|
);
|
|
|
|
run_test(
|
|
1000,
|
|
input_ranged_data,
|
|
schema.clone(),
|
|
SortOptions {
|
|
descending: true,
|
|
..Default::default()
|
|
},
|
|
Some(3),
|
|
expected_output,
|
|
None,
|
|
)
|
|
.await;
|
|
|
|
// Test case: Multiple batches across multiple partitions with limit=2
|
|
// Partition 0: batches [10,11,12], [13,14,15] -> top 2 descending = [15,14]
|
|
// Partition 1: batches [1,2,3], [4,5] -> top 2 descending = [5,4]
|
|
let input_ranged_data = vec![
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(10, unit.into()),
|
|
end: Timestamp::new(20, unit.into()),
|
|
num_rows: 6,
|
|
identifier: 0,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![10, 11, 12])],
|
|
)
|
|
.unwrap(),
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![13, 14, 15])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(0, unit.into()),
|
|
end: Timestamp::new(10, unit.into()),
|
|
num_rows: 5,
|
|
identifier: 1,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![1, 2, 3])])
|
|
.unwrap(),
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![4, 5])])
|
|
.unwrap(),
|
|
],
|
|
),
|
|
];
|
|
|
|
let expected_output = Some(
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![15, 14])]).unwrap(),
|
|
);
|
|
|
|
run_test(
|
|
1001,
|
|
input_ranged_data,
|
|
schema.clone(),
|
|
SortOptions {
|
|
descending: true,
|
|
..Default::default()
|
|
},
|
|
Some(2),
|
|
expected_output,
|
|
None,
|
|
)
|
|
.await;
|
|
|
|
// Test case: Ascending sort with limit
|
|
// Partition: batches [7,8,9], [4,5,6], [1,2,3] -> top 2 ascending = [1,2]
|
|
let input_ranged_data = vec![(
|
|
PartitionRange {
|
|
start: Timestamp::new(0, unit.into()),
|
|
end: Timestamp::new(10, unit.into()),
|
|
num_rows: 9,
|
|
identifier: 0,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![7, 8, 9])])
|
|
.unwrap(),
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![4, 5, 6])])
|
|
.unwrap(),
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![1, 2, 3])])
|
|
.unwrap(),
|
|
],
|
|
)];
|
|
|
|
let expected_output = Some(
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![1, 2])]).unwrap(),
|
|
);
|
|
|
|
run_test(
|
|
1002,
|
|
input_ranged_data,
|
|
schema.clone(),
|
|
SortOptions {
|
|
descending: false,
|
|
..Default::default()
|
|
},
|
|
Some(2),
|
|
expected_output,
|
|
None,
|
|
)
|
|
.await;
|
|
}
|
|
|
|
/// Test that verifies early termination behavior.
|
|
/// Once we've produced limit * num_partitions rows, we should stop
|
|
/// pulling from input stream.
|
|
#[tokio::test]
|
|
async fn test_early_termination() {
|
|
let unit = TimeUnit::Millisecond;
|
|
let schema = Arc::new(Schema::new(vec![Field::new(
|
|
"ts",
|
|
DataType::Timestamp(unit, None),
|
|
false,
|
|
)]));
|
|
|
|
// Create 3 partitions, each with more data than the limit
|
|
// limit=2 per partition, so total expected output = 6 rows
|
|
// After producing 6 rows, early termination should kick in
|
|
// For descending sort, ranges must be ordered by (end DESC, start DESC)
|
|
let input_ranged_data = vec![
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(20, unit.into()),
|
|
end: Timestamp::new(30, unit.into()),
|
|
num_rows: 10,
|
|
identifier: 2,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![21, 22, 23, 24, 25])],
|
|
)
|
|
.unwrap(),
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![26, 27, 28, 29, 30])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(10, unit.into()),
|
|
end: Timestamp::new(20, unit.into()),
|
|
num_rows: 10,
|
|
identifier: 1,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![11, 12, 13, 14, 15])],
|
|
)
|
|
.unwrap(),
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![16, 17, 18, 19, 20])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(0, unit.into()),
|
|
end: Timestamp::new(10, unit.into()),
|
|
num_rows: 10,
|
|
identifier: 0,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![1, 2, 3, 4, 5])],
|
|
)
|
|
.unwrap(),
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![6, 7, 8, 9, 10])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
];
|
|
|
|
// PartSort won't reorder `PartitionRange` (it assumes it's already ordered), so it will not read other partitions.
|
|
// This case is just to verify that early termination works as expected.
|
|
// First partition [20, 30) produces top 2 values: 29, 28
|
|
let expected_output = Some(
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![29, 28])]).unwrap(),
|
|
);
|
|
|
|
run_test(
|
|
1003,
|
|
input_ranged_data,
|
|
schema.clone(),
|
|
SortOptions {
|
|
descending: true,
|
|
..Default::default()
|
|
},
|
|
Some(2),
|
|
expected_output,
|
|
Some(10),
|
|
)
|
|
.await;
|
|
}
|
|
|
|
/// Example:
|
|
/// - Range [70, 100) has data [80, 90, 95]
|
|
/// - Range [50, 100) has data [55, 65, 75, 85, 95]
|
|
#[tokio::test]
|
|
async fn test_primary_end_grouping_with_limit() {
|
|
let unit = TimeUnit::Millisecond;
|
|
let schema = Arc::new(Schema::new(vec![Field::new(
|
|
"ts",
|
|
DataType::Timestamp(unit, None),
|
|
false,
|
|
)]));
|
|
|
|
// Two ranges with the same end (100) - they should be grouped together
|
|
// For descending, ranges are ordered by (end DESC, start DESC)
|
|
// So [70, 100) comes before [50, 100) (70 > 50)
|
|
let input_ranged_data = vec![
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(70, unit.into()),
|
|
end: Timestamp::new(100, unit.into()),
|
|
num_rows: 3,
|
|
identifier: 0,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![80, 90, 95])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(50, unit.into()),
|
|
end: Timestamp::new(100, unit.into()),
|
|
num_rows: 5,
|
|
identifier: 1,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![55, 65, 75, 85, 95])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
];
|
|
|
|
// With limit=4, descending: top 4 values from combined data
|
|
// Combined: [80, 90, 95, 55, 65, 75, 85, 95] -> sorted desc: [95, 95, 90, 85, 80, 75, 65, 55]
|
|
// Top 4: [95, 95, 90, 85]
|
|
let expected_output = Some(
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![95, 95, 90, 85])],
|
|
)
|
|
.unwrap(),
|
|
);
|
|
|
|
run_test(
|
|
2000,
|
|
input_ranged_data,
|
|
schema.clone(),
|
|
SortOptions {
|
|
descending: true,
|
|
..Default::default()
|
|
},
|
|
Some(4),
|
|
expected_output,
|
|
None,
|
|
)
|
|
.await;
|
|
}
|
|
|
|
/// Test case with three ranges demonstrating the "keep pulling" behavior.
|
|
/// After processing ranges with end=100, the smallest value in top-k might still
|
|
/// be reachable by the next group.
|
|
///
|
|
/// Ranges: [70, 100), [50, 100), [40, 95)
|
|
/// With descending sort and limit=4:
|
|
/// - Group 1 (end=100): [70, 100) and [50, 100) merged
|
|
/// - Group 2 (end=95): [40, 95)
|
|
/// After group 1, smallest in top-4 is 85. Range [40, 95) could have values >= 85,
|
|
/// so we continue to group 2.
|
|
#[tokio::test]
|
|
async fn test_three_ranges_keep_pulling() {
|
|
let unit = TimeUnit::Millisecond;
|
|
let schema = Arc::new(Schema::new(vec![Field::new(
|
|
"ts",
|
|
DataType::Timestamp(unit, None),
|
|
false,
|
|
)]));
|
|
|
|
// Three ranges, two with same end (100), one with different end (95)
|
|
let input_ranged_data = vec![
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(70, unit.into()),
|
|
end: Timestamp::new(100, unit.into()),
|
|
num_rows: 3,
|
|
identifier: 0,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![80, 90, 95])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(50, unit.into()),
|
|
end: Timestamp::new(100, unit.into()),
|
|
num_rows: 3,
|
|
identifier: 1,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![55, 75, 85])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(40, unit.into()),
|
|
end: Timestamp::new(95, unit.into()),
|
|
num_rows: 3,
|
|
identifier: 2,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![45, 65, 94])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
];
|
|
|
|
// All data: [80, 90, 95, 55, 75, 85, 45, 65, 94]
|
|
// Sorted descending: [95, 94, 90, 85, 80, 75, 65, 55, 45]
|
|
// With limit=4: should be top 4 largest values across all ranges: [95, 94, 90, 85]
|
|
let expected_output = Some(
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![95, 94, 90, 85])],
|
|
)
|
|
.unwrap(),
|
|
);
|
|
|
|
run_test(
|
|
2001,
|
|
input_ranged_data,
|
|
schema.clone(),
|
|
SortOptions {
|
|
descending: true,
|
|
..Default::default()
|
|
},
|
|
Some(4),
|
|
expected_output,
|
|
None,
|
|
)
|
|
.await;
|
|
}
|
|
|
|
/// Test early termination based on threshold comparison with next group.
|
|
/// When the threshold (smallest value for descending) is >= next group's primary end,
|
|
/// we can stop early because the next group cannot have better values.
|
|
#[tokio::test]
|
|
async fn test_threshold_based_early_termination() {
|
|
let unit = TimeUnit::Millisecond;
|
|
let schema = Arc::new(Schema::new(vec![Field::new(
|
|
"ts",
|
|
DataType::Timestamp(unit, None),
|
|
false,
|
|
)]));
|
|
|
|
// Group 1 (end=100) has 6 rows, TopK will keep top 4
|
|
// Group 2 (end=90) has 3 rows - should NOT be processed because
|
|
// threshold (96) >= next_primary_end (90)
|
|
let input_ranged_data = vec![
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(70, unit.into()),
|
|
end: Timestamp::new(100, unit.into()),
|
|
num_rows: 6,
|
|
identifier: 0,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![94, 95, 96, 97, 98, 99])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(50, unit.into()),
|
|
end: Timestamp::new(90, unit.into()),
|
|
num_rows: 3,
|
|
identifier: 1,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![85, 86, 87])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
];
|
|
|
|
// With limit=4, descending: top 4 from group 1 are [99, 98, 97, 96]
|
|
// Threshold is 96, next group's primary_end is 90
|
|
// Since 96 >= 90, we stop after group 1
|
|
let expected_output = Some(
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![99, 98, 97, 96])],
|
|
)
|
|
.unwrap(),
|
|
);
|
|
|
|
run_test(
|
|
2002,
|
|
input_ranged_data,
|
|
schema.clone(),
|
|
SortOptions {
|
|
descending: true,
|
|
..Default::default()
|
|
},
|
|
Some(4),
|
|
expected_output,
|
|
Some(9), // Pull both batches since all rows fall within the first range
|
|
)
|
|
.await;
|
|
}
|
|
|
|
/// Test that we continue to next group when threshold is within next group's range.
|
|
/// Even after fulfilling limit, if threshold < next_primary_end (descending),
|
|
/// we would need to continue... but limit exhaustion stops us first.
|
|
#[tokio::test]
|
|
async fn test_continue_when_threshold_in_next_group_range() {
|
|
let unit = TimeUnit::Millisecond;
|
|
let schema = Arc::new(Schema::new(vec![Field::new(
|
|
"ts",
|
|
DataType::Timestamp(unit, None),
|
|
false,
|
|
)]));
|
|
|
|
// Group 1 (end=100) has 6 rows, TopK will keep top 4
|
|
// Group 2 (end=98) has 3 rows - threshold (96) < 98, so next group
|
|
// could theoretically have better values. Continue reading.
|
|
let input_ranged_data = vec![
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(90, unit.into()),
|
|
end: Timestamp::new(100, unit.into()),
|
|
num_rows: 6,
|
|
identifier: 0,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![94, 95, 96, 97, 98, 99])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(50, unit.into()),
|
|
end: Timestamp::new(98, unit.into()),
|
|
num_rows: 3,
|
|
identifier: 1,
|
|
},
|
|
vec![
|
|
// Values must be < 70 (outside group 1's range) to avoid ambiguity
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![55, 60, 65])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
];
|
|
|
|
// With limit=4, we get [99, 98, 97, 96] from group 1
|
|
// Threshold is 96, next group's primary_end is 98
|
|
// 96 < 98, so threshold check says "could continue"
|
|
// But limit is exhausted (0), so we stop anyway
|
|
let expected_output = Some(
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![99, 98, 97, 96])],
|
|
)
|
|
.unwrap(),
|
|
);
|
|
|
|
// Note: We pull 9 rows (both batches) because we need to read batch 2
|
|
// to detect the group boundary, even though we stop after outputting group 1.
|
|
run_test(
|
|
2003,
|
|
input_ranged_data,
|
|
schema.clone(),
|
|
SortOptions {
|
|
descending: true,
|
|
..Default::default()
|
|
},
|
|
Some(4),
|
|
expected_output,
|
|
Some(9), // Pull both batches to detect boundary
|
|
)
|
|
.await;
|
|
}
|
|
|
|
/// Test ascending sort with threshold-based early termination.
|
|
#[tokio::test]
|
|
async fn test_ascending_threshold_early_termination() {
|
|
let unit = TimeUnit::Millisecond;
|
|
let schema = Arc::new(Schema::new(vec![Field::new(
|
|
"ts",
|
|
DataType::Timestamp(unit, None),
|
|
false,
|
|
)]));
|
|
|
|
// For ascending: primary_end is start, ranges sorted by (start ASC, end ASC)
|
|
// Group 1 (start=10) has 6 rows
|
|
// Group 2 (start=20) has 3 rows - should NOT be processed because
|
|
// threshold (13) < next_primary_end (20)
|
|
let input_ranged_data = vec![
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(10, unit.into()),
|
|
end: Timestamp::new(50, unit.into()),
|
|
num_rows: 6,
|
|
identifier: 0,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![10, 11, 12, 13, 14, 15])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(20, unit.into()),
|
|
end: Timestamp::new(60, unit.into()),
|
|
num_rows: 3,
|
|
identifier: 1,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![25, 30, 35])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
// still read this batch to detect group boundary(?)
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(60, unit.into()),
|
|
end: Timestamp::new(70, unit.into()),
|
|
num_rows: 2,
|
|
identifier: 1,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![60, 61])])
|
|
.unwrap(),
|
|
],
|
|
),
|
|
// after boundary detected, this following one should not be read
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(61, unit.into()),
|
|
end: Timestamp::new(70, unit.into()),
|
|
num_rows: 2,
|
|
identifier: 1,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![71, 72])])
|
|
.unwrap(),
|
|
],
|
|
),
|
|
];
|
|
|
|
// With limit=4, ascending: top 4 (smallest) from group 1 are [10, 11, 12, 13]
|
|
// Threshold is 13 (largest in top-k), next group's primary_end is 20
|
|
// Since 13 < 20, we stop after group 1 (no value in group 2 can be < 13)
|
|
let expected_output = Some(
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![10, 11, 12, 13])],
|
|
)
|
|
.unwrap(),
|
|
);
|
|
|
|
run_test(
|
|
2004,
|
|
input_ranged_data,
|
|
schema.clone(),
|
|
SortOptions {
|
|
descending: false,
|
|
..Default::default()
|
|
},
|
|
Some(4),
|
|
expected_output,
|
|
Some(11), // Pull first two batches to detect boundary
|
|
)
|
|
.await;
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_ascending_threshold_early_termination_case_two() {
|
|
let unit = TimeUnit::Millisecond;
|
|
let schema = Arc::new(Schema::new(vec![Field::new(
|
|
"ts",
|
|
DataType::Timestamp(unit, None),
|
|
false,
|
|
)]));
|
|
|
|
// For ascending: primary_end is start, ranges sorted by (start ASC, end ASC)
|
|
// Group 1 (start=0) has 4 rows, Group 2 (start=4) has 1 row, Group 3 (start=5) has 4 rows
|
|
// After reading all data: [9,10,11,12, 21, 5,6,7,8]
|
|
// Sorted ascending: [5,6,7,8, 9,10,11,12, 21]
|
|
// With limit=4, output should be smallest 4: [5,6,7,8]
|
|
// Algorithm continues reading until start=42 > threshold=8, confirming no smaller values exist
|
|
let input_ranged_data = vec![
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(0, unit.into()),
|
|
end: Timestamp::new(20, unit.into()),
|
|
num_rows: 4,
|
|
identifier: 0,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![9, 10, 11, 12])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(4, unit.into()),
|
|
end: Timestamp::new(25, unit.into()),
|
|
num_rows: 1,
|
|
identifier: 1,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![21])])
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(5, unit.into()),
|
|
end: Timestamp::new(25, unit.into()),
|
|
num_rows: 4,
|
|
identifier: 1,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![5, 6, 7, 8])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
// This still will be read to detect boundary, but should not contribute to output
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(42, unit.into()),
|
|
end: Timestamp::new(52, unit.into()),
|
|
num_rows: 2,
|
|
identifier: 1,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![42, 51])])
|
|
.unwrap(),
|
|
],
|
|
),
|
|
// This following one should not be read after boundary detected
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(48, unit.into()),
|
|
end: Timestamp::new(53, unit.into()),
|
|
num_rows: 2,
|
|
identifier: 1,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![48, 51])])
|
|
.unwrap(),
|
|
],
|
|
),
|
|
];
|
|
|
|
// With limit=4, ascending: after processing all ranges, smallest 4 are [5, 6, 7, 8]
|
|
// Threshold is 8 (4th smallest value), algorithm reads until start=42 > threshold=8
|
|
let expected_output = Some(
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![5, 6, 7, 8])])
|
|
.unwrap(),
|
|
);
|
|
|
|
run_test(
|
|
2005,
|
|
input_ranged_data,
|
|
schema.clone(),
|
|
SortOptions {
|
|
descending: false,
|
|
..Default::default()
|
|
},
|
|
Some(4),
|
|
expected_output,
|
|
Some(11), // Read first 4 ranges to confirm threshold boundary
|
|
)
|
|
.await;
|
|
}
|
|
|
|
/// Test early stop behavior with null values in sort column.
|
|
/// Verifies that nulls are handled correctly based on nulls_first option.
|
|
#[tokio::test]
|
|
async fn test_early_stop_with_nulls() {
|
|
let unit = TimeUnit::Millisecond;
|
|
let schema = Arc::new(Schema::new(vec![Field::new(
|
|
"ts",
|
|
DataType::Timestamp(unit, None),
|
|
true, // nullable
|
|
)]));
|
|
|
|
// Helper function to create nullable timestamp array
|
|
let new_nullable_ts_array = |unit: TimeUnit, arr: Vec<Option<i64>>| -> ArrayRef {
|
|
match unit {
|
|
TimeUnit::Second => Arc::new(TimestampSecondArray::from(arr)) as ArrayRef,
|
|
TimeUnit::Millisecond => Arc::new(TimestampMillisecondArray::from(arr)) as ArrayRef,
|
|
TimeUnit::Microsecond => Arc::new(TimestampMicrosecondArray::from(arr)) as ArrayRef,
|
|
TimeUnit::Nanosecond => Arc::new(TimestampNanosecondArray::from(arr)) as ArrayRef,
|
|
}
|
|
};
|
|
|
|
// Test case 1: nulls_first=true, null values should appear first
|
|
// Group 1 (end=100): [null, null, 99, 98, 97] -> with limit=3, top 3 are [null, null, 99]
|
|
// Threshold is 99, next group end=90, since 99 >= 90, we should stop early
|
|
let input_ranged_data = vec![
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(70, unit.into()),
|
|
end: Timestamp::new(100, unit.into()),
|
|
num_rows: 5,
|
|
identifier: 0,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_nullable_ts_array(
|
|
unit,
|
|
vec![Some(99), Some(98), None, Some(97), None],
|
|
)],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(50, unit.into()),
|
|
end: Timestamp::new(90, unit.into()),
|
|
num_rows: 3,
|
|
identifier: 1,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_nullable_ts_array(
|
|
unit,
|
|
vec![Some(89), Some(88), Some(87)],
|
|
)],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
];
|
|
|
|
// With nulls_first=true, nulls sort before all values
|
|
// For descending, order is: null, null, 99, 98, 97
|
|
// With limit=3, we get: null, null, 99
|
|
let expected_output = Some(
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_nullable_ts_array(unit, vec![None, None, Some(99)])],
|
|
)
|
|
.unwrap(),
|
|
);
|
|
|
|
run_test(
|
|
3000,
|
|
input_ranged_data,
|
|
schema.clone(),
|
|
SortOptions {
|
|
descending: true,
|
|
nulls_first: true,
|
|
},
|
|
Some(3),
|
|
expected_output,
|
|
Some(8), // Must read both batches to detect group boundary
|
|
)
|
|
.await;
|
|
|
|
// Test case 2: nulls_last=true, null values should appear last
|
|
// Group 1 (end=100): [99, 98, 97, null, null] -> with limit=3, top 3 are [99, 98, 97]
|
|
// Threshold is 97, next group end=90, since 97 >= 90, we should stop early
|
|
let input_ranged_data = vec![
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(70, unit.into()),
|
|
end: Timestamp::new(100, unit.into()),
|
|
num_rows: 5,
|
|
identifier: 0,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_nullable_ts_array(
|
|
unit,
|
|
vec![Some(99), Some(98), Some(97), None, None],
|
|
)],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(50, unit.into()),
|
|
end: Timestamp::new(90, unit.into()),
|
|
num_rows: 3,
|
|
identifier: 1,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_nullable_ts_array(
|
|
unit,
|
|
vec![Some(89), Some(88), Some(87)],
|
|
)],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
];
|
|
|
|
// With nulls_last=false (equivalent to nulls_first=false), values sort before nulls
|
|
// For descending, order is: 99, 98, 97, null, null
|
|
// With limit=3, we get: 99, 98, 97
|
|
let expected_output = Some(
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_nullable_ts_array(
|
|
unit,
|
|
vec![Some(99), Some(98), Some(97)],
|
|
)],
|
|
)
|
|
.unwrap(),
|
|
);
|
|
|
|
run_test(
|
|
3001,
|
|
input_ranged_data,
|
|
schema.clone(),
|
|
SortOptions {
|
|
descending: true,
|
|
nulls_first: false,
|
|
},
|
|
Some(3),
|
|
expected_output,
|
|
Some(8), // Must read both batches to detect group boundary
|
|
)
|
|
.await;
|
|
}
|
|
|
|
/// Test early stop behavior when there's only one group (no next group).
|
|
/// In this case, can_stop_early should return false and we should process all data.
|
|
#[tokio::test]
|
|
async fn test_early_stop_single_group() {
|
|
let unit = TimeUnit::Millisecond;
|
|
let schema = Arc::new(Schema::new(vec![Field::new(
|
|
"ts",
|
|
DataType::Timestamp(unit, None),
|
|
false,
|
|
)]));
|
|
|
|
// Only one group (all ranges have the same end), no next group to compare against
|
|
let input_ranged_data = vec![
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(70, unit.into()),
|
|
end: Timestamp::new(100, unit.into()),
|
|
num_rows: 6,
|
|
identifier: 0,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![94, 95, 96, 97, 98, 99])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(50, unit.into()),
|
|
end: Timestamp::new(100, unit.into()),
|
|
num_rows: 3,
|
|
identifier: 1,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![85, 86, 87])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
];
|
|
|
|
// Even though we have enough data in first range, we must process all
|
|
// because there's no next group to compare threshold against
|
|
let expected_output = Some(
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![99, 98, 97, 96])],
|
|
)
|
|
.unwrap(),
|
|
);
|
|
|
|
run_test(
|
|
3002,
|
|
input_ranged_data,
|
|
schema.clone(),
|
|
SortOptions {
|
|
descending: true,
|
|
..Default::default()
|
|
},
|
|
Some(4),
|
|
expected_output,
|
|
Some(9), // Must read all batches since no early stop is possible
|
|
)
|
|
.await;
|
|
}
|
|
|
|
/// Test early stop behavior when threshold exactly equals next group's boundary.
|
|
#[tokio::test]
|
|
async fn test_early_stop_exact_boundary_equality() {
|
|
let unit = TimeUnit::Millisecond;
|
|
let schema = Arc::new(Schema::new(vec![Field::new(
|
|
"ts",
|
|
DataType::Timestamp(unit, None),
|
|
false,
|
|
)]));
|
|
|
|
// Test case 1: Descending sort, threshold == next_group_end
|
|
// Group 1 (end=100): data up to 90, threshold = 90, next_group_end = 90
|
|
// Since 90 >= 90, we should stop early
|
|
let input_ranged_data = vec![
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(70, unit.into()),
|
|
end: Timestamp::new(100, unit.into()),
|
|
num_rows: 4,
|
|
identifier: 0,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![92, 91, 90, 89])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(50, unit.into()),
|
|
end: Timestamp::new(90, unit.into()),
|
|
num_rows: 3,
|
|
identifier: 1,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![88, 87, 86])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
];
|
|
|
|
let expected_output = Some(
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![92, 91, 90])])
|
|
.unwrap(),
|
|
);
|
|
|
|
run_test(
|
|
3003,
|
|
input_ranged_data,
|
|
schema.clone(),
|
|
SortOptions {
|
|
descending: true,
|
|
..Default::default()
|
|
},
|
|
Some(3),
|
|
expected_output,
|
|
Some(7), // Must read both batches to detect boundary
|
|
)
|
|
.await;
|
|
|
|
// Test case 2: Ascending sort, threshold == next_group_start
|
|
// Group 1 (start=10): data from 10, threshold = 20, next_group_start = 20
|
|
// Since 20 < 20 is false, we should continue
|
|
let input_ranged_data = vec![
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(10, unit.into()),
|
|
end: Timestamp::new(50, unit.into()),
|
|
num_rows: 4,
|
|
identifier: 0,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![10, 15, 20, 25])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(20, unit.into()),
|
|
end: Timestamp::new(60, unit.into()),
|
|
num_rows: 3,
|
|
identifier: 1,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![21, 22, 23])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
];
|
|
|
|
let expected_output = Some(
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![10, 15, 20])])
|
|
.unwrap(),
|
|
);
|
|
|
|
run_test(
|
|
3004,
|
|
input_ranged_data,
|
|
schema.clone(),
|
|
SortOptions {
|
|
descending: false,
|
|
..Default::default()
|
|
},
|
|
Some(3),
|
|
expected_output,
|
|
Some(7), // Must read both batches since 20 is not < 20
|
|
)
|
|
.await;
|
|
}
|
|
|
|
/// Test early stop behavior with empty partition groups.
|
|
#[tokio::test]
|
|
async fn test_early_stop_with_empty_partitions() {
|
|
let unit = TimeUnit::Millisecond;
|
|
let schema = Arc::new(Schema::new(vec![Field::new(
|
|
"ts",
|
|
DataType::Timestamp(unit, None),
|
|
false,
|
|
)]));
|
|
|
|
// Test case 1: First group is empty, second group has data
|
|
let input_ranged_data = vec![
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(70, unit.into()),
|
|
end: Timestamp::new(100, unit.into()),
|
|
num_rows: 0,
|
|
identifier: 0,
|
|
},
|
|
vec![
|
|
// Empty batch for first range
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![])])
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(50, unit.into()),
|
|
end: Timestamp::new(100, unit.into()),
|
|
num_rows: 0,
|
|
identifier: 1,
|
|
},
|
|
vec![
|
|
// Empty batch for second range
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![])])
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(30, unit.into()),
|
|
end: Timestamp::new(80, unit.into()),
|
|
num_rows: 4,
|
|
identifier: 2,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![74, 75, 76, 77])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(10, unit.into()),
|
|
end: Timestamp::new(60, unit.into()),
|
|
num_rows: 3,
|
|
identifier: 3,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![58, 59, 60])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
];
|
|
|
|
// Group 1 (end=100) is empty, Group 2 (end=80) has data
|
|
// Should continue to Group 2 since Group 1 has no data
|
|
let expected_output = Some(
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![77, 76])]).unwrap(),
|
|
);
|
|
|
|
run_test(
|
|
3005,
|
|
input_ranged_data,
|
|
schema.clone(),
|
|
SortOptions {
|
|
descending: true,
|
|
..Default::default()
|
|
},
|
|
Some(2),
|
|
expected_output,
|
|
Some(7), // Must read until finding actual data
|
|
)
|
|
.await;
|
|
|
|
// Test case 2: Empty partitions between data groups
|
|
let input_ranged_data = vec![
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(70, unit.into()),
|
|
end: Timestamp::new(100, unit.into()),
|
|
num_rows: 4,
|
|
identifier: 0,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![96, 97, 98, 99])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(50, unit.into()),
|
|
end: Timestamp::new(90, unit.into()),
|
|
num_rows: 0,
|
|
identifier: 1,
|
|
},
|
|
vec![
|
|
// Empty range - should be skipped
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![])])
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(30, unit.into()),
|
|
end: Timestamp::new(70, unit.into()),
|
|
num_rows: 0,
|
|
identifier: 2,
|
|
},
|
|
vec![
|
|
// Another empty range
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![])])
|
|
.unwrap(),
|
|
],
|
|
),
|
|
(
|
|
PartitionRange {
|
|
start: Timestamp::new(10, unit.into()),
|
|
end: Timestamp::new(50, unit.into()),
|
|
num_rows: 3,
|
|
identifier: 3,
|
|
},
|
|
vec![
|
|
DfRecordBatch::try_new(
|
|
schema.clone(),
|
|
vec![new_ts_array(unit, vec![48, 49, 50])],
|
|
)
|
|
.unwrap(),
|
|
],
|
|
),
|
|
];
|
|
|
|
// With limit=2 from group 1: [99, 98], threshold=98, next group end=50
|
|
// Since 98 >= 50, we should stop early
|
|
let expected_output = Some(
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![99, 98])]).unwrap(),
|
|
);
|
|
|
|
run_test(
|
|
3006,
|
|
input_ranged_data,
|
|
schema.clone(),
|
|
SortOptions {
|
|
descending: true,
|
|
..Default::default()
|
|
},
|
|
Some(2),
|
|
expected_output,
|
|
Some(7), // Must read to detect early stop condition
|
|
)
|
|
.await;
|
|
}
|
|
|
|
/// First group: [0,20), data: [0, 5, 15]
|
|
/// Second group: [10, 30), data: [21, 25, 29]
|
|
/// after first group, calling early stop manually, and check if filter is updated
|
|
#[tokio::test]
|
|
async fn test_early_stop_check_update_dyn_filter() {
|
|
let unit = TimeUnit::Millisecond;
|
|
let schema = Arc::new(Schema::new(vec![Field::new(
|
|
"ts",
|
|
DataType::Timestamp(unit, None),
|
|
false,
|
|
)]));
|
|
|
|
let mock_input = Arc::new(MockInputExec::new(vec![vec![]], schema.clone()));
|
|
let exec = PartSortExec::try_new(
|
|
PhysicalSortExpr {
|
|
expr: Arc::new(Column::new("ts", 0)),
|
|
options: SortOptions {
|
|
descending: false,
|
|
..Default::default()
|
|
},
|
|
},
|
|
Some(3),
|
|
vec![vec![
|
|
PartitionRange {
|
|
start: Timestamp::new(0, unit.into()),
|
|
end: Timestamp::new(20, unit.into()),
|
|
num_rows: 3,
|
|
identifier: 1,
|
|
},
|
|
PartitionRange {
|
|
start: Timestamp::new(10, unit.into()),
|
|
end: Timestamp::new(30, unit.into()),
|
|
num_rows: 3,
|
|
identifier: 1,
|
|
},
|
|
]],
|
|
mock_input.clone(),
|
|
)
|
|
.unwrap();
|
|
|
|
let filter = exec.filter.clone().unwrap();
|
|
let input_stream = mock_input
|
|
.execute(0, Arc::new(TaskContext::default()))
|
|
.unwrap();
|
|
let mut stream = PartSortStream::new(
|
|
Arc::new(TaskContext::default()),
|
|
&exec,
|
|
Some(3),
|
|
input_stream,
|
|
vec![],
|
|
0,
|
|
Some(filter.clone()),
|
|
)
|
|
.unwrap();
|
|
|
|
// initially, snapshot_generation is 1
|
|
assert_eq!(filter.read().expr().snapshot_generation(), 1);
|
|
let batch =
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![0, 5, 15])])
|
|
.unwrap();
|
|
stream.push_buffer(batch).unwrap();
|
|
|
|
// after pushing first batch, snapshot_generation is updated to 2
|
|
assert_eq!(filter.read().expr().snapshot_generation(), 2);
|
|
assert!(!stream.can_stop_early(&schema).unwrap());
|
|
// still two as not updated
|
|
assert_eq!(filter.read().expr().snapshot_generation(), 2);
|
|
|
|
let _ = stream.sort_top_buffer().unwrap();
|
|
|
|
let batch =
|
|
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![21, 25, 29])])
|
|
.unwrap();
|
|
stream.push_buffer(batch).unwrap();
|
|
// still two as not updated
|
|
assert_eq!(filter.read().expr().snapshot_generation(), 2);
|
|
let new = stream.sort_top_buffer().unwrap();
|
|
// still two as not updated
|
|
assert_eq!(filter.read().expr().snapshot_generation(), 2);
|
|
|
|
// dyn filter kick in, and filter out all rows >= 15(the filter is rows<15)
|
|
assert_eq!(new.num_rows(), 0)
|
|
}
|
|
}
|