feat: optimizer rule for windowed sort (#4874)

* basic impl

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* implement physical rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* feat: install windowed sort physical rule and optimize partition ranges

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add logs and sqlness test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* feat: introduce PartSortExec for partitioned sorting

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* tune exec nodes' properties and metrics

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* clean up

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix typo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* debug: add more info on very wrong results

* debug: also print overlapping ranges

* feat: add check when emit PartSort Stream

* dbg: info on overlapping working range

* feat: check batch range is inside part range

* set distinguish partition range param

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* chore: more logs

* update sqlness

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* tune optimizer

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* clean up

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix lints

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix windowed sort rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix: early terminate sort stream

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* chore: remove min/max check

* chore: remove unused windowed_sort module, uuid feature and refactor region_scanner to synchronous

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* chore: print more fuzz log

* chore: more log

* fix: part sort should skip empty part

* chore: remove insert logs

* tests: empty PartitionRange

* refactor: testcase

* docs: update comments & tests for the all-empty case

* ci: enlarge etcd cpu limit

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: discord9 <discord9@163.com>
Co-authored-by: evenyag <realevenyag@gmail.com>
Author: Ruihang Xia
Date: 2024-10-29 15:46:05 +08:00 (committed by GitHub)
Parent: 0ee455a980
Commit: 03f2fa219d
21 changed files with 930 additions and 230 deletions


@@ -18,7 +18,7 @@ runs:
--set replicaCount=${{ inputs.etcd-replicas }} \
--set resources.requests.cpu=50m \
--set resources.requests.memory=128Mi \
--set resources.limits.cpu=1000m \
--set resources.limits.cpu=1500m \
--set resources.limits.memory=2Gi \
--set auth.rbac.create=false \
--set auth.rbac.token.enabled=false \

Cargo.lock (generated)

@@ -9030,6 +9030,7 @@ dependencies = [
"table",
"tokio",
"tokio-stream",
"uuid",
]
[[package]]


@@ -163,13 +163,13 @@ impl MitoEngine {
}
/// Returns a region scanner to scan the region for `request`.
async fn region_scanner(
fn region_scanner(
&self,
region_id: RegionId,
request: ScanRequest,
) -> Result<RegionScannerRef> {
let scanner = self.scanner(region_id, request)?;
scanner.region_scanner().await
scanner.region_scanner()
}
/// Scans a region.
@@ -527,7 +527,6 @@ impl RegionEngine for MitoEngine {
request: ScanRequest,
) -> Result<RegionScannerRef, BoxedError> {
self.region_scanner(region_id, request)
.await
.map_err(BoxedError::new)
}


@@ -532,10 +532,24 @@ impl Batch {
#[derive(Default)]
pub(crate) struct BatchChecker {
last_batch: Option<Batch>,
start: Option<Timestamp>,
end: Option<Timestamp>,
}
#[cfg(debug_assertions)]
impl BatchChecker {
/// Attaches the given start timestamp to the checker.
pub(crate) fn with_start(mut self, start: Option<Timestamp>) -> Self {
self.start = start;
self
}
/// Attaches the given end timestamp to the checker.
pub(crate) fn with_end(mut self, end: Option<Timestamp>) -> Self {
self.end = end;
self
}
/// Returns true if the given batch is monotonic and behind
/// the last batch.
pub(crate) fn check_monotonic(&mut self, batch: &Batch) -> bool {
@@ -543,6 +557,17 @@ impl BatchChecker {
return false;
}
if let (Some(start), Some(first)) = (self.start, batch.first_timestamp()) {
if start > first {
return false;
}
}
if let (Some(end), Some(last)) = (self.end, batch.last_timestamp()) {
if end <= last {
return false;
}
}
// Checks the batch is behind the last batch.
// Then updates the last batch.
let is_behind = self

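The new `start`/`end` fields reduce the debug check to a half-open interval test. A minimal standalone sketch with plain i64 timestamps (`RangeChecker` is a hypothetical stand-in, not the crate's type):

struct RangeChecker {
    start: Option<i64>, // inclusive lower bound
    end: Option<i64>,   // exclusive upper bound
}

impl RangeChecker {
    /// Returns true if [first, last] lies inside [start, end).
    fn check(&self, first: i64, last: i64) -> bool {
        if let Some(start) = self.start {
            if first < start {
                return false;
            }
        }
        if let Some(end) = self.end {
            if last >= end {
                return false;
            }
        }
        true
    }
}

fn main() {
    let checker = RangeChecker { start: Some(0), end: Some(10) };
    assert!(checker.check(0, 9)); // inside [0, 10)
    assert!(!checker.check(0, 10)); // the end bound is exclusive
}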

@@ -74,7 +74,7 @@ impl Scanner {
/// Returns a [RegionScanner] to scan the region.
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all)]
pub(crate) async fn region_scanner(self) -> Result<RegionScannerRef> {
pub(crate) fn region_scanner(self) -> Result<RegionScannerRef> {
match self {
Scanner::Seq(seq_scan) => Ok(Box::new(seq_scan)),
Scanner::Unordered(unordered_scan) => Ok(Box::new(unordered_scan)),


@@ -232,7 +232,10 @@ impl SeqScan {
let mut metrics = ScannerMetrics::default();
let mut fetch_start = Instant::now();
#[cfg(debug_assertions)]
let mut checker = crate::read::BatchChecker::default();
let mut checker = crate::read::BatchChecker::default()
.with_start(Some(part_range.start))
.with_end(Some(part_range.end));
while let Some(batch) = reader
.next_batch()
.await
@@ -304,8 +307,13 @@ impl RegionScanner for SeqScan {
self.scan_partition_impl(partition)
}
fn prepare(&mut self, ranges: Vec<Vec<PartitionRange>>) -> Result<(), BoxedError> {
fn prepare(
&mut self,
ranges: Vec<Vec<PartitionRange>>,
distinguish_partition_range: bool,
) -> Result<(), BoxedError> {
self.properties.partitions = ranges;
self.properties.distinguish_partition_range = distinguish_partition_range;
Ok(())
}


@@ -140,7 +140,9 @@ impl UnorderedScan {
let mut metrics = ScannerMetrics::default();
let mut fetch_start = Instant::now();
#[cfg(debug_assertions)]
let mut checker = crate::read::BatchChecker::default();
let mut checker = crate::read::BatchChecker::default()
.with_start(Some(part_range.start))
.with_end(Some(part_range.end));
let stream = Self::scan_partition_range(
stream_ctx.clone(),
@@ -209,8 +211,13 @@ impl RegionScanner for UnorderedScan {
self.stream_ctx.input.mapper.output_schema()
}
fn prepare(&mut self, ranges: Vec<Vec<PartitionRange>>) -> Result<(), BoxedError> {
fn prepare(
&mut self,
ranges: Vec<Vec<PartitionRange>>,
distinguish_partition_range: bool,
) -> Result<(), BoxedError> {
self.properties.partitions = ranges;
self.properties.distinguish_partition_range = distinguish_partition_range;
Ok(())
}


@@ -64,6 +64,7 @@ store-api.workspace = true
substrait.workspace = true
table.workspace = true
tokio.workspace = true
uuid.workspace = true
[dev-dependencies]
approx_eq = "0.1"


@@ -15,6 +15,7 @@
#![feature(let_chains)]
#![feature(int_roundings)]
#![feature(trait_upcasting)]
#![feature(try_blocks)]
mod analyze;
pub mod dataframe;


@@ -20,6 +20,7 @@ pub mod string_normalization;
#[cfg(test)]
pub(crate) mod test_util;
pub mod type_conversion;
pub mod windowed_sort;
use datafusion_common::config::ConfigOptions;
use datafusion_common::Result;


@@ -51,16 +51,27 @@ impl ParallelizeScan {
let result = plan
.transform_down(|plan| {
if let Some(region_scan_exec) = plan.as_any().downcast_ref::<RegionScanExec>() {
if region_scan_exec.is_partition_set() {
return Ok(Transformed::no(plan));
}
let ranges = region_scan_exec.get_partition_ranges();
let total_range_num = ranges.len();
let expected_partition_num = config.execution.target_partitions;
// assign ranges to each partition
let partition_ranges =
let mut partition_ranges =
Self::assign_partition_range(ranges, expected_partition_num);
debug!(
"Assign {total_range_num} ranges to {expected_partition_num} partitions"
);
// sort the ranges in each partition
// TODO(ruihang): smart sort!
for ranges in partition_ranges.iter_mut() {
ranges.sort_by(|a, b| a.start.cmp(&b.start));
}
// update the partition ranges
let new_exec = region_scan_exec
.with_new_partitions(partition_ranges)

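The assign-then-sort step above, restated as a runnable sketch. `Range` stands in for `PartitionRange`, and the round-robin assignment is an illustrative assumption; only the per-partition sort key matches the rule:

#[derive(Debug, Clone, Copy)]
struct Range {
    start: i64,
    end: i64,
}

fn assign_and_sort(ranges: Vec<Range>, partitions: usize) -> Vec<Vec<Range>> {
    let n = partitions.max(1);
    let mut out = vec![Vec::new(); n];
    // assign ranges to partitions (round-robin here, for illustration)
    for (i, r) in ranges.into_iter().enumerate() {
        out[i % n].push(r);
    }
    // sort the ranges in each partition by their start key, as the rule does
    for p in out.iter_mut() {
        p.sort_by(|a, b| a.start.cmp(&b.start));
    }
    out
}

fn main() {
    let ranges = vec![
        Range { start: 10, end: 20 },
        Range { start: 0, end: 10 },
        Range { start: 20, end: 30 },
    ];
    for (i, p) in assign_and_sort(ranges, 2).iter().enumerate() {
        println!("partition {i}: {p:?}");
    }
}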

@@ -0,0 +1,158 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use datafusion::physical_optimizer::PhysicalOptimizerRule;
use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec;
use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;
use datafusion::physical_plan::repartition::RepartitionExec;
use datafusion::physical_plan::sorts::sort::SortExec;
use datafusion::physical_plan::ExecutionPlan;
use datafusion_common::tree_node::{Transformed, TreeNode};
use datafusion_common::Result as DataFusionResult;
use datafusion_physical_expr::expressions::Column as PhysicalColumn;
use store_api::region_engine::PartitionRange;
use table::table::scan::RegionScanExec;
use crate::part_sort::PartSortExec;
use crate::window_sort::WindowedSortExec;
/// Optimize rule for windowed sort.
///
/// This is expected to run after [`ScanHint`] and [`ParallelizeScan`].
/// It replaces the original sort with a custom plan. To make sure
/// other rules are applied correctly, this rule should run as late as
/// possible.
///
/// [`ScanHint`]: crate::optimizer::scan_hint::ScanHintRule
/// [`ParallelizeScan`]: crate::optimizer::parallelize_scan::ParallelizeScan
pub struct WindowedSortPhysicalRule;
impl PhysicalOptimizerRule for WindowedSortPhysicalRule {
fn optimize(
&self,
plan: Arc<dyn ExecutionPlan>,
config: &datafusion::config::ConfigOptions,
) -> DataFusionResult<Arc<dyn ExecutionPlan>> {
Self::do_optimize(plan, config)
}
fn name(&self) -> &str {
"WindowedSortRule"
}
fn schema_check(&self) -> bool {
false
}
}
impl WindowedSortPhysicalRule {
fn do_optimize(
plan: Arc<dyn ExecutionPlan>,
_config: &datafusion::config::ConfigOptions,
) -> DataFusionResult<Arc<dyn ExecutionPlan>> {
let result = plan
.transform_down(|plan| {
if let Some(sort_exec) = plan.as_any().downcast_ref::<SortExec>() {
// TODO: support multiple expr in windowed sort
if !sort_exec.preserve_partitioning() || sort_exec.expr().len() != 1 {
return Ok(Transformed::no(plan));
}
let Some(scanner_info) = fetch_partition_range(sort_exec.input().clone())?
else {
return Ok(Transformed::no(plan));
};
if let Some(first_sort_expr) = sort_exec.expr().first()
&& !first_sort_expr.options.descending
&& let Some(column_expr) = first_sort_expr
.expr
.as_any()
.downcast_ref::<PhysicalColumn>()
&& column_expr.name() == scanner_info.time_index
{
} else {
return Ok(Transformed::no(plan));
}
let first_sort_expr = sort_exec.expr().first().unwrap().clone();
let part_sort_exec = Arc::new(PartSortExec::new(
first_sort_expr.clone(),
scanner_info.partition_ranges.clone(),
sort_exec.input().clone(),
));
let windowed_sort_exec = WindowedSortExec::try_new(
first_sort_expr,
sort_exec.fetch(),
scanner_info.partition_ranges,
part_sort_exec,
)?;
return Ok(Transformed {
data: Arc::new(windowed_sort_exec),
transformed: true,
tnr: datafusion_common::tree_node::TreeNodeRecursion::Stop,
});
}
Ok(Transformed::no(plan))
})?
.data;
Ok(result)
}
}
struct ScannerInfo {
partition_ranges: Vec<Vec<PartitionRange>>,
time_index: String,
}
fn fetch_partition_range(input: Arc<dyn ExecutionPlan>) -> DataFusionResult<Option<ScannerInfo>> {
let mut partition_ranges = None;
let mut time_index = None;
input.transform_up(|plan| {
// Inapplicable case, reset the state.
if plan.as_any().is::<RepartitionExec>()
|| plan.as_any().is::<CoalesceBatchesExec>()
|| plan.as_any().is::<CoalescePartitionsExec>()
|| plan.as_any().is::<SortExec>()
|| plan.as_any().is::<WindowedSortExec>()
{
partition_ranges = None;
}
if let Some(region_scan_exec) = plan.as_any().downcast_ref::<RegionScanExec>() {
partition_ranges = Some(region_scan_exec.get_uncollapsed_partition_ranges());
time_index = region_scan_exec.time_index();
// set distinguish_partition_range to true; this is a workaround and not strictly correct
region_scan_exec.with_distinguish_partition_range(true);
}
Ok(Transformed::no(plan))
})?;
let result = try {
ScannerInfo {
partition_ranges: partition_ranges?,
time_index: time_index?,
}
};
Ok(result)
}
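The `try` block above uses the nightly `try_blocks` feature enabled earlier in this diff. A stable-Rust sketch of the equivalent Option-combining logic (field types simplified to stand-ins):

struct ScannerInfo {
    partition_ranges: Vec<Vec<(i64, i64)>>, // stand-in for Vec<Vec<PartitionRange>>
    time_index: String,
}

// Both pieces of state must be present for the rule to apply.
fn combine(
    partition_ranges: Option<Vec<Vec<(i64, i64)>>>,
    time_index: Option<String>,
) -> Option<ScannerInfo> {
    Some(ScannerInfo {
        partition_ranges: partition_ranges?,
        time_index: time_index?,
    })
}

fn main() {
    assert!(combine(Some(vec![]), None).is_none()); // no time index -> no rewrite
    assert!(combine(Some(vec![]), Some("ts".to_string())).is_some());
}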

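In plan terms, the rule replaces a qualifying ascending `SortExec` on the time index with the two new operators. An illustrative before/after shape, consistent with the sqlness `EXPLAIN ANALYZE` output near the end of this diff (surrounding nodes elided):

before:
    SortExec: expr=[t ASC], preserve_partitioning=true
      SeqScan: ...

after:
    WindowedSortExec
      PartSortExec: t ASC
        SeqScan: ... (with distinguish_partition_range enabled)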

@@ -17,6 +17,7 @@ use std::pin::Pin;
use std::sync::Arc;
use std::task::{Context, Poll};
use arrow::array::ArrayRef;
use arrow::compute::{concat, take_record_batch};
use arrow_schema::SchemaRef;
use common_recordbatch::{DfRecordBatch, DfSendableRecordBatchStream};
@@ -24,7 +25,7 @@ use datafusion::common::arrow::compute::sort_to_indices;
use datafusion::execution::memory_pool::{MemoryConsumer, MemoryReservation};
use datafusion::execution::{RecordBatchStream, TaskContext};
use datafusion::physical_plan::coalesce_batches::concat_batches;
use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet};
use datafusion::physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
use datafusion::physical_plan::{
DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties,
};
@@ -33,8 +34,9 @@ use datafusion_physical_expr::PhysicalSortExpr;
use futures::Stream;
use itertools::Itertools;
use snafu::location;
use store_api::region_engine::PartitionRange;
use crate::error::Result;
use crate::downcast_ts_array;
/// Sort input within given PartitionRange
///
@@ -48,11 +50,16 @@ pub struct PartSortExec {
input: Arc<dyn ExecutionPlan>,
/// Execution metrics
metrics: ExecutionPlanMetricsSet,
partition_ranges: Vec<Vec<PartitionRange>>,
properties: PlanProperties,
}
impl PartSortExec {
pub fn try_new(expression: PhysicalSortExpr, input: Arc<dyn ExecutionPlan>) -> Result<Self> {
pub fn new(
expression: PhysicalSortExpr,
partition_ranges: Vec<Vec<PartitionRange>>,
input: Arc<dyn ExecutionPlan>,
) -> Self {
let metrics = ExecutionPlanMetricsSet::new();
let properties = PlanProperties::new(
input.equivalence_properties().clone(),
@@ -60,12 +67,13 @@ impl PartSortExec {
input.execution_mode(),
);
Ok(Self {
Self {
expression,
input,
metrics,
partition_ranges,
properties,
})
}
}
pub fn to_stream(
@@ -76,7 +84,21 @@ impl PartSortExec {
let input_stream: DfSendableRecordBatchStream =
self.input.execute(partition, context.clone())?;
let df_stream = Box::pin(PartSortStream::new(context, self, input_stream)) as _;
if partition >= self.partition_ranges.len() {
internal_err!(
"Partition index out of range: {} >= {}",
partition,
self.partition_ranges.len()
)?;
}
let df_stream = Box::pin(PartSortStream::new(
context,
self,
input_stream,
self.partition_ranges[partition].clone(),
partition,
)) as _;
Ok(df_stream)
}
@@ -114,10 +136,11 @@ impl ExecutionPlan for PartSortExec {
} else {
internal_err!("No children found")?
};
Ok(Arc::new(Self::try_new(
Ok(Arc::new(Self::new(
self.expression.clone(),
self.partition_ranges.clone(),
new_input.clone(),
)?))
)))
}
fn execute(
@@ -131,6 +154,15 @@ impl ExecutionPlan for PartSortExec {
fn metrics(&self) -> Option<MetricsSet> {
Some(self.metrics.clone_inner())
}
/// # Explain
///
/// This plan needs to be executed on each partition independently,
/// and is expected to run directly on the storage engine's output
/// distribution / partition.
fn benefits_from_input_partitioning(&self) -> Vec<bool> {
vec![false]
}
}
struct PartSortStream {
@@ -142,6 +174,10 @@ struct PartSortStream {
input: DfSendableRecordBatchStream,
input_complete: bool,
schema: SchemaRef,
partition_ranges: Vec<PartitionRange>,
partition: usize,
cur_part_idx: usize,
metrics: BaselineMetrics,
}
impl PartSortStream {
@@ -149,6 +185,8 @@ impl PartSortStream {
context: Arc<TaskContext>,
sort: &PartSortExec,
input: DfSendableRecordBatchStream,
partition_ranges: Vec<PartitionRange>,
partition: usize,
) -> Self {
Self {
reservation: MemoryConsumer::new("PartSortStream".to_string())
@@ -159,17 +197,85 @@ impl PartSortStream {
input,
input_complete: false,
schema: sort.input.schema(),
partition_ranges,
partition,
cur_part_idx: 0,
metrics: BaselineMetrics::new(&sort.metrics, partition),
}
}
}
macro_rules! array_check_helper {
($t:ty, $unit:expr, $arr:expr, $cur_range:expr, $min_max_idx:expr) => {{
if $cur_range.start.unit().as_arrow_time_unit() != $unit
|| $cur_range.end.unit().as_arrow_time_unit() != $unit
{
internal_err!(
"PartitionRange unit mismatch, expect {:?}, found {:?}",
$cur_range.start.unit(),
$unit
)?;
}
let arr = $arr
.as_any()
.downcast_ref::<arrow::array::PrimitiveArray<$t>>()
.unwrap();
let min = arr.value($min_max_idx.0);
let max = arr.value($min_max_idx.1);
let (min, max) = if min < max {
(min, max)
} else {
(max, min)
};
let cur_min = $cur_range.start.value();
let cur_max = $cur_range.end.value();
// note that PartitionRange is left inclusive and right exclusive
if !(min >= cur_min && max < cur_max) {
internal_err!(
"Sort column min/max value out of partition range: sort_column.min_max=[{:?}, {:?}] not in PartitionRange=[{:?}, {:?}]",
min,
max,
cur_min,
cur_max
)?;
}
}};
}
impl PartSortStream {
/// Checks whether the sort column's min/max values are within the partition range
fn check_in_range(
&self,
sort_column: &ArrayRef,
min_max_idx: (usize, usize),
) -> datafusion_common::Result<()> {
if self.cur_part_idx >= self.partition_ranges.len() {
internal_err!(
"Partition index out of range: {} >= {}",
self.cur_part_idx,
self.partition_ranges.len()
)?;
}
let cur_range = self.partition_ranges[self.cur_part_idx];
downcast_ts_array!(
sort_column.data_type() => (array_check_helper, sort_column, cur_range, min_max_idx),
_ => internal_err!(
"Unsupported data type for sort column: {:?}",
sort_column.data_type()
)?,
);
Ok(())
}
/// Sort and clear the buffer and return the sorted record batch
///
/// this function should return None if RecordBatch is empty
fn sort_buffer(&mut self) -> datafusion_common::Result<Option<DfRecordBatch>> {
if self.buffer.iter().map(|r| r.num_rows()).sum::<usize>() == 0 {
return Ok(None);
/// this function should return an empty record batch if the buffer is empty
fn sort_buffer(&mut self) -> datafusion_common::Result<DfRecordBatch> {
if self.buffer.is_empty() {
return Ok(DfRecordBatch::new_empty(self.schema.clone()));
}
let mut sort_columns = Vec::with_capacity(self.buffer.len());
let mut opt = None;
@@ -194,6 +300,24 @@ impl PartSortStream {
)
})?;
self.check_in_range(
&sort_column,
(
indices.value(0) as usize,
indices.value(indices.len() - 1) as usize,
),
)
.inspect_err(|_e| {
#[cfg(debug_assertions)]
common_telemetry::error!(
"Fail to check sort column in range at {}, current_idx: {}, num_rows: {}, err: {}",
self.partition,
self.cur_part_idx,
sort_column.len(),
_e
);
})?;
// reserve memory for the concat input and sorted output
let total_mem: usize = self.buffer.iter().map(|r| r.get_array_memory_size()).sum();
self.reservation.try_grow(total_mem * 2)?;
@@ -229,7 +353,7 @@ impl PartSortStream {
drop(full_input);
// here remove both buffer and full_input memory
self.reservation.shrink(2 * total_mem);
Ok(Some(sorted))
Ok(sorted)
}
pub fn poll_next_inner(
@@ -241,7 +365,7 @@ impl PartSortStream {
if self.buffer.is_empty() {
return Poll::Ready(None);
} else {
return Poll::Ready(self.sort_buffer().transpose());
return Poll::Ready(Some(self.sort_buffer()));
}
}
let res = self.input.as_mut().poll_next(cx);
@@ -249,7 +373,13 @@ impl PartSortStream {
Poll::Ready(Some(Ok(batch))) => {
if batch.num_rows() == 0 {
// mark end of current PartitionRange
return Poll::Ready(self.sort_buffer().transpose());
let sorted_batch = self.sort_buffer()?;
self.cur_part_idx += 1;
if sorted_batch.num_rows() == 0 {
// Current part is empty, continue polling next part.
continue;
}
return Poll::Ready(Some(Ok(sorted_batch)));
}
self.buffer.push(batch);
// keep polling until a boundary (an empty RecordBatch) is reached
@@ -271,10 +401,11 @@ impl Stream for PartSortStream {
type Item = datafusion_common::Result<DfRecordBatch>;
fn poll_next(
self: Pin<&mut Self>,
mut self: Pin<&mut Self>,
cx: &mut Context<'_>,
) -> Poll<Option<datafusion_common::Result<DfRecordBatch>>> {
self.poll_next_inner(cx)
let result = self.as_mut().poll_next_inner(cx);
self.metrics.record_poll(result)
}
}
@@ -290,7 +421,6 @@ mod test {
use arrow::json::ArrayWriter;
use arrow_schema::{DataType, Field, Schema, SortOptions, TimeUnit};
use common_telemetry::error;
use common_time::Timestamp;
use datafusion_physical_expr::expressions::Column;
use futures::StreamExt;
@@ -311,6 +441,8 @@ mod test {
let mut rng = fastrand::Rng::new();
rng.seed(1337);
let mut test_cases = Vec::new();
for case_id in 0..test_cnt {
let mut bound_val: Option<i64> = None;
let descending = rng.bool();
@@ -359,21 +491,23 @@ mod test {
};
assert!(start < end);
let mut sort_data = vec![];
let mut per_part_sort_data = vec![];
let mut batches = vec![];
for _batch_idx in 0..rng.usize(1..batch_cnt_bound) {
let cnt = rng.usize(0..batch_size_bound) + 2;
let iter = 0..rng.usize(1..cnt);
let cnt = rng.usize(0..batch_size_bound) + 1;
let iter = 0..rng.usize(0..cnt);
let data_gen = iter
.map(|_| rng.i64(start.value()..end.value()))
.collect_vec();
sort_data.extend(data_gen.clone());
if data_gen.is_empty() {
// current batch is empty, skip
continue;
}
per_part_sort_data.extend(data_gen.clone());
let arr = new_ts_array(unit.clone(), data_gen.clone());
let batch = DfRecordBatch::try_new(schema.clone(), vec![arr]).unwrap();
batches.push(batch);
}
assert!(batches.iter().all(|i| i.num_rows() >= 1));
let range = PartitionRange {
start,
@@ -384,12 +518,14 @@ mod test {
input_ranged_data.push((range, batches));
if descending {
sort_data.sort_by(|a, b| b.cmp(a));
per_part_sort_data.sort_by(|a, b| b.cmp(a));
} else {
sort_data.sort();
per_part_sort_data.sort();
}
output_data.push(sort_data);
if per_part_sort_data.is_empty() {
continue;
}
output_data.push(per_part_sort_data);
}
let expected_output = output_data
@@ -399,8 +535,17 @@ mod test {
.unwrap()
})
.collect_vec();
test_cases.push((
case_id,
unit,
input_ranged_data,
schema,
opt,
expected_output,
));
}
assert!(!expected_output.is_empty());
for (case_id, _unit, input_ranged_data, schema, opt, expected_output) in test_cases {
run_test(case_id, input_ranged_data, schema, opt, expected_output).await;
}
}
@@ -412,25 +557,50 @@ mod test {
TimeUnit::Millisecond,
vec![
((0, 10), vec![vec![1, 2, 3], vec![4, 5, 6], vec![7, 8, 9]]),
((5, 10), vec![vec![1, 2, 3], vec![4, 5, 6], vec![7, 8]]),
((5, 10), vec![vec![5, 6], vec![7, 8]]),
],
false,
vec![
vec![1, 2, 3, 4, 5, 6, 7, 8, 9],
vec![1, 2, 3, 4, 5, 6, 7, 8],
],
vec![vec![1, 2, 3, 4, 5, 6, 7, 8, 9], vec![5, 6, 7, 8]],
),
(
TimeUnit::Millisecond,
vec![
((5, 10), vec![vec![1, 2, 3], vec![4, 5, 6], vec![7, 8, 9]]),
((5, 10), vec![vec![5, 6], vec![7, 8, 9]]),
((0, 10), vec![vec![1, 2, 3], vec![4, 5, 6], vec![7, 8]]),
],
true,
vec![vec![9, 8, 7, 6, 5], vec![8, 7, 6, 5, 4, 3, 2, 1]],
),
(
TimeUnit::Millisecond,
vec![
vec![9, 8, 7, 6, 5, 4, 3, 2, 1],
vec![8, 7, 6, 5, 4, 3, 2, 1],
((5, 10), vec![]),
((0, 10), vec![vec![1, 2, 3], vec![4, 5, 6], vec![7, 8]]),
],
true,
vec![vec![8, 7, 6, 5, 4, 3, 2, 1]],
),
(
TimeUnit::Millisecond,
vec![
((15, 20), vec![vec![17, 18, 19]]),
((10, 15), vec![]),
((5, 10), vec![]),
((0, 10), vec![vec![1, 2, 3], vec![4, 5, 6], vec![7, 8]]),
],
true,
vec![vec![19, 18, 17], vec![8, 7, 6, 5, 4, 3, 2, 1]],
),
(
TimeUnit::Millisecond,
vec![
((15, 20), vec![]),
((10, 15), vec![]),
((5, 10), vec![]),
((0, 10), vec![]),
],
true,
vec![],
),
];
@@ -487,7 +657,7 @@ mod test {
opt: SortOptions,
expected_output: Vec<DfRecordBatch>,
) {
let (_ranges, batches): (Vec<_>, Vec<_>) = input_ranged_data.clone().into_iter().unzip();
let (ranges, batches): (Vec<_>, Vec<_>) = input_ranged_data.clone().into_iter().unzip();
let batches = batches
.into_iter()
@@ -498,14 +668,14 @@ mod test {
.collect_vec();
let mock_input = MockInputExec::new(batches, schema.clone());
let exec = PartSortExec::try_new(
let exec = PartSortExec::new(
PhysicalSortExpr {
expr: Arc::new(Column::new("ts", 0)),
options: opt,
},
vec![ranges],
Arc::new(mock_input),
)
.unwrap();
);
let exec_stream = exec.execute(0, Arc::new(TaskContext::default())).unwrap();
@@ -513,6 +683,7 @@ mod test {
// a makeshift solution for comparing large data
if real_output != expected_output {
let mut full_msg = String::new();
{
let mut buf = Vec::with_capacity(10 * real_output.len());
for batch in &real_output {
@@ -523,8 +694,9 @@ mod test {
buf.append(&mut rb_json);
buf.push(b',');
}
let buf = String::from_utf8_lossy(&buf);
error!("case_id:{case_id}, real_output: [{buf}]");
// TODO(discord9): better ways to print buf
let _buf = String::from_utf8_lossy(&buf);
full_msg += &format!("case_id:{case_id}, real_output");
}
{
let mut buf = Vec::with_capacity(10 * real_output.len());
@@ -536,10 +708,13 @@ mod test {
buf.append(&mut rb_json);
buf.push(b',');
}
let buf = String::from_utf8_lossy(&buf);
error!("case_id:{case_id}, expected_output: [{buf}]");
let _buf = String::from_utf8_lossy(&buf);
full_msg += &format!("case_id:{case_id}, expected_output");
}
panic!("case_{} failed, opt: {:?}", case_id, opt);
panic!(
"case_{} failed, opt: {:?}, full msg: {}",
case_id, opt, full_msg
);
}
}
}

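The `poll_next_inner` change above relies on an input protocol: batches arrive in `PartitionRange` order and an empty RecordBatch marks the end of the current range; with this fix, empty parts are skipped instead of emitted. A toy model of that protocol, with plain `Vec<i64>` batches standing in for RecordBatches:

fn part_sort(input: Vec<Vec<i64>>) -> Vec<Vec<i64>> {
    let mut out = Vec::new();
    let mut buffer: Vec<i64> = Vec::new();
    for batch in input {
        if batch.is_empty() {
            // boundary: sort and emit the buffered part, unless it is empty
            if !buffer.is_empty() {
                buffer.sort();
                out.push(std::mem::take(&mut buffer));
            }
            continue; // empty part: keep consuming the next part
        }
        buffer.extend(batch);
    }
    if !buffer.is_empty() {
        buffer.sort();
        out.push(buffer);
    }
    out
}

fn main() {
    // two ranges; the second one is empty and produces no output
    let input = vec![vec![3, 1], vec![2], vec![], vec![]];
    assert_eq!(part_sort(input), vec![vec![1, 2, 3]]);
}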

@@ -49,6 +49,7 @@ use crate::optimizer::remove_duplicate::RemoveDuplicate;
use crate::optimizer::scan_hint::ScanHintRule;
use crate::optimizer::string_normalization::StringNormalizationRule;
use crate::optimizer::type_conversion::TypeConversionRule;
use crate::optimizer::windowed_sort::WindowedSortPhysicalRule;
use crate::optimizer::ExtensionAnalyzerRule;
use crate::query_engine::options::QueryOptions;
use crate::query_engine::DefaultSerializer;
@@ -120,6 +121,10 @@ impl QueryEngineState {
physical_optimizer
.rules
.insert(0, Arc::new(ParallelizeScan));
// Add rule for windowed sort
physical_optimizer
.rules
.push(Arc::new(WindowedSortPhysicalRule));
// Add rule to remove duplicate nodes generated by other rules. Run this last.
physical_optimizer.rules.push(Arc::new(RemoveDuplicate));


@@ -21,10 +21,6 @@ use std::pin::Pin;
use std::sync::Arc;
use std::task::{Context, Poll};
use arrow::array::types::{
TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
TimestampSecondType,
};
use arrow::array::{Array, ArrayRef, PrimitiveArray};
use arrow::compute::SortColumn;
use arrow_schema::{DataType, SchemaRef, SortOptions};
@@ -58,30 +54,57 @@ use crate::error::{QueryExecutionSnafu, Result};
///
/// Internally, it calls [`streaming_merge`] multiple times to merge multiple sorted "working ranges"
///
/// the input stream must be concated in the order of `PartitionRange` in `ranges`(and each `PartitionRange` is sorted within itself), or else the result will be incorrect
/// # Invariants Promised on the Input Stream
/// 1. The input stream must be sorted by timestamp and
/// 2. in the order of `PartitionRange` in `ranges`
/// 3. Each `PartitionRange` is sorted within itself (ascending or descending), but ranges need not be sorted across each other
/// 4. No RecordBatch in the input stream may cross multiple `PartitionRange`s
///
/// TODO(discord9): fix item 4, but since we only use `PartSort` as input, this might not be a problem
#[derive(Debug, Clone)]
pub struct WindowedSortExec {
/// Physical sort expressions(that is, sort by timestamp)
expression: PhysicalSortExpr,
/// Optional number of rows to fetch. Stops producing rows after this fetch
fetch: Option<usize>,
/// The input ranges indicate input stream will be composed of those ranges in given order
ranges: Vec<PartitionRange>,
/// The input ranges indicate input stream will be composed of those ranges in given order.
///
/// Each partition has one vector of `PartitionRange`.
ranges: Vec<Vec<PartitionRange>>,
/// All available working ranges and their corresponding working set
///
/// working ranges promise once input stream get a value out of current range, future values will never be in this range
all_avail_working_range: Vec<(TimeRange, BTreeSet<usize>)>,
/// working ranges promise that once the input stream gets a value out of the current range, future values
/// will never be in this range. Each partition has one vector of ranges.
all_avail_working_range: Vec<Vec<(TimeRange, BTreeSet<usize>)>>,
input: Arc<dyn ExecutionPlan>,
/// Execution metrics
metrics: ExecutionPlanMetricsSet,
properties: PlanProperties,
}
fn check_partition_range_monotonicity(ranges: &[PartitionRange], descending: bool) -> bool {
fn check_partition_range_monotonicity(
ranges: &[Vec<PartitionRange>],
descending: bool,
) -> Result<()> {
let is_valid = ranges.iter().all(|r| {
if descending {
ranges.windows(2).all(|w| w[0].end >= w[1].end)
r.windows(2).all(|w| w[0].end >= w[1].end)
} else {
ranges.windows(2).all(|w| w[0].start <= w[1].start)
r.windows(2).all(|w| w[0].start <= w[1].start)
}
});
if !is_valid {
let msg = if descending {
"Input `PartitionRange`s's upper bound is not monotonic non-increase"
} else {
"Input `PartitionRange`s's lower bound is not monotonic non-decrease"
};
let plain_error = PlainError::new(msg.to_string(), StatusCode::Unexpected);
Err(BoxedError::new(plain_error)).context(QueryExecutionSnafu {})
} else {
Ok(())
}
}
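A standalone restatement of the invariant `check_partition_range_monotonicity` enforces, with `(start, end)` tuples standing in for `PartitionRange`:

// For ascending sorts, lower bounds must be non-decreasing across ranges;
// for descending sorts, upper bounds must be non-increasing.
fn is_monotonic(ranges: &[(i64, i64)], descending: bool) -> bool {
    if descending {
        ranges.windows(2).all(|w| w[0].1 >= w[1].1)
    } else {
        ranges.windows(2).all(|w| w[0].0 <= w[1].0)
    }
}

fn main() {
    assert!(is_monotonic(&[(0, 10), (5, 15), (5, 20)], false));
    assert!(!is_monotonic(&[(5, 15), (0, 10)], false)); // lower bounds go backwards
}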
@@ -89,28 +112,28 @@ impl WindowedSortExec {
pub fn try_new(
expression: PhysicalSortExpr,
fetch: Option<usize>,
ranges: Vec<PartitionRange>,
ranges: Vec<Vec<PartitionRange>>,
input: Arc<dyn ExecutionPlan>,
) -> Result<Self> {
if !check_partition_range_monotonicity(&ranges, expression.options.descending) {
let msg = if expression.options.descending {
"Input `PartitionRange`s's upper bound is not monotonic non-increase"
} else {
"Input `PartitionRange`s's lower bound is not monotonic non-decrease"
};
let plain_error = PlainError::new(msg.to_string(), StatusCode::Unexpected);
return Err(BoxedError::new(plain_error)).context(QueryExecutionSnafu {});
}
check_partition_range_monotonicity(&ranges, expression.options.descending)?;
let properties = PlanProperties::new(
input.equivalence_properties().clone(),
input
.equivalence_properties()
.clone()
.with_reorder(vec![expression.clone()]),
input.output_partitioning().clone(),
input.execution_mode(),
);
let overlap_counts = split_overlapping_ranges(&ranges);
let all_avail_working_range =
let mut all_avail_working_range = Vec::with_capacity(ranges.len());
for r in &ranges {
let overlap_counts = split_overlapping_ranges(r);
let working_ranges =
compute_all_working_ranges(&overlap_counts, expression.options.descending);
all_avail_working_range.push(working_ranges);
}
Ok(Self {
expression,
fetch,
@@ -123,7 +146,8 @@ impl WindowedSortExec {
}
/// While receiving partially-sorted RecordBatches, we need to update the working set, which is the
/// `PartitionRange` we think those RecordBatch belongs to. And when we receive something outside of working set, we can merge results before whenever possible.
/// `PartitionRange` we think those RecordBatches belong to. When we receive something outside
/// of the working set, we can merge earlier results whenever possible.
pub fn to_stream(
&self,
context: Arc<TaskContext>,
@@ -132,7 +156,12 @@ impl WindowedSortExec {
let input_stream: DfSendableRecordBatchStream =
self.input.execute(partition, context.clone())?;
let df_stream = Box::pin(WindowedSortStream::new(context, self, input_stream)) as _;
let df_stream = Box::pin(WindowedSortStream::new(
context,
self,
input_stream,
partition,
)) as _;
Ok(df_stream)
}
@@ -190,6 +219,15 @@ impl ExecutionPlan for WindowedSortExec {
fn metrics(&self) -> Option<MetricsSet> {
Some(self.metrics.clone_inner())
}
/// # Explain
///
/// This plan needs to be executed on each partition independently,
/// and is expected to run directly on the storage engine's output
/// distribution / partition.
fn benefits_from_input_partitioning(&self) -> Vec<bool> {
vec![false; self.ranges.len()]
}
}
/// The core logic of merge-sorting multiple sorted ranges
@@ -215,8 +253,8 @@ pub struct WindowedSortStream {
working_idx: usize,
/// input stream, assumed to be read in the order of `PartitionRange`
input: DfSendableRecordBatchStream,
/// Whether the input stream is terminated, if it did, we should poll input again
is_input_terminated: bool,
/// Whether this stream is terminated, e.g. because the limit was reached or the input stream is done.
is_terminated: bool,
/// Output Schema, which is the same as input schema, since this is a sort plan
schema: SchemaRef,
/// Physical sort expressions(that is, sort by timestamp)
@@ -231,8 +269,10 @@ pub struct WindowedSortStream {
///
/// working ranges promise that once the input stream gets a value out of the current range, future values will never be in this range
all_avail_working_range: Vec<(TimeRange, BTreeSet<usize>)>,
/// The input partition ranges
ranges: Vec<PartitionRange>,
/// Execution metrics
metrics: ExecutionPlanMetricsSet,
metrics: BaselineMetrics,
}
impl WindowedSortStream {
@@ -240,6 +280,7 @@ impl WindowedSortStream {
context: Arc<TaskContext>,
exec: &WindowedSortExec,
input: DfSendableRecordBatchStream,
partition: usize,
) -> Self {
Self {
memory_pool: context.runtime_env().memory_pool.clone(),
@@ -251,18 +292,79 @@ impl WindowedSortStream {
working_idx: 0,
schema: input.schema(),
input,
is_input_terminated: false,
is_terminated: false,
expression: exec.expression.clone(),
fetch: exec.fetch,
produced: 0,
batch_size: context.session_config().batch_size(),
all_avail_working_range: exec.all_avail_working_range.clone(),
metrics: exec.metrics.clone(),
all_avail_working_range: exec.all_avail_working_range[partition].clone(),
ranges: exec.ranges[partition].clone(),
metrics: BaselineMetrics::new(&exec.metrics, partition),
}
}
}
impl WindowedSortStream {
#[cfg(debug_assertions)]
fn check_subset_ranges(&self, cur_range: &TimeRange) {
let cur_is_subset_to = self
.ranges
.iter()
.filter(|r| cur_range.is_subset(&TimeRange::from(*r)))
.collect_vec();
if cur_is_subset_to.is_empty() {
error!("Current range is not a subset of any PartitionRange");
// find all ranges that are subsets of the current range
let subset_ranges = self
.ranges
.iter()
.filter(|r| TimeRange::from(*r).is_subset(cur_range))
.collect_vec();
let only_overlap = self
.ranges
.iter()
.filter(|r| {
let r = TimeRange::from(*r);
r.is_overlapping(cur_range) && !r.is_subset(cur_range)
})
.collect_vec();
error!(
"Bad input, found {} ranges that are subset of current range, also found {} ranges that only overlap, subset ranges are: {:?}; overlap ranges are: {:?}",
subset_ranges.len(),
only_overlap.len(),
subset_ranges,
only_overlap
);
} else {
let only_overlap = self
.ranges
.iter()
.filter(|r| {
let r = TimeRange::from(*r);
r.is_overlapping(cur_range) && !cur_range.is_subset(&r)
})
.collect_vec();
error!(
"Found current range to be subset of {} ranges, also found {} ranges that only overlap, of subset ranges are:{:?}; overlap ranges are: {:?}",
cur_is_subset_to.len(),
only_overlap.len(),
cur_is_subset_to,
only_overlap
);
}
let all_overlap_working_range = self
.all_avail_working_range
.iter()
.filter(|(range, _)| range.is_overlapping(cur_range))
.map(|(range, _)| range)
.collect_vec();
error!(
"Found {} working ranges that overlap with current range: {:?}",
all_overlap_working_range.len(),
all_overlap_working_range
);
}
/// Poll the next RecordBatch from the merge-sort's output stream
fn poll_result_stream(
&mut self,
@@ -273,6 +375,7 @@ impl WindowedSortStream {
Poll::Ready(Some(Ok(batch))) => {
let ret = if let Some(remaining) = self.remaining_fetch() {
if remaining == 0 {
self.is_terminated = true;
None
} else if remaining < batch.num_rows() {
self.produced += remaining;
@@ -304,6 +407,7 @@ impl WindowedSortStream {
// if no output stream is available
Poll::Ready(None)
}
/// The core logic of merge-sorting multiple sorted ranges
///
/// We try to maximize the number of sorted runs we can merge in one go, while emitting the result as soon as possible.
@@ -314,7 +418,7 @@ impl WindowedSortStream {
// first check and send out the merge result
match self.poll_result_stream(cx) {
Poll::Ready(None) => {
if self.is_input_terminated {
if self.is_terminated {
return Poll::Ready(None);
}
}
@@ -322,7 +426,7 @@ impl WindowedSortStream {
};
// consume input stream
while !self.is_input_terminated {
while !self.is_terminated {
// then we get a new RecordBatch from input stream
let new_input_rbs = match self.input.as_mut().poll_next(cx) {
Poll::Ready(Some(Ok(batch))) => {
@@ -333,70 +437,66 @@ impl WindowedSortStream {
}
Poll::Ready(None) => {
// input stream is done, we need to merge sort the remaining working set
self.is_input_terminated = true;
self.is_terminated = true;
None
}
Poll::Pending => return Poll::Pending,
};
// The core logic to eargerly merge sort the working set
match new_input_rbs {
Some(SortedRunSet {
let Some(SortedRunSet {
runs_with_batch,
sort_column,
}) => {
}) = new_input_rbs
else {
// input stream is done, we need to merge sort the remaining working set
self.build_sorted_stream()?;
self.start_new_merge_sort()?;
continue;
};
// The core logic to eagerly merge-sort the working set
// compare with last_value to find the boundary, then merge runs if needed
// iterate over runs_with_batch to merge-sort; it might create zero or more streams to put into `sort_partition_rbs`
let mut last_remaining = None;
let mut run_iter = runs_with_batch.into_iter();
loop {
let (sorted_rb, run_info) = if let Some(r) = last_remaining.take() {
r
} else if let Some(r) = run_iter.next() {
r
} else {
let Some((sorted_rb, run_info)) = last_remaining.take().or_else(|| run_iter.next()) else {
break;
};
if sorted_rb.num_rows() == 0 {
continue;
}
// determine if this batch is in current working range
let cur_range = {
match run_info.get_time_range() {
Some(r) => r,
_ => internal_err!("Found NULL in time index column")?,
}
let Some(cur_range) = run_info.get_time_range() else {
internal_err!("Found NULL in time index column")?
};
let working_range = if let Some(r) = self.get_working_range() {
r
} else {
let Some(working_range) = self.get_working_range() else {
internal_err!("No working range found")?
};
// ensure the current batch is in the working range
if sort_column.options.unwrap_or_default().descending {
if cur_range.end > working_range.end {
error!("Invalid range: {:?} > {:?}", cur_range, working_range);
#[cfg(debug_assertions)]
self.check_subset_ranges(&cur_range);
internal_err!("Current batch have data on the right side of working range, something is very wrong")?;
}
} else if cur_range.start < working_range.start {
error!("Invalid range: {:?} < {:?}", cur_range, working_range);
#[cfg(debug_assertions)]
self.check_subset_ranges(&cur_range);
internal_err!("Current batch have data on the left side of working range, something is very wrong")?;
}
if cur_range.is_subset(&working_range) {
// data still in range, can't merge sort yet
// see if can concat entire sorted rb, merge sort need to wait
self.try_concat_batch(
sorted_rb.clone(),
&run_info,
sort_column.options,
)?;
self.try_concat_batch(sorted_rb.clone(), &run_info, sort_column.options)?;
} else if let Some(intersection) = cur_range.intersection(&working_range) {
// slice rb by intersection and concat it then merge sort
let cur_sort_column =
sort_column.values.slice(run_info.offset, run_info.len);
let cur_sort_column = sort_column.values.slice(run_info.offset, run_info.len);
let (offset, len) = find_slice_from_range(
&SortColumn {
values: cur_sort_column.clone(),
@@ -419,13 +519,11 @@ impl WindowedSortStream {
// since we are crossing working range, we need to merge sort the working set
self.start_new_merge_sort()?;
let (r_offset, r_len) =
(offset + len, sorted_rb.num_rows() - offset - len);
let (r_offset, r_len) = (offset + len, sorted_rb.num_rows() - offset - len);
if r_len != 0 {
// we have remaining data in this batch, put it back to input queue
let remaining_rb = sorted_rb.slice(r_offset, r_len);
let new_first_val =
get_timestamp_from_idx(&cur_sort_column, r_offset)?;
let new_first_val = get_timestamp_from_idx(&cur_sort_column, r_offset)?;
let new_run_info = SucRun {
offset: run_info.offset + r_offset,
len: r_len,
@@ -450,14 +548,6 @@ impl WindowedSortStream {
}
}
}
None => {
// input stream is done, we need to merge sort the remaining working set
// and emit the result
self.build_sorted_stream()?;
self.start_new_merge_sort()?;
}
}
}
// emit the merge result
self.poll_result_stream(cx)
}
@@ -540,7 +630,7 @@ impl WindowedSortStream {
streams,
self.schema(),
&[self.expression.clone()],
BaselineMetrics::new(&self.metrics, 0),
self.metrics.clone(),
self.batch_size,
fetch,
reservation,
@@ -562,10 +652,11 @@ impl Stream for WindowedSortStream {
type Item = datafusion_common::Result<DfRecordBatch>;
fn poll_next(
self: Pin<&mut Self>,
mut self: Pin<&mut Self>,
cx: &mut Context<'_>,
) -> Poll<Option<datafusion_common::Result<DfRecordBatch>>> {
self.poll_next_inner(cx)
let result = self.as_mut().poll_next_inner(cx);
self.metrics.record_poll(result)
}
}
@@ -624,21 +715,22 @@ fn split_batch_to_sorted_run(
/// Downcast a temporal array to a specific type
///
/// Usage is similar to `downcast_primitive!` in the `arrow-array` crate
#[macro_export]
macro_rules! downcast_ts_array {
($data_type:expr => ($m:path $(, $args:tt)*), $($p:pat => $fallback:expr $(,)*)*) =>
{
match $data_type {
arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Second, _) => {
$m!(TimestampSecondType $(, $args)*)
$m!(arrow::datatypes::TimestampSecondType, arrow_schema::TimeUnit::Second $(, $args)*)
}
arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _) => {
$m!(TimestampMillisecondType $(, $args)*)
$m!(arrow::datatypes::TimestampMillisecondType, arrow_schema::TimeUnit::Millisecond $(, $args)*)
}
arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, _) => {
$m!(TimestampMicrosecondType $(, $args)*)
$m!(arrow::datatypes::TimestampMicrosecondType, arrow_schema::TimeUnit::Microsecond $(, $args)*)
}
arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _) => {
$m!(TimestampNanosecondType $(, $args)*)
$m!(arrow::datatypes::TimestampNanosecondType, arrow_schema::TimeUnit::Nanosecond $(, $args)*)
}
$($p => $fallback,)*
}
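For context, the dispatch pattern `downcast_ts_array!` implements, reduced to a self-contained toy without the arrow crates; the real macro now forwards the fully qualified Arrow type together with the matching `TimeUnit`:

#[derive(Debug)]
enum TimeUnit {
    Second,
    Millisecond,
    Microsecond,
    Nanosecond,
}

// Dispatch on the unit and hand the helper macro both a concrete value type
// and the unit, mirroring the two-argument shape used above.
macro_rules! dispatch_ts {
    ($unit:expr => ($m:ident)) => {
        match $unit {
            TimeUnit::Second => $m!(i64, TimeUnit::Second),
            TimeUnit::Millisecond => $m!(i64, TimeUnit::Millisecond),
            TimeUnit::Microsecond => $m!(i64, TimeUnit::Microsecond),
            TimeUnit::Nanosecond => $m!(i64, TimeUnit::Nanosecond),
        }
    };
}

macro_rules! show {
    ($t:ty, $unit:expr) => {
        println!("value type {} at {:?}", stringify!($t), $unit)
    };
}

fn main() {
    dispatch_ts!(TimeUnit::Millisecond => (show));
}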
@@ -711,7 +803,7 @@ fn find_slice_from_range(
}
macro_rules! array_iter_helper {
($t:ty, $arr:expr) => {{
($t:ty, $unit:expr, $arr:expr) => {{
let typed = $arr.as_any().downcast_ref::<PrimitiveArray<$t>>().unwrap();
let iter = typed.iter().enumerate();
Box::new(iter) as Box<dyn Iterator<Item = (usize, Option<i64>)>>
@@ -993,7 +1085,8 @@ fn compute_all_working_ranges(
match &mut cur_range_set {
None => cur_range_set = Some((*range, BTreeSet::from_iter(set.iter().cloned()))),
Some((working_range, working_set)) => {
// if next overlap range have Partition that's is not last one in `working_set`(hence need to be read before merge sorting), and `working_set` have >1 count
// if the next overlapping range has a Partition that is not the last one in `working_set` (hence needs
// to be read before merge sorting), and `working_set` has more than one entry,
// we have to expand the current working range to cover it (and add its `set` to `working_set`)
// so that merge sort is possible
let need_expand = {
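To see why a working range may need to expand: with ranges [0, 10) and [5, 15), the segment [5, 10) is fed by both inputs, so it cannot be merge-sorted until both inputs have been read past it. A sketch that computes which inputs feed each boundary-delimited segment (`(start, end)` tuples stand in for `PartitionRange`):

fn overlap_segments(ranges: &[(i64, i64)]) -> Vec<((i64, i64), Vec<usize>)> {
    // collect and dedup all range boundaries
    let mut bounds: Vec<i64> = ranges.iter().flat_map(|&(s, e)| [s, e]).collect();
    bounds.sort();
    bounds.dedup();
    bounds
        .windows(2)
        .map(|w| {
            let seg = (w[0], w[1]);
            // every input whose range overlaps this segment must be read
            // before the segment can be merge-sorted
            let ids = ranges
                .iter()
                .enumerate()
                .filter(|(_, &(s, e))| s < seg.1 && seg.0 < e)
                .map(|(i, _)| i)
                .collect();
            (seg, ids)
        })
        .collect()
}

fn main() {
    for (seg, ids) in overlap_segments(&[(0, 10), (5, 15)]) {
        println!("segment [{}, {}): inputs {:?}", seg.0, seg.1, ids);
    }
    // segment [0, 5): inputs [0]
    // segment [5, 10): inputs [0, 1]
    // segment [10, 15): inputs [1]
}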
@@ -2410,7 +2503,7 @@ mod test {
let exec = WindowedSortExec::try_new(
self.expression.clone(),
self.fetch,
ranges,
vec![ranges],
Arc::new(mock_input),
)
.unwrap();
@@ -2441,6 +2534,78 @@ mod test {
vec![],
vec![],
),
TestStream::new(
Column::new("ts", 0),
SortOptions {
descending: false,
nulls_first: true,
},
None,
vec![Field::new(
"ts",
DataType::Timestamp(TimeUnit::Millisecond, None),
false,
)],
vec![
// test one empty
(
PartitionRange {
start: Timestamp::new_millisecond(1),
end: Timestamp::new_millisecond(2),
num_rows: 1,
identifier: 0,
},
vec![Arc::new(TimestampMillisecondArray::from_iter_values([]))],
),
(
PartitionRange {
start: Timestamp::new_millisecond(1),
end: Timestamp::new_millisecond(3),
num_rows: 1,
identifier: 0,
},
vec![Arc::new(TimestampMillisecondArray::from_iter_values([2]))],
),
],
vec![vec![Arc::new(TimestampMillisecondArray::from_iter_values(
[2],
))]],
),
TestStream::new(
Column::new("ts", 0),
SortOptions {
descending: false,
nulls_first: true,
},
None,
vec![Field::new(
"ts",
DataType::Timestamp(TimeUnit::Millisecond, None),
false,
)],
vec![
// test one empty
(
PartitionRange {
start: Timestamp::new_millisecond(1),
end: Timestamp::new_millisecond(2),
num_rows: 1,
identifier: 0,
},
vec![Arc::new(TimestampMillisecondArray::from_iter_values([]))],
),
(
PartitionRange {
start: Timestamp::new_millisecond(1),
end: Timestamp::new_millisecond(3),
num_rows: 1,
identifier: 0,
},
vec![Arc::new(TimestampMillisecondArray::from_iter_values([]))],
),
],
vec![],
),
TestStream::new(
Column::new("ts", 0),
SortOptions {


@@ -205,7 +205,7 @@ pub struct ScannerProperties {
total_rows: usize,
/// Whether to yield an empty batch to distinguish partition ranges.
distinguish_partition_range: bool,
pub distinguish_partition_range: bool,
}
impl ScannerProperties {
@@ -227,12 +227,6 @@ impl ScannerProperties {
self
}
/// Sets distinguish partition range for scanner.
pub fn with_distinguish_partition_range(mut self, distinguish_partition_range: bool) -> Self {
self.distinguish_partition_range = distinguish_partition_range;
self
}
/// Creates a new [`ScannerProperties`] with the given partitioning.
pub fn new(partitions: Vec<Vec<PartitionRange>>, append_mode: bool, total_rows: usize) -> Self {
Self {
@@ -274,7 +268,11 @@ pub trait RegionScanner: Debug + DisplayAs + Send {
/// Prepares the scanner with the given partition ranges.
///
/// This method is for the planner to adjust the scanner's behavior based on the partition ranges.
fn prepare(&mut self, ranges: Vec<Vec<PartitionRange>>) -> Result<(), BoxedError>;
fn prepare(
&mut self,
ranges: Vec<Vec<PartitionRange>>,
distinguish_partition_range: bool,
) -> Result<(), BoxedError>;
/// Scans the partition and returns a stream of record batches.
///
@@ -441,8 +439,13 @@ impl RegionScanner for SinglePartitionScanner {
self.schema.clone()
}
fn prepare(&mut self, ranges: Vec<Vec<PartitionRange>>) -> Result<(), BoxedError> {
fn prepare(
&mut self,
ranges: Vec<Vec<PartitionRange>>,
distinguish_partition_range: bool,
) -> Result<(), BoxedError> {
self.properties.partitions = ranges;
self.properties.distinguish_partition_range = distinguish_partition_range;
Ok(())
}

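A minimal sketch of what implementors of the changed `prepare` signature now do; `Properties`, `MyScanner`, and the error type are simplified stand-ins, not the crate's actual types:

struct Properties {
    partitions: Vec<Vec<(i64, i64)>>, // stand-in for Vec<Vec<PartitionRange>>
    distinguish_partition_range: bool,
}

struct MyScanner {
    properties: Properties,
}

impl MyScanner {
    // Mirrors the trait change: the planner now passes the flag alongside
    // the ranges, and implementors record both.
    fn prepare(
        &mut self,
        ranges: Vec<Vec<(i64, i64)>>,
        distinguish_partition_range: bool,
    ) -> Result<(), String> {
        self.properties.partitions = ranges;
        self.properties.distinguish_partition_range = distinguish_partition_range;
        Ok(())
    }
}

fn main() {
    let mut scanner = MyScanner {
        properties: Properties {
            partitions: vec![],
            distinguish_partition_range: false,
        },
    };
    scanner.prepare(vec![vec![(0, 10)]], true).unwrap();
    assert!(scanner.properties.distinguish_partition_range);
}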

@@ -22,6 +22,7 @@ use common_error::ext::BoxedError;
use common_recordbatch::{DfRecordBatch, DfSendableRecordBatchStream, SendableRecordBatchStream};
use common_telemetry::tracing::Span;
use common_telemetry::tracing_context::TracingContext;
use common_telemetry::warn;
use datafusion::error::Result as DfResult;
use datafusion::execution::context::TaskContext;
use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet};
@@ -49,6 +50,7 @@ pub struct RegionScanExec {
properties: PlanProperties,
append_mode: bool,
total_rows: usize,
is_partition_set: bool,
}
impl RegionScanExec {
@@ -77,15 +79,10 @@ impl RegionScanExec {
properties,
append_mode,
total_rows,
is_partition_set: false,
}
}
/// Set the expected output ordering for the plan.
pub fn with_output_ordering(mut self, output_ordering: Vec<PhysicalSortExpr>) -> Self {
self.output_ordering = Some(output_ordering);
self
}
/// Get the partition ranges of the scanner. This method will collapse the ranges into
/// a single vector.
pub fn get_partition_ranges(&self) -> Vec<PartitionRange> {
@@ -101,18 +98,33 @@ impl RegionScanExec {
ranges
}
/// Similar to [`Self::get_partition_ranges`] but doesn't collapse the ranges.
pub fn get_uncollapsed_partition_ranges(&self) -> Vec<Vec<PartitionRange>> {
let scanner = self.scanner.lock().unwrap();
scanner.properties().partitions.clone()
}
pub fn is_partition_set(&self) -> bool {
self.is_partition_set
}
/// Update the partition ranges of underlying scanner.
pub fn with_new_partitions(
&self,
partitions: Vec<Vec<PartitionRange>>,
) -> Result<Self, BoxedError> {
if self.is_partition_set {
warn!("Setting partition ranges more than once for RegionScanExec");
}
let num_partitions = partitions.len();
let mut properties = self.properties.clone();
properties.partitioning = Partitioning::UnknownPartitioning(num_partitions);
{
let mut scanner = self.scanner.lock().unwrap();
scanner.prepare(partitions)?;
let distinguish_partition_range = scanner.properties().distinguish_partition_range();
scanner.prepare(partitions, distinguish_partition_range)?;
}
Ok(Self {
@@ -123,8 +135,25 @@ impl RegionScanExec {
properties,
append_mode: self.append_mode,
total_rows: self.total_rows,
is_partition_set: true,
})
}
pub fn with_distinguish_partition_range(&self, distinguish_partition_range: bool) {
let mut scanner = self.scanner.lock().unwrap();
let partition_ranges = scanner.properties().partitions.clone();
// setting distinguish_partition_range won't fail
let _ = scanner.prepare(partition_ranges, distinguish_partition_range);
}
pub fn time_index(&self) -> Option<String> {
self.scanner
.lock()
.unwrap()
.schema()
.timestamp_column()
.map(|x| x.name.clone())
}
}
impl ExecutionPlan for RegionScanExec {


@@ -189,6 +189,7 @@ async fn validate_values(
"SELECT {} FROM {} ORDER BY {}",
column_list, logical_table_ctx.name, primary_keys_column_list
);
info!("Select SQL: {select_sql}");
let fetched_rows = validator::row::fetch_values(&ctx.greptime, select_sql.as_str()).await?;
let mut expected_rows =
replace_default(&insert_expr.values_list, &logical_table_ctx, insert_expr);
@@ -214,6 +215,8 @@ async fn insert_values<R: Rng + 'static>(
.await
.context(error::ExecuteQuerySnafu { sql: &sql })?;
info!("Insert values, result: {result:?}");
ensure!(
result.rows_affected() == rows as u64,
error::AssertSnafu {


@@ -0,0 +1,81 @@
CREATE TABLE test(i INTEGER, t TIMESTAMP TIME INDEX);
Affected Rows: 0
INSERT INTO test VALUES (1, 1), (NULL, 2), (1, 3);
Affected Rows: 3
ADMIN FLUSH_TABLE('test');
+---------------------------+
| ADMIN FLUSH_TABLE('test') |
+---------------------------+
| 0 |
+---------------------------+
INSERT INTO test VALUES (2, 4), (2, 5), (NULL, 6);
Affected Rows: 3
ADMIN FLUSH_TABLE('test');
+---------------------------+
| ADMIN FLUSH_TABLE('test') |
+---------------------------+
| 0 |
+---------------------------+
INSERT INTO test VALUES (3, 7), (3, 8), (3, 9);
Affected Rows: 3
ADMIN FLUSH_TABLE('test');
+---------------------------+
| ADMIN FLUSH_TABLE('test') |
+---------------------------+
| 0 |
+---------------------------+
INSERT INTO test VALUES (4, 10), (4, 11), (4, 12);
Affected Rows: 3
SELECT * FROM test ORDER BY t LIMIT 5;
+---+-------------------------+
| i | t |
+---+-------------------------+
| 1 | 1970-01-01T00:00:00.001 |
| | 1970-01-01T00:00:00.002 |
| 1 | 1970-01-01T00:00:00.003 |
| 2 | 1970-01-01T00:00:00.004 |
| 2 | 1970-01-01T00:00:00.005 |
+---+-------------------------+
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE SELECT * FROM test ORDER BY t LIMIT 5;
+-+-+-+
| stage | node | plan_|
+-+-+-+
| 0_| 0_|_MergeScanExec: REDACTED
|_|_|_|
| 1_| 0_|_GlobalLimitExec: skip=0, fetch=5 REDACTED
|_|_|_SortPreservingMergeExec: [t@1 ASC NULLS LAST] REDACTED
|_|_|_WindowedSortExec REDACTED
|_|_|_PartSortExec t@1 ASC NULLS LAST REDACTED
|_|_|_SeqScan: region=REDACTED, partition_count=2 (1 memtable ranges, 1 file 1 ranges) REDACTED
|_|_|_|
|_|_| Total rows: 5_|
+-+-+-+
DROP TABLE test;
Affected Rows: 0


@@ -0,0 +1,26 @@
CREATE TABLE test(i INTEGER, t TIMESTAMP TIME INDEX);
INSERT INTO test VALUES (1, 1), (NULL, 2), (1, 3);
ADMIN FLUSH_TABLE('test');
INSERT INTO test VALUES (2, 4), (2, 5), (NULL, 6);
ADMIN FLUSH_TABLE('test');
INSERT INTO test VALUES (3, 7), (3, 8), (3, 9);
ADMIN FLUSH_TABLE('test');
INSERT INTO test VALUES (4, 10), (4, 11), (4, 12);
SELECT * FROM test ORDER BY t LIMIT 5;
-- SQLNESS REPLACE (-+) -
-- SQLNESS REPLACE (\s\s+) _
-- SQLNESS REPLACE (peers.*) REDACTED
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
EXPLAIN ANALYZE SELECT * FROM test ORDER BY t LIMIT 5;
DROP TABLE test;


@@ -183,6 +183,7 @@ TQL EXPLAIN VERBOSE (0, 10, '5s') test;
| physical_plan after LimitAggregation_| SAME TEXT AS ABOVE_|
| physical_plan after ProjectionPushdown_| SAME TEXT AS ABOVE_|
| physical_plan after PipelineChecker_| SAME TEXT AS ABOVE_|
| physical_plan after WindowedSortRule_| SAME TEXT AS ABOVE_|
| physical_plan after RemoveDuplicateRule_| SAME TEXT AS ABOVE_|
| physical_plan_| PromInstantManipulateExec: range=[0..0], lookback=[300000], interval=[300000], time index=[j]_|
|_|_PromSeriesNormalizeExec: offset=[0], time index=[j], filter NaN: [false]_|