feat: optimizer rule for windowed sort (#4874)

* basic impl

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* implement physical rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* feat: install windowed sort physical rule and optimize partition ranges

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add logs and sqlness test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* feat: introduce PartSortExec for partitioned sorting

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* tune exec nodes' properties and metrics

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* clean up

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix typo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* debug: add more info on very wrong

* debug: also print overlap ranges

* feat: add check when emit PartSort Stream

* dbg: info on overlap working range

* feat: check batch range is inside part range

* set distinguish partition range param

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* chore: more logs

* update sqlness

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* tune optimizer

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* clean up

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix lints

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix windowed sort rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix: early terminate sort stream

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* chore: remove min/max check

* chore: remove unused windowed_sort module, uuid feature and refactor region_scanner to synchronous

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* chore: print more fuzz log

* chore: more log

* fix: part sort should skip empty part

* chore: remove insert logs

* tests: empty PartitionRange

* refactor: testcase

* docs: update comment&tests: all empty

* ci: enlarge etcd cpu limit

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: discord9 <discord9@163.com>
Co-authored-by: evenyag <realevenyag@gmail.com>
This commit is contained in:
Ruihang Xia
2024-10-29 15:46:05 +08:00
committed by GitHub
parent 0ee455a980
commit 03f2fa219d
21 changed files with 930 additions and 230 deletions

View File

@@ -22,6 +22,7 @@ use common_error::ext::BoxedError;
use common_recordbatch::{DfRecordBatch, DfSendableRecordBatchStream, SendableRecordBatchStream};
use common_telemetry::tracing::Span;
use common_telemetry::tracing_context::TracingContext;
use common_telemetry::warn;
use datafusion::error::Result as DfResult;
use datafusion::execution::context::TaskContext;
use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet};
@@ -49,6 +50,7 @@ pub struct RegionScanExec {
properties: PlanProperties,
append_mode: bool,
total_rows: usize,
is_partition_set: bool,
}
impl RegionScanExec {
@@ -77,15 +79,10 @@ impl RegionScanExec {
properties,
append_mode,
total_rows,
is_partition_set: false,
}
}
/// Set the expected output ordering for the plan.
///
/// Takes `self` by value, stores `output_ordering` (wrapped in `Some`) in the
/// `output_ordering` field, and returns the updated value so the call can be chained.
pub fn with_output_ordering(mut self, output_ordering: Vec<PhysicalSortExpr>) -> Self {
self.output_ordering = Some(output_ordering);
self
}
/// Get the partition ranges of the scanner. This method will collapse the ranges into
/// a single vector.
pub fn get_partition_ranges(&self) -> Vec<PartitionRange> {
@@ -101,18 +98,33 @@ impl RegionScanExec {
ranges
}
/// Returns a clone of the nested partition ranges held in the scanner's properties.
///
/// Unlike [`Self::get_partition_ranges`], the ranges are not collapsed into a single
/// vector.
///
/// # Panics
///
/// Panics if locking the scanner fails.
pub fn get_uncollapsed_partition_ranges(&self) -> Vec<Vec<PartitionRange>> {
    self.scanner.lock().unwrap().properties().partitions.clone()
}
/// Returns the `is_partition_set` flag of this plan.
///
/// The flag is initialized to `false` when the plan is constructed and is `true` on
/// the value returned by [`Self::with_new_partitions`].
pub fn is_partition_set(&self) -> bool {
self.is_partition_set
}
/// Update the partition ranges of underlying scanner.
pub fn with_new_partitions(
&self,
partitions: Vec<Vec<PartitionRange>>,
) -> Result<Self, BoxedError> {
if self.is_partition_set {
warn!("Setting partition ranges more than once for RegionScanExec");
}
let num_partitions = partitions.len();
let mut properties = self.properties.clone();
properties.partitioning = Partitioning::UnknownPartitioning(num_partitions);
{
let mut scanner = self.scanner.lock().unwrap();
scanner.prepare(partitions)?;
let distinguish_partition_range = scanner.properties().distinguish_partition_range();
scanner.prepare(partitions, distinguish_partition_range)?;
}
Ok(Self {
@@ -123,8 +135,25 @@ impl RegionScanExec {
properties,
append_mode: self.append_mode,
total_rows: self.total_rows,
is_partition_set: true,
})
}
pub fn with_distinguish_partition_range(&self, distinguish_partition_range: bool) {
let mut scanner = self.scanner.lock().unwrap();
let partition_ranges = scanner.properties().partitions.clone();
// set distinguish_partition_range won't fail
let _ = scanner.prepare(partition_ranges, distinguish_partition_range);
}
/// Returns the name of the timestamp column in the scanner's schema, or `None` when
/// the schema reports no timestamp column.
///
/// # Panics
///
/// Panics if locking the scanner fails.
pub fn time_index(&self) -> Option<String> {
    let scanner = self.scanner.lock().unwrap();
    let schema = scanner.schema();
    let column = schema.timestamp_column()?;
    Some(column.name.clone())
}
}
impl ExecutionPlan for RegionScanExec {