feat: make RegionScanner aware of PartitionRange (#4170)

* define PartitionRange

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add optimizer rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* implement interfaces

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* impl aggr stream

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add fallback method

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix tests

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix tests

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix typo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update sqlness result

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add document and rename struct

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add more comments

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix typo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
This commit is contained in:
Ruihang Xia
2024-06-21 17:54:22 +08:00
committed by GitHub
parent ac574b66ab
commit fce65c97e3
22 changed files with 615 additions and 127 deletions

View File

@@ -15,6 +15,7 @@ common-base.workspace = true
common-error.workspace = true
common-macro.workspace = true
common-recordbatch.workspace = true
common-time.workspace = true
common-wal.workspace = true
datafusion-expr.workspace = true
datafusion-physical-plan.workspace = true

View File

@@ -24,6 +24,7 @@ use async_trait::async_trait;
use common_error::ext::{BoxedError, PlainError};
use common_error::status_code::StatusCode;
use common_recordbatch::SendableRecordBatchStream;
use common_time::Timestamp;
use datafusion_physical_plan::{DisplayAs, DisplayFormatType};
use datatypes::schema::SchemaRef;
use futures::future::join_all;
@@ -141,42 +142,71 @@ impl ScannerPartitioning {
}
}
/// Represents one data range within a partition.
///
/// The range is described by a closed interval `[start, end]` over the
/// time index column, since both bounds are inclusive.
#[derive(Debug, Clone, Copy)]
pub struct PartitionRange {
/// Start time of the time index column. Inclusive.
pub start: Timestamp,
/// End time of the time index column. Inclusive.
pub end: Timestamp,
/// Estimated size of this range, used to balance ranges between partitions.
/// It has no base unit; it is just a number.
pub estimated_size: usize,
/// Identifier of this range. Assigned by the storage engine.
pub identifier: usize,
}
/// Properties of the [RegionScanner].
#[derive(Debug)]
pub struct ScannerProperties {
/// Partitions to scan.
partitioning: ScannerPartitioning,
/// A 2-dim partition ranges.
///
/// The first dim vector's length represents the output partition number. The second
/// dim is ranges within one partition.
pub partitions: Vec<Vec<PartitionRange>>,
}
impl ScannerProperties {
/// Creates a new [ScannerProperties] with the given partitioning.
pub fn new(partitioning: ScannerPartitioning) -> Self {
Self { partitioning }
/// Creates a new [`ScannerProperties`] with the given partitioning.
pub fn new(partitions: Vec<Vec<PartitionRange>>) -> Self {
Self { partitions }
}
/// Returns properties of partitions to scan.
pub fn partitioning(&self) -> &ScannerPartitioning {
&self.partitioning
/// Builds a [`ScannerProperties`] holding `partitions` output partitions,
/// each of which starts with no ranges assigned.
pub fn new_with_partitions(partitions: usize) -> Self {
    // One empty range list per requested partition.
    let empty_partitions = (0..partitions).map(|_| Vec::new()).collect();
    Self {
        partitions: empty_partitions,
    }
}
/// Returns the number of partitions, i.e. the length of the outer
/// `partitions` vector.
pub fn num_partitions(&self) -> usize {
self.partitions.len()
}
}
/// A scanner that provides a way to scan the region concurrently.
/// The scanner splits the region into partitions so that each partition can be scanned concurrently.
/// You can use this trait to implement an [ExecutionPlan](datafusion_physical_plan::ExecutionPlan).
pub trait RegionScanner: Debug + DisplayAs + Send + Sync {
/// You can use this trait to implement an [`ExecutionPlan`](datafusion_physical_plan::ExecutionPlan).
pub trait RegionScanner: Debug + DisplayAs + Send {
/// Returns the properties of the scanner.
fn properties(&self) -> &ScannerProperties;
/// Returns the schema of the record batches.
fn schema(&self) -> SchemaRef;
/// Prepares the scanner with the given partition ranges.
///
/// This method is for the planner to adjust the scanner's behavior based on the partition ranges.
fn prepare(&mut self, ranges: Vec<Vec<PartitionRange>>) -> Result<(), BoxedError>;
/// Scans the partition and returns a stream of record batches.
///
/// # Panics
/// Panics if the `partition` is out of bound.
fn scan_partition(&self, partition: usize) -> Result<SendableRecordBatchStream, BoxedError>;
}
pub type RegionScannerRef = Arc<dyn RegionScanner>;
pub type RegionScannerRef = Box<dyn RegionScanner>;
pub type BatchResponses = Vec<(RegionId, Result<RegionResponse, BoxedError>)>;
@@ -272,7 +302,7 @@ impl SinglePartitionScanner {
Self {
stream: Mutex::new(Some(stream)),
schema,
properties: ScannerProperties::new(ScannerPartitioning::Unknown(1)),
properties: ScannerProperties::new_with_partitions(1),
}
}
}
@@ -292,6 +322,11 @@ impl RegionScanner for SinglePartitionScanner {
self.schema.clone()
}
/// Replaces the scanner's properties with ones built from `ranges`.
///
/// This implementation always returns `Ok(())`.
fn prepare(&mut self, ranges: Vec<Vec<PartitionRange>>) -> Result<(), BoxedError> {
    let updated = ScannerProperties::new(ranges);
    self.properties = updated;
    Ok(())
}
fn scan_partition(&self, _partition: usize) -> Result<SendableRecordBatchStream, BoxedError> {
let mut stream = self.stream.lock().unwrap();
stream.take().ok_or_else(|| {