feat: make RegionScanner aware of PartitionRange (#4170)

* define PartitionRange

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add optimizer rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* implement interfaces

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* impl aggr stream

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add fallback method

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix tests

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix tests

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix typo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update sqlness result

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add document and rename struct

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add more comments

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix typo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
This commit is contained in:
Ruihang Xia
2024-06-21 17:54:22 +08:00
committed by GitHub
parent ac574b66ab
commit fce65c97e3
22 changed files with 615 additions and 127 deletions

View File

@@ -105,6 +105,10 @@ impl BoxedError {
inner: Box::new(err),
}
}
pub fn into_inner(self) -> Box<dyn crate::ext::ErrorExt + Send + Sync> {
self.inner
}
}
impl std::fmt::Debug for BoxedError {

View File

@@ -20,6 +20,7 @@ use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
use datafusion_common::ScalarValue;
use datatypes::prelude::ConcreteDataType;
use datatypes::schema::SchemaRef;
use snafu::{Location, Snafu};
pub type Result<T> = std::result::Result<T, Error>;
@@ -138,12 +139,28 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Cannot construct an empty stream"))]
EmptyStream {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Schema not match, left: {:?}, right: {:?}", left, right))]
SchemaNotMatch {
left: SchemaRef,
right: SchemaRef,
#[snafu(implicit)]
location: Location,
},
}
impl ErrorExt for Error {
fn status_code(&self) -> StatusCode {
match self {
Error::NewDfRecordBatch { .. } => StatusCode::InvalidArguments,
Error::NewDfRecordBatch { .. }
| Error::EmptyStream { .. }
| Error::SchemaNotMatch { .. } => StatusCode::InvalidArguments,
Error::DataTypes { .. }
| Error::CreateRecordBatches { .. }

View File

@@ -12,10 +12,20 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use futures::TryStreamExt;
use std::pin::Pin;
use std::sync::Arc;
use std::task::{Context, Poll};
use crate::error::Result;
use crate::{RecordBatch, RecordBatches, SendableRecordBatchStream};
use arc_swap::ArcSwapOption;
use datatypes::schema::SchemaRef;
use futures::{Stream, StreamExt, TryStreamExt};
use snafu::ensure;
use crate::adapter::RecordBatchMetrics;
use crate::error::{EmptyStreamSnafu, Result, SchemaNotMatchSnafu};
use crate::{
OrderOption, RecordBatch, RecordBatchStream, RecordBatches, SendableRecordBatchStream,
};
/// Collect all the items from the stream into a vector of [`RecordBatch`].
pub async fn collect(stream: SendableRecordBatchStream) -> Result<Vec<RecordBatch>> {
@@ -29,6 +39,91 @@ pub async fn collect_batches(stream: SendableRecordBatchStream) -> Result<Record
RecordBatches::try_new(schema, batches)
}
/// A stream that chains multiple streams into a single stream.
///
/// Inputs are polled one after another, in the order they were passed to
/// `new`: an input is only polled once every input before it has reported
/// end-of-stream.
pub struct ChainedRecordBatchStream {
/// The streams to drain, in output order.
inputs: Vec<SendableRecordBatchStream>,
/// Index into `inputs` of the stream that is currently being polled.
curr_index: usize,
/// Schema returned by `schema()`; `new` takes it from the first input.
schema: SchemaRef,
/// Metrics slot read by `metrics()`. `new` leaves it at its default value
/// and nothing in the visible part of this file stores into it.
metrics: Arc<ArcSwapOption<RecordBatchMetrics>>,
}
impl ChainedRecordBatchStream {
    /// Builds a stream that yields every batch of `inputs[0]`, then every
    /// batch of `inputs[1]`, and so on.
    ///
    /// # Errors
    ///
    /// * `EmptyStream` if `inputs` is empty.
    /// * `SchemaNotMatch` if any input's schema differs from the schema of the
    ///   first input.
    pub fn new(inputs: Vec<SendableRecordBatchStream>) -> Result<Self> {
        // check length
        ensure!(!inputs.is_empty(), EmptyStreamSnafu);

        // check schema: every input must agree with the first one, which also
        // becomes the schema of the chained stream.
        let first_schema = inputs[0].schema();
        for input in inputs.iter().skip(1) {
            let schema = input.schema();
            ensure!(
                first_schema == schema,
                SchemaNotMatchSnafu {
                    left: first_schema,
                    right: schema
                }
            );
        }

        Ok(Self {
            inputs,
            curr_index: 0,
            schema: first_schema,
            metrics: Default::default(),
        })
    }

    /// Polls the current input and moves on to the next one whenever the
    /// current input reports end-of-stream.
    ///
    /// Returns `Poll::Ready(None)` once every input has been exhausted. Items
    /// (both `Ok` and `Err`) and `Poll::Pending` are forwarded unchanged from
    /// the input being polled.
    fn sequence_poll(
        mut self: Pin<&mut Self>,
        ctx: &mut Context<'_>,
    ) -> Poll<Option<Result<RecordBatch>>> {
        // Iterate instead of recursing: a long run of already-empty inputs
        // would otherwise add one stack frame per input.
        while self.curr_index < self.inputs.len() {
            let curr_index = self.curr_index;
            match self.inputs[curr_index].poll_next_unpin(ctx) {
                Poll::Ready(Some(item)) => return Poll::Ready(Some(item)),
                Poll::Pending => return Poll::Pending,
                // Current input is drained; try the next one right away.
                Poll::Ready(None) => self.curr_index += 1,
            }
        }

        Poll::Ready(None)
    }
}
impl RecordBatchStream for ChainedRecordBatchStream {
    /// Fixed display name of this stream type.
    fn name(&self) -> &str {
        "ChainedRecordBatchStream"
    }

    /// Schema shared by all chained inputs (validated in `new`).
    fn schema(&self) -> SchemaRef {
        let schema = &self.schema;
        schema.clone()
    }

    /// This stream never reports an output ordering.
    fn output_ordering(&self) -> Option<&[OrderOption]> {
        None
    }

    /// Returns a copy of the metrics currently held in the metrics slot, if any.
    fn metrics(&self) -> Option<RecordBatchMetrics> {
        let snapshot = self.metrics.load();
        snapshot.as_deref().cloned()
    }
}
impl Stream for ChainedRecordBatchStream {
    type Item = Result<RecordBatch>;

    /// Delegates to `sequence_poll`, which drains the inputs one after another.
    fn poll_next(self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        Self::sequence_poll(self, ctx)
    }
}
#[cfg(test)]
mod tests {
use std::pin::Pin;