mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-05-20 23:10:37 +00:00
refactor: make table scan return physical plan (#326)
* refactor: return PhysicalPlan in Table trait's scan method, to support partitioned execution in Frontend's distribute read * refactor: pub use necessary DataFusion types * refactor: replace old "PhysicalPlan" and its adapters Co-authored-by: luofucong <luofucong@greptime.com> Co-authored-by: Yingwen <realevenyag@gmail.com>
This commit is contained in:
@@ -4,6 +4,7 @@ version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
async-trait = "0.1"
|
||||
common-error = { path = "../error" }
|
||||
common-recordbatch = { path = "../recordbatch" }
|
||||
common-time = { path = "../time" }
|
||||
|
||||
@@ -62,6 +62,36 @@ pub enum InnerError {
|
||||
|
||||
#[snafu(display("unexpected: not constant column"))]
|
||||
InvalidInputCol { backtrace: Backtrace },
|
||||
|
||||
#[snafu(display("Not expected to run ExecutionPlan more than once"))]
|
||||
ExecuteRepeatedly { backtrace: Backtrace },
|
||||
|
||||
#[snafu(display("General DataFusion error, source: {}", source))]
|
||||
GeneralDataFusion {
|
||||
source: DataFusionError,
|
||||
backtrace: Backtrace,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to execute DataFusion ExecutionPlan, source: {}", source))]
|
||||
DataFusionExecutionPlan {
|
||||
source: DataFusionError,
|
||||
backtrace: Backtrace,
|
||||
},
|
||||
|
||||
#[snafu(display(
|
||||
"Failed to convert DataFusion's recordbatch stream, source: {}",
|
||||
source
|
||||
))]
|
||||
ConvertDfRecordBatchStream {
|
||||
#[snafu(backtrace)]
|
||||
source: common_recordbatch::error::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to convert arrow schema, source: {}", source))]
|
||||
ConvertArrowSchema {
|
||||
#[snafu(backtrace)]
|
||||
source: DataTypeError,
|
||||
},
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
@@ -76,9 +106,17 @@ impl ErrorExt for InnerError {
|
||||
| InnerError::InvalidInputState { .. }
|
||||
| InnerError::InvalidInputCol { .. }
|
||||
| InnerError::BadAccumulatorImpl { .. } => StatusCode::EngineExecuteQuery,
|
||||
InnerError::InvalidInputs { source, .. } => source.status_code(),
|
||||
InnerError::IntoVector { source, .. } => source.status_code(),
|
||||
InnerError::FromScalarValue { source } => source.status_code(),
|
||||
|
||||
InnerError::InvalidInputs { source, .. }
|
||||
| InnerError::IntoVector { source, .. }
|
||||
| InnerError::FromScalarValue { source }
|
||||
| InnerError::ConvertArrowSchema { source } => source.status_code(),
|
||||
|
||||
InnerError::ExecuteRepeatedly { .. }
|
||||
| InnerError::GeneralDataFusion { .. }
|
||||
| InnerError::DataFusionExecutionPlan { .. } => StatusCode::Unexpected,
|
||||
|
||||
InnerError::ConvertDfRecordBatchStream { source, .. } => source.status_code(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -105,6 +143,7 @@ impl From<Error> for DataFusionError {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use arrow::error::ArrowError;
|
||||
use snafu::GenerateImplicitData;
|
||||
|
||||
use super::*;
|
||||
@@ -127,6 +166,48 @@ mod tests {
|
||||
.unwrap()
|
||||
.into();
|
||||
assert_error(&err, StatusCode::EngineExecuteQuery);
|
||||
|
||||
let err: Error = throw_df_error()
|
||||
.context(GeneralDataFusionSnafu)
|
||||
.err()
|
||||
.unwrap()
|
||||
.into();
|
||||
assert_error(&err, StatusCode::Unexpected);
|
||||
|
||||
let err: Error = throw_df_error()
|
||||
.context(DataFusionExecutionPlanSnafu)
|
||||
.err()
|
||||
.unwrap()
|
||||
.into();
|
||||
assert_error(&err, StatusCode::Unexpected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_execute_repeatedly_error() {
|
||||
let error: Error = None::<i32>
|
||||
.context(ExecuteRepeatedlySnafu)
|
||||
.err()
|
||||
.unwrap()
|
||||
.into();
|
||||
assert_eq!(error.inner.status_code(), StatusCode::Unexpected);
|
||||
assert!(error.backtrace_opt().is_some());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_convert_df_recordbatch_stream_error() {
|
||||
let result: std::result::Result<i32, common_recordbatch::error::Error> =
|
||||
Err(common_recordbatch::error::InnerError::PollStream {
|
||||
source: ArrowError::Overflow,
|
||||
backtrace: Backtrace::generate(),
|
||||
}
|
||||
.into());
|
||||
let error: Error = result
|
||||
.context(ConvertDfRecordBatchStreamSnafu)
|
||||
.err()
|
||||
.unwrap()
|
||||
.into();
|
||||
assert_eq!(error.inner.status_code(), StatusCode::Internal);
|
||||
assert!(error.backtrace_opt().is_some());
|
||||
}
|
||||
|
||||
fn raise_datatype_error() -> std::result::Result<(), DataTypeError> {
|
||||
|
||||
@@ -4,6 +4,7 @@ pub mod columnar_value;
|
||||
pub mod error;
|
||||
mod function;
|
||||
pub mod logical_plan;
|
||||
pub mod physical_plan;
|
||||
pub mod prelude;
|
||||
mod signature;
|
||||
|
||||
@@ -13,3 +14,5 @@ pub enum Output {
|
||||
RecordBatches(RecordBatches),
|
||||
Stream(SendableRecordBatchStream),
|
||||
}
|
||||
|
||||
pub use datafusion::physical_plan::ExecutionPlan as DfPhysicalPlan;
|
||||
|
||||
@@ -2,7 +2,7 @@ use datafusion::logical_plan::Expr as DfExpr;
|
||||
|
||||
/// Central struct of query API.
|
||||
/// Represent logical expressions such as `A + 1`, or `CAST(c1 AS int)`.
|
||||
#[derive(Clone, PartialEq, Hash)]
|
||||
#[derive(Clone, PartialEq, Hash, Debug)]
|
||||
pub struct Expr {
|
||||
df_expr: DfExpr,
|
||||
}
|
||||
|
||||
325
src/common/query/src/physical_plan.rs
Normal file
325
src/common/query/src/physical_plan.rs
Normal file
@@ -0,0 +1,325 @@
|
||||
use std::any::Any;
|
||||
use std::fmt::Debug;
|
||||
use std::sync::Arc;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use common_recordbatch::adapter::{DfRecordBatchStreamAdapter, RecordBatchStreamAdapter};
|
||||
use common_recordbatch::DfSendableRecordBatchStream;
|
||||
use common_recordbatch::SendableRecordBatchStream;
|
||||
use datafusion::arrow::datatypes::SchemaRef as DfSchemaRef;
|
||||
use datafusion::error::Result as DfResult;
|
||||
pub use datafusion::execution::runtime_env::RuntimeEnv;
|
||||
use datafusion::physical_plan::expressions::PhysicalSortExpr;
|
||||
pub use datafusion::physical_plan::Partitioning;
|
||||
use datafusion::physical_plan::Statistics;
|
||||
use datatypes::schema::SchemaRef;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::error::{self, Result};
|
||||
use crate::DfPhysicalPlan;
|
||||
|
||||
pub type PhysicalPlanRef = Arc<dyn PhysicalPlan>;
|
||||
|
||||
/// `PhysicalPlan` represent nodes in the Physical Plan.
|
||||
///
|
||||
/// Each `PhysicalPlan` is Partition-aware and is responsible for
|
||||
/// creating the actual `async` [`SendableRecordBatchStream`]s
|
||||
/// of [`RecordBatch`] that incrementally compute the operator's
|
||||
/// output from its input partition.
|
||||
#[async_trait]
|
||||
pub trait PhysicalPlan: Debug + Send + Sync {
|
||||
/// Returns the physical plan as [`Any`](std::any::Any) so that it can be
|
||||
/// downcast to a specific implementation.
|
||||
fn as_any(&self) -> &dyn Any;
|
||||
|
||||
/// Get the schema for this physical plan
|
||||
fn schema(&self) -> SchemaRef;
|
||||
|
||||
/// Specifies the output partitioning scheme of this plan
|
||||
fn output_partitioning(&self) -> Partitioning;
|
||||
|
||||
/// Get a list of child physical plans that provide the input for this plan. The returned list
|
||||
/// will be empty for leaf nodes, will contain a single value for unary nodes, or two
|
||||
/// values for binary nodes (such as joins).
|
||||
fn children(&self) -> Vec<PhysicalPlanRef>;
|
||||
|
||||
/// Returns a new plan where all children were replaced by new plans.
|
||||
/// The size of `children` must be equal to the size of `PhysicalPlan::children()`.
|
||||
fn with_new_children(&self, children: Vec<PhysicalPlanRef>) -> Result<PhysicalPlanRef>;
|
||||
|
||||
/// Creates an RecordBatch stream.
|
||||
async fn execute(
|
||||
&self,
|
||||
partition: usize,
|
||||
runtime: Arc<RuntimeEnv>,
|
||||
) -> Result<SendableRecordBatchStream>;
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PhysicalPlanAdapter {
|
||||
schema: SchemaRef,
|
||||
df_plan: Arc<dyn DfPhysicalPlan>,
|
||||
}
|
||||
|
||||
impl PhysicalPlanAdapter {
|
||||
pub fn new(schema: SchemaRef, df_plan: Arc<dyn DfPhysicalPlan>) -> Self {
|
||||
Self { schema, df_plan }
|
||||
}
|
||||
|
||||
pub fn df_plan(&self) -> Arc<dyn DfPhysicalPlan> {
|
||||
self.df_plan.clone()
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl PhysicalPlan for PhysicalPlanAdapter {
|
||||
fn as_any(&self) -> &dyn Any {
|
||||
self
|
||||
}
|
||||
|
||||
fn schema(&self) -> SchemaRef {
|
||||
self.schema.clone()
|
||||
}
|
||||
|
||||
fn output_partitioning(&self) -> Partitioning {
|
||||
self.df_plan.output_partitioning()
|
||||
}
|
||||
|
||||
fn children(&self) -> Vec<PhysicalPlanRef> {
|
||||
self.df_plan
|
||||
.children()
|
||||
.into_iter()
|
||||
.map(|x| Arc::new(PhysicalPlanAdapter::new(self.schema(), x)) as _)
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn with_new_children(&self, children: Vec<PhysicalPlanRef>) -> Result<PhysicalPlanRef> {
|
||||
let children = children
|
||||
.into_iter()
|
||||
.map(|x| Arc::new(DfPhysicalPlanAdapter(x)) as _)
|
||||
.collect();
|
||||
let plan = self
|
||||
.df_plan
|
||||
.with_new_children(children)
|
||||
.context(error::GeneralDataFusionSnafu)?;
|
||||
Ok(Arc::new(PhysicalPlanAdapter::new(self.schema(), plan)))
|
||||
}
|
||||
|
||||
async fn execute(
|
||||
&self,
|
||||
partition: usize,
|
||||
runtime: Arc<RuntimeEnv>,
|
||||
) -> Result<SendableRecordBatchStream> {
|
||||
let stream = self
|
||||
.df_plan
|
||||
.execute(partition, runtime)
|
||||
.await
|
||||
.context(error::DataFusionExecutionPlanSnafu)?;
|
||||
let stream = RecordBatchStreamAdapter::try_new(stream)
|
||||
.context(error::ConvertDfRecordBatchStreamSnafu)?;
|
||||
Ok(Box::pin(stream))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct DfPhysicalPlanAdapter(pub PhysicalPlanRef);
|
||||
|
||||
#[async_trait]
|
||||
impl DfPhysicalPlan for DfPhysicalPlanAdapter {
|
||||
fn as_any(&self) -> &dyn Any {
|
||||
self
|
||||
}
|
||||
|
||||
fn schema(&self) -> DfSchemaRef {
|
||||
self.0.schema().arrow_schema().clone()
|
||||
}
|
||||
|
||||
fn output_partitioning(&self) -> Partitioning {
|
||||
self.0.output_partitioning()
|
||||
}
|
||||
|
||||
fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> {
|
||||
None
|
||||
}
|
||||
|
||||
fn children(&self) -> Vec<Arc<dyn DfPhysicalPlan>> {
|
||||
self.0
|
||||
.children()
|
||||
.into_iter()
|
||||
.map(|x| Arc::new(DfPhysicalPlanAdapter(x)) as _)
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn with_new_children(
|
||||
&self,
|
||||
children: Vec<Arc<dyn DfPhysicalPlan>>,
|
||||
) -> DfResult<Arc<dyn DfPhysicalPlan>> {
|
||||
let df_schema = self.schema();
|
||||
let schema: SchemaRef = Arc::new(
|
||||
df_schema
|
||||
.try_into()
|
||||
.context(error::ConvertArrowSchemaSnafu)
|
||||
.map_err(error::Error::from)?,
|
||||
);
|
||||
let children = children
|
||||
.into_iter()
|
||||
.map(|x| Arc::new(PhysicalPlanAdapter::new(schema.clone(), x)) as _)
|
||||
.collect();
|
||||
let plan = self.0.with_new_children(children)?;
|
||||
Ok(Arc::new(DfPhysicalPlanAdapter(plan)))
|
||||
}
|
||||
|
||||
async fn execute(
|
||||
&self,
|
||||
partition: usize,
|
||||
runtime: Arc<RuntimeEnv>,
|
||||
) -> DfResult<DfSendableRecordBatchStream> {
|
||||
let stream = self.0.execute(partition, runtime).await?;
|
||||
Ok(Box::pin(DfRecordBatchStreamAdapter::new(stream)))
|
||||
}
|
||||
|
||||
fn statistics(&self) -> Statistics {
|
||||
// TODO(LFC): impl statistics
|
||||
Statistics::default()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
|
||||
use common_recordbatch::{RecordBatch, RecordBatches};
|
||||
use datafusion::arrow_print;
|
||||
use datafusion::datasource::TableProvider as DfTableProvider;
|
||||
use datafusion::logical_plan::LogicalPlanBuilder;
|
||||
use datafusion::physical_plan::collect;
|
||||
use datafusion::physical_plan::empty::EmptyExec;
|
||||
use datafusion::prelude::ExecutionContext;
|
||||
use datafusion_common::field_util::SchemaExt;
|
||||
use datafusion_expr::Expr;
|
||||
use datatypes::schema::Schema;
|
||||
use datatypes::vectors::Int32Vector;
|
||||
|
||||
use super::*;
|
||||
|
||||
struct MyDfTableProvider;
|
||||
|
||||
#[async_trait]
|
||||
impl DfTableProvider for MyDfTableProvider {
|
||||
fn as_any(&self) -> &dyn Any {
|
||||
self
|
||||
}
|
||||
|
||||
fn schema(&self) -> DfSchemaRef {
|
||||
Arc::new(ArrowSchema::new(vec![Field::new(
|
||||
"a",
|
||||
DataType::Int32,
|
||||
false,
|
||||
)]))
|
||||
}
|
||||
|
||||
async fn scan(
|
||||
&self,
|
||||
_projection: &Option<Vec<usize>>,
|
||||
_filters: &[Expr],
|
||||
_limit: Option<usize>,
|
||||
) -> DfResult<Arc<dyn DfPhysicalPlan>> {
|
||||
let schema = Schema::try_from(self.schema()).unwrap();
|
||||
let my_plan = Arc::new(MyExecutionPlan {
|
||||
schema: Arc::new(schema),
|
||||
});
|
||||
let df_plan = DfPhysicalPlanAdapter(my_plan);
|
||||
Ok(Arc::new(df_plan))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct MyExecutionPlan {
|
||||
schema: SchemaRef,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl PhysicalPlan for MyExecutionPlan {
|
||||
fn as_any(&self) -> &dyn Any {
|
||||
self
|
||||
}
|
||||
|
||||
fn schema(&self) -> SchemaRef {
|
||||
self.schema.clone()
|
||||
}
|
||||
|
||||
fn output_partitioning(&self) -> Partitioning {
|
||||
Partitioning::UnknownPartitioning(1)
|
||||
}
|
||||
|
||||
fn children(&self) -> Vec<PhysicalPlanRef> {
|
||||
vec![]
|
||||
}
|
||||
|
||||
fn with_new_children(&self, _children: Vec<PhysicalPlanRef>) -> Result<PhysicalPlanRef> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
async fn execute(
|
||||
&self,
|
||||
_partition: usize,
|
||||
_runtime: Arc<RuntimeEnv>,
|
||||
) -> Result<SendableRecordBatchStream> {
|
||||
let schema = self.schema();
|
||||
let recordbatches = RecordBatches::try_new(
|
||||
schema.clone(),
|
||||
vec![
|
||||
RecordBatch::new(
|
||||
schema.clone(),
|
||||
vec![Arc::new(Int32Vector::from_slice(vec![1])) as _],
|
||||
)
|
||||
.unwrap(),
|
||||
RecordBatch::new(
|
||||
schema,
|
||||
vec![Arc::new(Int32Vector::from_slice(vec![2, 3])) as _],
|
||||
)
|
||||
.unwrap(),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
Ok(recordbatches.as_stream())
|
||||
}
|
||||
}
|
||||
|
||||
// Test our physical plan can be executed by DataFusion, through adapters.
|
||||
#[tokio::test]
|
||||
async fn test_execute_physical_plan() {
|
||||
let ctx = ExecutionContext::new();
|
||||
let logical_plan = LogicalPlanBuilder::scan("test", Arc::new(MyDfTableProvider), None)
|
||||
.unwrap()
|
||||
.build()
|
||||
.unwrap();
|
||||
let physical_plan = ctx.create_physical_plan(&logical_plan).await.unwrap();
|
||||
let df_recordbatches = collect(physical_plan, Arc::new(RuntimeEnv::default()))
|
||||
.await
|
||||
.unwrap();
|
||||
let pretty_print = arrow_print::write(&df_recordbatches);
|
||||
let pretty_print = pretty_print.lines().collect::<Vec<&str>>();
|
||||
assert_eq!(
|
||||
pretty_print,
|
||||
vec!["+---+", "| a |", "+---+", "| 1 |", "| 2 |", "| 3 |", "+---+",]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_physical_plan_adapter() {
|
||||
let df_schema = Arc::new(ArrowSchema::new(vec![Field::new(
|
||||
"name",
|
||||
DataType::Utf8,
|
||||
true,
|
||||
)]));
|
||||
|
||||
let plan = PhysicalPlanAdapter::new(
|
||||
Arc::new(Schema::try_from(df_schema.clone()).unwrap()),
|
||||
Arc::new(EmptyExec::new(true, df_schema.clone())),
|
||||
);
|
||||
assert!(plan.df_plan.as_any().downcast_ref::<EmptyExec>().is_some());
|
||||
|
||||
let df_plan = DfPhysicalPlanAdapter(Arc::new(plan));
|
||||
assert_eq!(df_schema, df_plan.schema());
|
||||
}
|
||||
}
|
||||
@@ -5,6 +5,7 @@ edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
common-error = { path = "../error" }
|
||||
datafusion = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2", features = ["simd"] }
|
||||
datafusion-common = { git = "https://github.com/apache/arrow-datafusion.git", branch = "arrow2" }
|
||||
datatypes = { path = "../../datatypes" }
|
||||
futures = "0.3"
|
||||
|
||||
92
src/common/recordbatch/src/adapter.rs
Normal file
92
src/common/recordbatch/src/adapter.rs
Normal file
@@ -0,0 +1,92 @@
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use std::task::{Context, Poll};
|
||||
|
||||
use datafusion::arrow::datatypes::SchemaRef as DfSchemaRef;
|
||||
use datafusion::physical_plan::RecordBatchStream as DfRecordBatchStream;
|
||||
use datafusion_common::record_batch::RecordBatch as DfRecordBatch;
|
||||
use datatypes::arrow::error::ArrowError;
|
||||
use datatypes::arrow::error::Result as ArrowResult;
|
||||
use datatypes::schema::{Schema, SchemaRef};
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::error::{self, Result};
|
||||
use crate::DfSendableRecordBatchStream;
|
||||
use crate::{RecordBatch, RecordBatchStream, SendableRecordBatchStream, Stream};
|
||||
|
||||
/// Greptime SendableRecordBatchStream -> DataFusion RecordBatchStream
|
||||
pub struct DfRecordBatchStreamAdapter {
|
||||
stream: SendableRecordBatchStream,
|
||||
}
|
||||
|
||||
impl DfRecordBatchStreamAdapter {
|
||||
pub fn new(stream: SendableRecordBatchStream) -> Self {
|
||||
Self { stream }
|
||||
}
|
||||
}
|
||||
|
||||
impl DfRecordBatchStream for DfRecordBatchStreamAdapter {
|
||||
fn schema(&self) -> DfSchemaRef {
|
||||
self.stream.schema().arrow_schema().clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl Stream for DfRecordBatchStreamAdapter {
|
||||
type Item = ArrowResult<DfRecordBatch>;
|
||||
|
||||
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
match Pin::new(&mut self.stream).poll_next(cx) {
|
||||
Poll::Pending => Poll::Pending,
|
||||
Poll::Ready(Some(recordbatch)) => match recordbatch {
|
||||
Ok(recordbatch) => Poll::Ready(Some(Ok(recordbatch.df_recordbatch))),
|
||||
Err(e) => Poll::Ready(Some(Err(ArrowError::External("".to_owned(), Box::new(e))))),
|
||||
},
|
||||
Poll::Ready(None) => Poll::Ready(None),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
self.stream.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
/// DataFusion SendableRecordBatchStream -> Greptime RecordBatchStream
|
||||
pub struct RecordBatchStreamAdapter {
|
||||
schema: SchemaRef,
|
||||
stream: DfSendableRecordBatchStream,
|
||||
}
|
||||
|
||||
impl RecordBatchStreamAdapter {
|
||||
pub fn try_new(stream: DfSendableRecordBatchStream) -> Result<Self> {
|
||||
let schema =
|
||||
Arc::new(Schema::try_from(stream.schema()).context(error::SchemaConversionSnafu)?);
|
||||
Ok(Self { schema, stream })
|
||||
}
|
||||
}
|
||||
|
||||
impl RecordBatchStream for RecordBatchStreamAdapter {
|
||||
fn schema(&self) -> SchemaRef {
|
||||
self.schema.clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl Stream for RecordBatchStreamAdapter {
|
||||
type Item = Result<RecordBatch>;
|
||||
|
||||
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
match Pin::new(&mut self.stream).poll_next(cx) {
|
||||
Poll::Pending => Poll::Pending,
|
||||
Poll::Ready(Some(df_recordbatch)) => Poll::Ready(Some(Ok(RecordBatch {
|
||||
schema: self.schema(),
|
||||
df_recordbatch: df_recordbatch.context(error::PollStreamSnafu)?,
|
||||
}))),
|
||||
Poll::Ready(None) => Poll::Ready(None),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
self.stream.size_hint()
|
||||
}
|
||||
}
|
||||
@@ -33,16 +33,32 @@ pub enum InnerError {
|
||||
reason: String,
|
||||
backtrace: Backtrace,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to convert Arrow schema, source: {}", source))]
|
||||
SchemaConversion {
|
||||
source: datatypes::error::Error,
|
||||
backtrace: Backtrace,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to poll stream, source: {}", source))]
|
||||
PollStream {
|
||||
source: datatypes::arrow::error::ArrowError,
|
||||
backtrace: Backtrace,
|
||||
},
|
||||
}
|
||||
|
||||
impl ErrorExt for InnerError {
|
||||
fn status_code(&self) -> StatusCode {
|
||||
match self {
|
||||
InnerError::NewDfRecordBatch { .. } => StatusCode::InvalidArguments,
|
||||
InnerError::DataTypes { .. } | InnerError::CreateRecordBatches { .. } => {
|
||||
StatusCode::Internal
|
||||
}
|
||||
|
||||
InnerError::DataTypes { .. }
|
||||
| InnerError::CreateRecordBatches { .. }
|
||||
| InnerError::PollStream { .. } => StatusCode::Internal,
|
||||
|
||||
InnerError::External { source } => source.status_code(),
|
||||
|
||||
InnerError::SchemaConversion { source, .. } => source.status_code(),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
pub mod adapter;
|
||||
pub mod error;
|
||||
mod recordbatch;
|
||||
pub mod util;
|
||||
|
||||
use std::pin::Pin;
|
||||
|
||||
pub use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream;
|
||||
use datatypes::schema::SchemaRef;
|
||||
use error::Result;
|
||||
use futures::task::{Context, Poll};
|
||||
@@ -74,6 +76,41 @@ impl RecordBatches {
|
||||
pub fn take(self) -> Vec<RecordBatch> {
|
||||
self.batches
|
||||
}
|
||||
|
||||
pub fn as_stream(&self) -> SendableRecordBatchStream {
|
||||
Box::pin(SimpleRecordBatchStream {
|
||||
inner: RecordBatches {
|
||||
schema: self.schema(),
|
||||
batches: self.batches.clone(),
|
||||
},
|
||||
index: 0,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub struct SimpleRecordBatchStream {
|
||||
inner: RecordBatches,
|
||||
index: usize,
|
||||
}
|
||||
|
||||
impl RecordBatchStream for SimpleRecordBatchStream {
|
||||
fn schema(&self) -> SchemaRef {
|
||||
self.inner.schema()
|
||||
}
|
||||
}
|
||||
|
||||
impl Stream for SimpleRecordBatchStream {
|
||||
type Item = Result<RecordBatch>;
|
||||
|
||||
fn poll_next(mut self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
Poll::Ready(if self.index < self.inner.batches.len() {
|
||||
let batch = self.inner.batches[self.index].clone();
|
||||
self.index += 1;
|
||||
Some(Ok(batch))
|
||||
} else {
|
||||
None
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -116,4 +153,27 @@ mod tests {
|
||||
assert_eq!(schema1, batches.schema());
|
||||
assert_eq!(vec![batch1], batches.take());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_simple_recordbatch_stream() {
|
||||
let column_a = ColumnSchema::new("a", ConcreteDataType::int32_datatype(), false);
|
||||
let column_b = ColumnSchema::new("b", ConcreteDataType::string_datatype(), false);
|
||||
let schema = Arc::new(Schema::new(vec![column_a, column_b]));
|
||||
|
||||
let va1: VectorRef = Arc::new(Int32Vector::from_slice(&[1, 2]));
|
||||
let vb1: VectorRef = Arc::new(StringVector::from(vec!["a", "b"]));
|
||||
let batch1 = RecordBatch::new(schema.clone(), vec![va1, vb1]).unwrap();
|
||||
|
||||
let va2: VectorRef = Arc::new(Int32Vector::from_slice(&[3, 4, 5]));
|
||||
let vb2: VectorRef = Arc::new(StringVector::from(vec!["c", "d", "e"]));
|
||||
let batch2 = RecordBatch::new(schema.clone(), vec![va2, vb2]).unwrap();
|
||||
|
||||
let recordbatches =
|
||||
RecordBatches::try_new(schema.clone(), vec![batch1.clone(), batch2.clone()]).unwrap();
|
||||
let stream = recordbatches.as_stream();
|
||||
let collected = util::collect(stream).await.unwrap();
|
||||
assert_eq!(collected.len(), 2);
|
||||
assert_eq!(collected[0], batch1);
|
||||
assert_eq!(collected[1], batch2);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user