refactor: make table scan return physical plan (#326)

* refactor: return PhysicalPlan in Table trait's scan method, to support partitioned execution in Frontend's distribute read * refactor: pub use necessary DataFusion types * refactor: replace old "PhysicalPlan" and its adapters Co-authored-by: luofucong <luofucong@greptime.com> Co-authored-by: Yingwen <realevenyag@gmail.com>
2026-07-07 14:30:39 +00:00 · 2022-10-25 11:34:53 +08:00
parent 64dac51e83
commit 2ca667cbdf
39 changed files with 920 additions and 600 deletions
--- a/src/common/query/Cargo.toml
+++ b/src/common/query/Cargo.toml
@@ -4,6 +4,7 @@ version = "0.1.0"
 edition = "2021"

 [dependencies]
+async-trait = "0.1"
 common-error = { path = "../error" }
 common-recordbatch = { path = "../recordbatch" }
 common-time = { path = "../time" }
--- a/src/common/query/src/error.rs
+++ b/src/common/query/src/error.rs
@@ -62,6 +62,36 @@ pub enum InnerError {

    #[snafu(display("unexpected: not constant column"))]
    InvalidInputCol { backtrace: Backtrace },
+
+    #[snafu(display("Not expected to run ExecutionPlan more than once"))]
+    ExecuteRepeatedly { backtrace: Backtrace },
+
+    #[snafu(display("General DataFusion error, source: {}", source))]
+    GeneralDataFusion {
+        source: DataFusionError,
+        backtrace: Backtrace,
+    },
+
+    #[snafu(display("Failed to execute DataFusion ExecutionPlan, source: {}", source))]
+    DataFusionExecutionPlan {
+        source: DataFusionError,
+        backtrace: Backtrace,
+    },
+
+    #[snafu(display(
+        "Failed to convert DataFusion's recordbatch stream, source: {}",
+        source
+    ))]
+    ConvertDfRecordBatchStream {
+        #[snafu(backtrace)]
+        source: common_recordbatch::error::Error,
+    },
+
+    #[snafu(display("Failed to convert arrow schema, source: {}", source))]
+    ConvertArrowSchema {
+        #[snafu(backtrace)]
+        source: DataTypeError,
+    },
 }

 pub type Result<T> = std::result::Result<T, Error>;
@@ -76,9 +106,17 @@ impl ErrorExt for InnerError {
            | InnerError::InvalidInputState { .. }
            | InnerError::InvalidInputCol { .. }
            | InnerError::BadAccumulatorImpl { .. } => StatusCode::EngineExecuteQuery,
-            InnerError::InvalidInputs { source, .. } => source.status_code(),
-            InnerError::IntoVector { source, .. } => source.status_code(),
-            InnerError::FromScalarValue { source } => source.status_code(),
+
+            InnerError::InvalidInputs { source, .. }
+            | InnerError::IntoVector { source, .. }
+            | InnerError::FromScalarValue { source }
+            | InnerError::ConvertArrowSchema { source } => source.status_code(),
+
+            InnerError::ExecuteRepeatedly { .. }
+            | InnerError::GeneralDataFusion { .. }
+            | InnerError::DataFusionExecutionPlan { .. } => StatusCode::Unexpected,
+
+            InnerError::ConvertDfRecordBatchStream { source, .. } => source.status_code(),
        }
    }

@@ -105,6 +143,7 @@ impl From<Error> for DataFusionError {

 #[cfg(test)]
 mod tests {
+    use arrow::error::ArrowError;
    use snafu::GenerateImplicitData;

    use super::*;
@@ -127,6 +166,48 @@ mod tests {
            .unwrap()
            .into();
        assert_error(&err, StatusCode::EngineExecuteQuery);
+
+        let err: Error = throw_df_error()
+            .context(GeneralDataFusionSnafu)
+            .err()
+            .unwrap()
+            .into();
+        assert_error(&err, StatusCode::Unexpected);
+
+        let err: Error = throw_df_error()
+            .context(DataFusionExecutionPlanSnafu)
+            .err()
+            .unwrap()
+            .into();
+        assert_error(&err, StatusCode::Unexpected);
+    }
+
+    #[test]
+    fn test_execute_repeatedly_error() {
+        let error: Error = None::<i32>
+            .context(ExecuteRepeatedlySnafu)
+            .err()
+            .unwrap()
+            .into();
+        assert_eq!(error.inner.status_code(), StatusCode::Unexpected);
+        assert!(error.backtrace_opt().is_some());
+    }
+
+    #[test]
+    fn test_convert_df_recordbatch_stream_error() {
+        let result: std::result::Result<i32, common_recordbatch::error::Error> =
+            Err(common_recordbatch::error::InnerError::PollStream {
+                source: ArrowError::Overflow,
+                backtrace: Backtrace::generate(),
+            }
+            .into());
+        let error: Error = result
+            .context(ConvertDfRecordBatchStreamSnafu)
+            .err()
+            .unwrap()
+            .into();
+        assert_eq!(error.inner.status_code(), StatusCode::Internal);
+        assert!(error.backtrace_opt().is_some());
    }

    fn raise_datatype_error() -> std::result::Result<(), DataTypeError> {
--- a/src/common/query/src/lib.rs
+++ b/src/common/query/src/lib.rs
@@ -4,6 +4,7 @@ pub mod columnar_value;
 pub mod error;
 mod function;
 pub mod logical_plan;
+pub mod physical_plan;
 pub mod prelude;
 mod signature;

@@ -13,3 +14,5 @@ pub enum Output {
    RecordBatches(RecordBatches),
    Stream(SendableRecordBatchStream),
 }
+
+pub use datafusion::physical_plan::ExecutionPlan as DfPhysicalPlan;
--- a/src/common/query/src/logical_plan/expr.rs
+++ b/src/common/query/src/logical_plan/expr.rs
@@ -2,7 +2,7 @@ use datafusion::logical_plan::Expr as DfExpr;

 /// Central struct of query API.
 /// Represent logical expressions such as `A + 1`, or `CAST(c1 AS int)`.
-#[derive(Clone, PartialEq, Hash)]
+#[derive(Clone, PartialEq, Hash, Debug)]
 pub struct Expr {
    df_expr: DfExpr,
 }
--- a/src/common/query/src/physical_plan.rs
+++ b/src/common/query/src/physical_plan.rs
@@ -0,0 +1,325 @@
+use std::any::Any;
+use std::fmt::Debug;
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use common_recordbatch::adapter::{DfRecordBatchStreamAdapter, RecordBatchStreamAdapter};
+use common_recordbatch::DfSendableRecordBatchStream;
+use common_recordbatch::SendableRecordBatchStream;
+use datafusion::arrow::datatypes::SchemaRef as DfSchemaRef;
+use datafusion::error::Result as DfResult;
+pub use datafusion::execution::runtime_env::RuntimeEnv;
+use datafusion::physical_plan::expressions::PhysicalSortExpr;
+pub use datafusion::physical_plan::Partitioning;
+use datafusion::physical_plan::Statistics;
+use datatypes::schema::SchemaRef;
+use snafu::ResultExt;
+
+use crate::error::{self, Result};
+use crate::DfPhysicalPlan;
+
+pub type PhysicalPlanRef = Arc<dyn PhysicalPlan>;
+
+/// `PhysicalPlan` represent nodes in the Physical Plan.
+///
+/// Each `PhysicalPlan` is Partition-aware and is responsible for
+/// creating the actual `async` [`SendableRecordBatchStream`]s
+/// of [`RecordBatch`] that incrementally compute the operator's
+/// output from its input partition.
+#[async_trait]
+pub trait PhysicalPlan: Debug + Send + Sync {
+    /// Returns the physical plan as [`Any`](std::any::Any) so that it can be
+    /// downcast to a specific implementation.
+    fn as_any(&self) -> &dyn Any;
+
+    /// Get the schema for this physical plan
+    fn schema(&self) -> SchemaRef;
+
+    /// Specifies the output partitioning scheme of this plan
+    fn output_partitioning(&self) -> Partitioning;
+
+    /// Get a list of child physical plans that provide the input for this plan. The returned list
+    /// will be empty for leaf nodes, will contain a single value for unary nodes, or two
+    /// values for binary nodes (such as joins).
+    fn children(&self) -> Vec<PhysicalPlanRef>;
+
+    /// Returns a new plan where all children were replaced by new plans.
+    /// The size of `children` must be equal to the size of `PhysicalPlan::children()`.
+    fn with_new_children(&self, children: Vec<PhysicalPlanRef>) -> Result<PhysicalPlanRef>;
+
+    /// Creates an RecordBatch stream.
+    async fn execute(
+        &self,
+        partition: usize,
+        runtime: Arc<RuntimeEnv>,
+    ) -> Result<SendableRecordBatchStream>;
+}
+
+#[derive(Debug)]
+pub struct PhysicalPlanAdapter {
+    schema: SchemaRef,
+    df_plan: Arc<dyn DfPhysicalPlan>,
+}
+
+impl PhysicalPlanAdapter {
+    pub fn new(schema: SchemaRef, df_plan: Arc<dyn DfPhysicalPlan>) -> Self {
+        Self { schema, df_plan }
+    }
+
+    pub fn df_plan(&self) -> Arc<dyn DfPhysicalPlan> {
+        self.df_plan.clone()
+    }
+}
+
+#[async_trait]
+impl PhysicalPlan for PhysicalPlanAdapter {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema(&self) -> SchemaRef {
+        self.schema.clone()
+    }
+
+    fn output_partitioning(&self) -> Partitioning {
+        self.df_plan.output_partitioning()
+    }
+
+    fn children(&self) -> Vec<PhysicalPlanRef> {
+        self.df_plan
+            .children()
+            .into_iter()
+            .map(|x| Arc::new(PhysicalPlanAdapter::new(self.schema(), x)) as _)
+            .collect()
+    }
+
+    fn with_new_children(&self, children: Vec<PhysicalPlanRef>) -> Result<PhysicalPlanRef> {
+        let children = children
+            .into_iter()
+            .map(|x| Arc::new(DfPhysicalPlanAdapter(x)) as _)
+            .collect();
+        let plan = self
+            .df_plan
+            .with_new_children(children)
+            .context(error::GeneralDataFusionSnafu)?;
+        Ok(Arc::new(PhysicalPlanAdapter::new(self.schema(), plan)))
+    }
+
+    async fn execute(
+        &self,
+        partition: usize,
+        runtime: Arc<RuntimeEnv>,
+    ) -> Result<SendableRecordBatchStream> {
+        let stream = self
+            .df_plan
+            .execute(partition, runtime)
+            .await
+            .context(error::DataFusionExecutionPlanSnafu)?;
+        let stream = RecordBatchStreamAdapter::try_new(stream)
+            .context(error::ConvertDfRecordBatchStreamSnafu)?;
+        Ok(Box::pin(stream))
+    }
+}
+
+#[derive(Debug)]
+pub struct DfPhysicalPlanAdapter(pub PhysicalPlanRef);
+
+#[async_trait]
+impl DfPhysicalPlan for DfPhysicalPlanAdapter {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema(&self) -> DfSchemaRef {
+        self.0.schema().arrow_schema().clone()
+    }
+
+    fn output_partitioning(&self) -> Partitioning {
+        self.0.output_partitioning()
+    }
+
+    fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> {
+        None
+    }
+
+    fn children(&self) -> Vec<Arc<dyn DfPhysicalPlan>> {
+        self.0
+            .children()
+            .into_iter()
+            .map(|x| Arc::new(DfPhysicalPlanAdapter(x)) as _)
+            .collect()
+    }
+
+    fn with_new_children(
+        &self,
+        children: Vec<Arc<dyn DfPhysicalPlan>>,
+    ) -> DfResult<Arc<dyn DfPhysicalPlan>> {
+        let df_schema = self.schema();
+        let schema: SchemaRef = Arc::new(
+            df_schema
+                .try_into()
+                .context(error::ConvertArrowSchemaSnafu)
+                .map_err(error::Error::from)?,
+        );
+        let children = children
+            .into_iter()
+            .map(|x| Arc::new(PhysicalPlanAdapter::new(schema.clone(), x)) as _)
+            .collect();
+        let plan = self.0.with_new_children(children)?;
+        Ok(Arc::new(DfPhysicalPlanAdapter(plan)))
+    }
+
+    async fn execute(
+        &self,
+        partition: usize,
+        runtime: Arc<RuntimeEnv>,
+    ) -> DfResult<DfSendableRecordBatchStream> {
+        let stream = self.0.execute(partition, runtime).await?;
+        Ok(Box::pin(DfRecordBatchStreamAdapter::new(stream)))
+    }
+
+    fn statistics(&self) -> Statistics {
+        // TODO(LFC): impl statistics
+        Statistics::default()
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
+    use common_recordbatch::{RecordBatch, RecordBatches};
+    use datafusion::arrow_print;
+    use datafusion::datasource::TableProvider as DfTableProvider;
+    use datafusion::logical_plan::LogicalPlanBuilder;
+    use datafusion::physical_plan::collect;
+    use datafusion::physical_plan::empty::EmptyExec;
+    use datafusion::prelude::ExecutionContext;
+    use datafusion_common::field_util::SchemaExt;
+    use datafusion_expr::Expr;
+    use datatypes::schema::Schema;
+    use datatypes::vectors::Int32Vector;
+
+    use super::*;
+
+    struct MyDfTableProvider;
+
+    #[async_trait]
+    impl DfTableProvider for MyDfTableProvider {
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+
+        fn schema(&self) -> DfSchemaRef {
+            Arc::new(ArrowSchema::new(vec![Field::new(
+                "a",
+                DataType::Int32,
+                false,
+            )]))
+        }
+
+        async fn scan(
+            &self,
+            _projection: &Option<Vec<usize>>,
+            _filters: &[Expr],
+            _limit: Option<usize>,
+        ) -> DfResult<Arc<dyn DfPhysicalPlan>> {
+            let schema = Schema::try_from(self.schema()).unwrap();
+            let my_plan = Arc::new(MyExecutionPlan {
+                schema: Arc::new(schema),
+            });
+            let df_plan = DfPhysicalPlanAdapter(my_plan);
+            Ok(Arc::new(df_plan))
+        }
+    }
+
+    #[derive(Debug)]
+    struct MyExecutionPlan {
+        schema: SchemaRef,
+    }
+
+    #[async_trait]
+    impl PhysicalPlan for MyExecutionPlan {
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+
+        fn schema(&self) -> SchemaRef {
+            self.schema.clone()
+        }
+
+        fn output_partitioning(&self) -> Partitioning {
+            Partitioning::UnknownPartitioning(1)
+        }
+
+        fn children(&self) -> Vec<PhysicalPlanRef> {
+            vec![]
+        }
+
+        fn with_new_children(&self, _children: Vec<PhysicalPlanRef>) -> Result<PhysicalPlanRef> {
+            unimplemented!()
+        }
+
+        async fn execute(
+            &self,
+            _partition: usize,
+            _runtime: Arc<RuntimeEnv>,
+        ) -> Result<SendableRecordBatchStream> {
+            let schema = self.schema();
+            let recordbatches = RecordBatches::try_new(
+                schema.clone(),
+                vec![
+                    RecordBatch::new(
+                        schema.clone(),
+                        vec![Arc::new(Int32Vector::from_slice(vec![1])) as _],
+                    )
+                    .unwrap(),
+                    RecordBatch::new(
+                        schema,
+                        vec![Arc::new(Int32Vector::from_slice(vec![2, 3])) as _],
+                    )
+                    .unwrap(),
+                ],
+            )
+            .unwrap();
+            Ok(recordbatches.as_stream())
+        }
+    }
+
+    // Test our physical plan can be executed by DataFusion, through adapters.
+    #[tokio::test]
+    async fn test_execute_physical_plan() {
+        let ctx = ExecutionContext::new();
+        let logical_plan = LogicalPlanBuilder::scan("test", Arc::new(MyDfTableProvider), None)
+            .unwrap()
+            .build()
+            .unwrap();
+        let physical_plan = ctx.create_physical_plan(&logical_plan).await.unwrap();
+        let df_recordbatches = collect(physical_plan, Arc::new(RuntimeEnv::default()))
+            .await
+            .unwrap();
+        let pretty_print = arrow_print::write(&df_recordbatches);
+        let pretty_print = pretty_print.lines().collect::<Vec<&str>>();
+        assert_eq!(
+            pretty_print,
+            vec!["+---+", "| a |", "+---+", "| 1 |", "| 2 |", "| 3 |", "+---+",]
+        );
+    }
+
+    #[test]
+    fn test_physical_plan_adapter() {
+        let df_schema = Arc::new(ArrowSchema::new(vec![Field::new(
+            "name",
+            DataType::Utf8,
+            true,
+        )]));
+
+        let plan = PhysicalPlanAdapter::new(
+            Arc::new(Schema::try_from(df_schema.clone()).unwrap()),
+            Arc::new(EmptyExec::new(true, df_schema.clone())),
+        );
+        assert!(plan.df_plan.as_any().downcast_ref::<EmptyExec>().is_some());
+
+        let df_plan = DfPhysicalPlanAdapter(Arc::new(plan));
+        assert_eq!(df_schema, df_plan.schema());
+    }
+}