refactor: Remove PhysicalPlan trait and use ExecutionPlan directly (#3894)

* refactor: remove PhysicalPlan

* refactor: remove physical_plan mod

* refactor: import

* fix merge error

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: Ruihang Xia <waynestxia@gmail.com>
This commit is contained in:
Yingwen
2024-05-11 15:38:03 +08:00
committed by GitHub
parent fa6c371380
commit d0820bb26d
28 changed files with 174 additions and 567 deletions

View File

@@ -21,7 +21,6 @@ use std::sync::Arc;
use arrow::array::{StringBuilder, UInt32Builder};
use arrow_schema::{DataType, Field, Schema, SchemaRef};
use common_query::{DfPhysicalPlan, DfPhysicalPlanRef};
use common_recordbatch::adapter::{MetricCollector, RecordBatchMetrics};
use common_recordbatch::{DfRecordBatch, DfSendableRecordBatchStream};
use datafusion::error::Result as DfResult;
@@ -29,7 +28,7 @@ use datafusion::execution::TaskContext;
use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;
use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
use datafusion::physical_plan::{
accept, DisplayAs, DisplayFormatType, ExecutionPlanProperties, PlanProperties,
accept, DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties,
};
use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion};
use datafusion_common::{internal_err, DataFusionError};
@@ -44,14 +43,14 @@ const PLAN: &str = "plan";
#[derive(Debug)]
pub struct DistAnalyzeExec {
input: DfPhysicalPlanRef,
input: Arc<dyn ExecutionPlan>,
schema: SchemaRef,
properties: PlanProperties,
}
impl DistAnalyzeExec {
/// Create a new DistAnalyzeExec
pub fn new(input: DfPhysicalPlanRef) -> Self {
pub fn new(input: Arc<dyn ExecutionPlan>) -> Self {
let schema = SchemaRef::new(Schema::new(vec![
Field::new(STAGE, DataType::UInt32, true),
Field::new(NODE, DataType::UInt32, true),
@@ -66,7 +65,7 @@ impl DistAnalyzeExec {
}
/// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
fn compute_properties(input: &DfPhysicalPlanRef, schema: SchemaRef) -> PlanProperties {
fn compute_properties(input: &Arc<dyn ExecutionPlan>, schema: SchemaRef) -> PlanProperties {
let eq_properties = EquivalenceProperties::new(schema);
let output_partitioning = Partitioning::UnknownPartitioning(1);
let exec_mode = input.execution_mode();
@@ -84,7 +83,7 @@ impl DisplayAs for DistAnalyzeExec {
}
}
impl DfPhysicalPlan for DistAnalyzeExec {
impl ExecutionPlan for DistAnalyzeExec {
fn name(&self) -> &'static str {
"DistAnalyzeExec"
}
@@ -98,7 +97,7 @@ impl DfPhysicalPlan for DistAnalyzeExec {
&self.properties
}
fn children(&self) -> Vec<DfPhysicalPlanRef> {
fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> {
vec![self.input.clone()]
}
@@ -109,8 +108,8 @@ impl DfPhysicalPlan for DistAnalyzeExec {
fn with_new_children(
self: Arc<Self>,
mut children: Vec<DfPhysicalPlanRef>,
) -> DfResult<DfPhysicalPlanRef> {
mut children: Vec<Arc<dyn ExecutionPlan>>,
) -> DfResult<Arc<dyn ExecutionPlan>> {
Ok(Arc::new(Self::new(children.pop().unwrap())))
}
@@ -196,7 +195,7 @@ impl AnalyzeOutputBuilder {
/// Creates the output of AnalyzeExec as a RecordBatch
fn create_output_batch(
total_rows: usize,
input: DfPhysicalPlanRef,
input: Arc<dyn ExecutionPlan>,
schema: SchemaRef,
) -> DfResult<DfRecordBatch> {
let mut builder = AnalyzeOutputBuilder::new(schema);

View File

@@ -26,7 +26,6 @@ use common_base::Plugins;
use common_error::ext::BoxedError;
use common_function::function::FunctionRef;
use common_function::scalars::aggregate::AggregateFunctionMetaRef;
use common_query::physical_plan::{DfPhysicalPlanAdapter, PhysicalPlan, PhysicalPlanAdapter};
use common_query::prelude::ScalarUdf;
use common_query::{Output, OutputData, OutputMeta};
use common_recordbatch::adapter::RecordBatchStreamAdapter;
@@ -38,6 +37,7 @@ use datafusion::physical_plan::ExecutionPlan;
use datafusion_common::ResolvedTableReference;
use datafusion_expr::{DmlStatement, LogicalPlan as DfLogicalPlan, WriteOp};
use datatypes::prelude::VectorRef;
use datatypes::schema::Schema;
use futures_util::StreamExt;
use session::context::QueryContextRef;
use snafu::{ensure, OptionExt, ResultExt};
@@ -352,7 +352,7 @@ impl PhysicalPlanner for DatafusionQueryEngine {
&self,
ctx: &mut QueryEngineContext,
logical_plan: &LogicalPlan,
) -> Result<Arc<dyn PhysicalPlan>> {
) -> Result<Arc<dyn ExecutionPlan>> {
let _timer = metrics::CREATE_PHYSICAL_ELAPSED.start_timer();
match logical_plan {
LogicalPlan::DfPlan(df_plan) => {
@@ -364,17 +364,7 @@ impl PhysicalPlanner for DatafusionQueryEngine {
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?;
Ok(Arc::new(PhysicalPlanAdapter::new(
Arc::new(
physical_plan
.schema()
.try_into()
.context(error::ConvertSchemaSnafu)
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?,
),
physical_plan,
)))
Ok(physical_plan)
}
}
}
@@ -385,44 +375,33 @@ impl PhysicalOptimizer for DatafusionQueryEngine {
fn optimize_physical_plan(
&self,
ctx: &mut QueryEngineContext,
plan: Arc<dyn PhysicalPlan>,
) -> Result<Arc<dyn PhysicalPlan>> {
plan: Arc<dyn ExecutionPlan>,
) -> Result<Arc<dyn ExecutionPlan>> {
let _timer = metrics::OPTIMIZE_PHYSICAL_ELAPSED.start_timer();
let state = ctx.state();
let config = state.config_options();
let df_plan = plan
.as_any()
.downcast_ref::<PhysicalPlanAdapter>()
.context(error::PhysicalPlanDowncastSnafu)
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?
.df_plan();
// Skip optimizing the AnalyzeExec node itself: optimize its input and wrap the result in DistAnalyzeExec
let optimized_plan =
if let Some(analyze_plan) = df_plan.as_any().downcast_ref::<AnalyzeExec>() {
let mut new_plan = analyze_plan.input().clone();
for optimizer in state.physical_optimizers() {
new_plan = optimizer
.optimize(new_plan, config)
.context(DataFusionSnafu)?;
}
Arc::new(DistAnalyzeExec::new(new_plan))
} else {
let mut new_plan = df_plan;
for optimizer in state.physical_optimizers() {
new_plan = optimizer
.optimize(new_plan, config)
.context(DataFusionSnafu)?;
}
new_plan
};
let optimized_plan = if let Some(analyze_plan) = plan.as_any().downcast_ref::<AnalyzeExec>()
{
let mut new_plan = analyze_plan.input().clone();
for optimizer in state.physical_optimizers() {
new_plan = optimizer
.optimize(new_plan, config)
.context(DataFusionSnafu)?;
}
Arc::new(DistAnalyzeExec::new(new_plan))
} else {
let mut new_plan = plan;
for optimizer in state.physical_optimizers() {
new_plan = optimizer
.optimize(new_plan, config)
.context(DataFusionSnafu)?;
}
new_plan
};
Ok(Arc::new(PhysicalPlanAdapter::new(
plan.schema(),
optimized_plan,
)))
Ok(optimized_plan)
}
}
@@ -431,30 +410,21 @@ impl QueryExecutor for DatafusionQueryEngine {
fn execute_stream(
&self,
ctx: &QueryEngineContext,
plan: &Arc<dyn PhysicalPlan>,
plan: &Arc<dyn ExecutionPlan>,
) -> Result<SendableRecordBatchStream> {
let exec_timer = metrics::EXEC_PLAN_ELAPSED.start_timer();
let task_ctx = ctx.build_task_ctx();
match plan.properties().output_partitioning().partition_count() {
0 => Ok(Box::pin(EmptyRecordBatchStream::new(plan.schema()))),
1 => {
let stream = plan
.execute(0, task_ctx)
.context(error::ExecutePhysicalPlanSnafu)
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?;
let stream = OnDone::new(stream, move || {
exec_timer.observe_duration();
});
Ok(Box::pin(stream))
0 => {
let schema = Arc::new(
Schema::try_from(plan.schema())
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?,
);
Ok(Box::pin(EmptyRecordBatchStream::new(schema)))
}
_ => {
let df_plan = Arc::new(DfPhysicalPlanAdapter(plan.clone()));
// merge into a single partition
let plan = CoalescePartitionsExec::new(df_plan.clone());
// CoalescePartitionsExec must produce a single partition
assert_eq!(1, plan.properties().output_partitioning().partition_count());
1 => {
let df_stream = plan
.execute(0, task_ctx)
.context(error::DatafusionSnafu)
@@ -464,7 +434,33 @@ impl QueryExecutor for DatafusionQueryEngine {
.context(error::ConvertDfRecordBatchStreamSnafu)
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?;
stream.set_metrics2(df_plan);
stream.set_metrics2(plan.clone());
let stream = OnDone::new(Box::pin(stream), move || {
exec_timer.observe_duration();
});
Ok(Box::pin(stream))
}
_ => {
// merge into a single partition
let merged_plan = CoalescePartitionsExec::new(plan.clone());
// CoalescePartitionsExec must produce a single partition
assert_eq!(
1,
merged_plan
.properties()
.output_partitioning()
.partition_count()
);
let df_stream = merged_plan
.execute(0, task_ctx)
.context(error::DatafusionSnafu)
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?;
let mut stream = RecordBatchStreamAdapter::try_new(df_stream)
.context(error::ConvertDfRecordBatchStreamSnafu)
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?;
stream.set_metrics2(plan.clone());
let stream = OnDone::new(Box::pin(stream), move || {
exec_timer.observe_duration();
});

View File

@@ -52,13 +52,6 @@ pub enum InnerError {
location: Location,
source: common_recordbatch::error::Error,
},
#[snafu(display("Failed to execute physical plan"))]
ExecutePhysicalPlan {
#[snafu(implicit)]
location: Location,
source: common_query::error::Error,
},
}
impl ErrorExt for InnerError {
@@ -70,7 +63,6 @@ impl ErrorExt for InnerError {
Datafusion { .. } => StatusCode::EngineExecuteQuery,
PhysicalPlanDowncast { .. } | ConvertSchema { .. } => StatusCode::Unexpected,
ConvertDfRecordBatchStream { source, .. } => source.status_code(),
ExecutePhysicalPlan { source, .. } => source.status_code(),
}
}

View File

@@ -23,13 +23,13 @@ use common_catalog::parse_catalog_and_schema_from_db_string;
use common_error::ext::BoxedError;
use common_meta::table_name::TableName;
use common_plugins::GREPTIME_EXEC_READ_COST;
use common_query::physical_plan::TaskContext;
use common_recordbatch::adapter::{DfRecordBatchStreamAdapter, RecordBatchMetrics};
use common_recordbatch::error::ExternalSnafu;
use common_recordbatch::{
DfSendableRecordBatchStream, RecordBatch, RecordBatchStreamWrapper, SendableRecordBatchStream,
};
use common_telemetry::tracing_context::TracingContext;
use datafusion::execution::TaskContext;
use datafusion::physical_plan::metrics::{
Count, ExecutionPlanMetricsSet, Gauge, MetricBuilder, MetricsSet, Time,
};

View File

@@ -18,13 +18,12 @@ use std::any::Any;
use std::sync::{Arc, Mutex};
use async_trait::async_trait;
use common_query::physical_plan::DfPhysicalPlanAdapter;
use common_query::DfPhysicalPlan;
use common_recordbatch::OrderOption;
use datafusion::catalog::schema::SchemaProvider;
use datafusion::catalog::{CatalogProvider, CatalogProviderList};
use datafusion::datasource::TableProvider;
use datafusion::execution::context::SessionState;
use datafusion::physical_plan::ExecutionPlan;
use datafusion_common::DataFusionError;
use datafusion_expr::{Expr, TableProviderFilterPushDown, TableType};
use datatypes::arrow::datatypes::SchemaRef;
@@ -157,7 +156,7 @@ impl TableProvider for DummyTableProvider {
projection: Option<&Vec<usize>>,
filters: &[Expr],
limit: Option<usize>,
) -> datafusion::error::Result<Arc<dyn DfPhysicalPlan>> {
) -> datafusion::error::Result<Arc<dyn ExecutionPlan>> {
let mut request = self.scan_request.lock().unwrap().clone();
request.projection = match projection {
Some(x) if !x.is_empty() => Some(x.clone()),
@@ -174,9 +173,7 @@ impl TableProvider for DummyTableProvider {
.handle_query(self.region_id, request)
.await
.map_err(|e| DataFusionError::External(Box::new(e)))?;
Ok(Arc::new(DfPhysicalPlanAdapter(Arc::new(
StreamScanAdapter::new(stream),
))))
Ok(Arc::new(StreamScanAdapter::new(stream)))
}
fn supports_filters_pushdown(

View File

@@ -14,8 +14,8 @@
use std::sync::Arc;
use common_query::physical_plan::PhysicalPlan;
use common_recordbatch::SendableRecordBatchStream;
use datafusion::physical_plan::ExecutionPlan;
use crate::error::Result;
use crate::query_engine::QueryEngineContext;
@@ -25,6 +25,6 @@ pub trait QueryExecutor {
fn execute_stream(
&self,
ctx: &QueryEngineContext,
plan: &Arc<dyn PhysicalPlan>,
plan: &Arc<dyn ExecutionPlan>,
) -> Result<SendableRecordBatchStream>;
}

View File

@@ -12,11 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use common_query::DfPhysicalPlanRef;
use std::sync::Arc;
use datafusion::config::ConfigOptions;
use datafusion::physical_optimizer::PhysicalOptimizerRule;
use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec;
use datafusion::physical_plan::repartition::RepartitionExec;
use datafusion::physical_plan::ExecutionPlan;
use datafusion_common::tree_node::{Transformed, TreeNode};
use datafusion_common::Result as DfResult;
@@ -30,9 +32,9 @@ pub struct RemoveDuplicate;
impl PhysicalOptimizerRule for RemoveDuplicate {
fn optimize(
&self,
plan: DfPhysicalPlanRef,
plan: Arc<dyn ExecutionPlan>,
_config: &ConfigOptions,
) -> DfResult<DfPhysicalPlanRef> {
) -> DfResult<Arc<dyn ExecutionPlan>> {
Self::do_optimize(plan)
}
@@ -46,7 +48,7 @@ impl PhysicalOptimizerRule for RemoveDuplicate {
}
impl RemoveDuplicate {
fn do_optimize(plan: DfPhysicalPlanRef) -> DfResult<DfPhysicalPlanRef> {
fn do_optimize(plan: Arc<dyn ExecutionPlan>) -> DfResult<Arc<dyn ExecutionPlan>> {
let result = plan
.transform_down_mut(&mut |plan| {
if plan.as_any().is::<CoalesceBatchesExec>()

View File

@@ -14,7 +14,7 @@
use std::sync::Arc;
use common_query::physical_plan::PhysicalPlan;
use datafusion::physical_plan::ExecutionPlan;
use crate::error::Result;
use crate::query_engine::QueryEngineContext;
@@ -23,6 +23,6 @@ pub trait PhysicalOptimizer {
fn optimize_physical_plan(
&self,
ctx: &mut QueryEngineContext,
plan: Arc<dyn PhysicalPlan>,
) -> Result<Arc<dyn PhysicalPlan>>;
plan: Arc<dyn ExecutionPlan>,
) -> Result<Arc<dyn ExecutionPlan>>;
}

View File

@@ -14,7 +14,7 @@
use std::sync::Arc;
use common_query::physical_plan::PhysicalPlan;
use datafusion::physical_plan::ExecutionPlan;
use crate::error::Result;
use crate::plan::LogicalPlan;
@@ -29,5 +29,5 @@ pub trait PhysicalPlanner {
&self,
ctx: &mut QueryEngineContext,
logical_plan: &LogicalPlan,
) -> Result<Arc<dyn PhysicalPlan>>;
) -> Result<Arc<dyn ExecutionPlan>>;
}

View File

@@ -14,13 +14,13 @@
use std::sync::Arc;
use common_query::physical_plan::PhysicalPlan;
use datafusion::physical_plan::ExecutionPlan;
use session::context::QueryContextRef;
/// Wraps a physical plan with an additional layer,
/// e.g. a metrics-retrieving layer on top of the physical plan.
pub trait PhysicalPlanWrapper: Send + Sync + 'static {
fn wrap(&self, origin: Arc<dyn PhysicalPlan>, ctx: QueryContextRef) -> Arc<dyn PhysicalPlan>;
fn wrap(&self, origin: Arc<dyn ExecutionPlan>, ctx: QueryContextRef) -> Arc<dyn ExecutionPlan>;
}
pub type PhysicalPlanWrapperRef = Arc<dyn PhysicalPlanWrapper>;

View File

@@ -23,12 +23,11 @@ use common_function::function::FunctionRef;
use common_function::handlers::{ProcedureServiceHandlerRef, TableMutationHandlerRef};
use common_function::scalars::aggregate::AggregateFunctionMetaRef;
use common_function::state::FunctionState;
use common_query::physical_plan::SessionContext;
use common_query::prelude::ScalarUdf;
use common_telemetry::warn;
use datafusion::dataframe::DataFrame;
use datafusion::error::Result as DfResult;
use datafusion::execution::context::{QueryPlanner, SessionConfig, SessionState};
use datafusion::execution::context::{QueryPlanner, SessionConfig, SessionContext, SessionState};
use datafusion::execution::runtime_env::RuntimeEnv;
use datafusion::physical_optimizer::optimizer::PhysicalOptimizer;
use datafusion::physical_plan::ExecutionPlan;

View File

@@ -25,11 +25,11 @@ use std::time::Duration;
use ahash::RandomState;
use arrow::compute::{self, cast_with_options, CastOptions, SortColumn};
use arrow_schema::{DataType, Field, Schema, SchemaRef, SortOptions, TimeUnit};
use common_query::DfPhysicalPlan;
use common_recordbatch::DfSendableRecordBatchStream;
use datafusion::common::{Result as DataFusionResult, Statistics};
use datafusion::error::Result as DfResult;
use datafusion::execution::context::SessionState;
use datafusion::execution::TaskContext;
use datafusion::physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
use datafusion::physical_plan::udaf::create_aggregate_expr as create_aggr_udf_expr;
use datafusion::physical_plan::{
@@ -930,14 +930,14 @@ impl ExecutionPlan for RangeSelectExec {
&self.cache
}
fn children(&self) -> Vec<Arc<dyn DfPhysicalPlan>> {
fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> {
vec![self.input.clone()]
}
fn with_new_children(
self: Arc<Self>,
children: Vec<Arc<dyn DfPhysicalPlan>>,
) -> datafusion_common::Result<Arc<dyn DfPhysicalPlan>> {
children: Vec<Arc<dyn ExecutionPlan>>,
) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
assert!(!children.is_empty());
Ok(Arc::new(Self {
input: children[0].clone(),
@@ -958,7 +958,7 @@ impl ExecutionPlan for RangeSelectExec {
fn execute(
&self,
partition: usize,
context: Arc<common_query::physical_plan::TaskContext>,
context: Arc<TaskContext>,
) -> DfResult<DfSendableRecordBatchStream> {
let baseline_metric = BaselineMetrics::new(&self.metric, partition);
let input = self.input.execute(partition, context)?;