chore: upgrade DataFusion family (#7558)

* chore: upgrade DataFusion family

Signed-off-by: luofucong <luofc@foxmail.com>

* use main proto

Signed-off-by: luofucong <luofc@foxmail.com>

* fix ci

Signed-off-by: luofucong <luofc@foxmail.com>

---------

Signed-off-by: luofucong <luofc@foxmail.com>
This commit is contained in:
LFC
2026-01-14 22:02:31 +08:00
committed by GitHub
parent a5cb0116a2
commit e64c31e59a
96 changed files with 2003 additions and 1531 deletions

View File

@@ -255,9 +255,9 @@ fn metrics_to_string(metrics: RecordBatchMetrics, format: AnalyzeFormat) -> DfRe
match format {
AnalyzeFormat::JSON => Ok(JsonMetrics::from_record_batch_metrics(metrics).to_string()),
AnalyzeFormat::TEXT => Ok(metrics.to_string()),
AnalyzeFormat::GRAPHVIZ => Err(DataFusionError::NotImplemented(
"GRAPHVIZ format is not supported for metrics output".to_string(),
)),
format => Err(DataFusionError::NotImplemented(format!(
"AnalyzeFormat {format}",
))),
}
}

View File

@@ -316,18 +316,15 @@ impl DatafusionQueryEngine {
return state
.create_physical_plan(logical_plan)
.await
.context(error::DatafusionSnafu)
.map_err(BoxedError::new)
.context(QueryExecutionSnafu);
.map_err(Into::into);
}
// analyze first
let analyzed_plan = state
.analyzer()
.execute_and_check(logical_plan.clone(), state.config_options(), |_, _| {})
.context(error::DatafusionSnafu)
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?;
let analyzed_plan = state.analyzer().execute_and_check(
logical_plan.clone(),
state.config_options(),
|_, _| {},
)?;
logger.after_analyze = Some(analyzed_plan.clone());
@@ -341,10 +338,7 @@ impl DatafusionQueryEngine {
} else {
state
.optimizer()
.optimize(analyzed_plan, state, |_, _| {})
.context(error::DatafusionSnafu)
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?
.optimize(analyzed_plan, state, |_, _| {})?
};
common_telemetry::debug!("Create physical plan, optimized plan: {optimized_plan}");
@@ -371,19 +365,10 @@ impl DatafusionQueryEngine {
// Optimized by extension rules
let optimized_plan = self
.state
.optimize_by_extension_rules(plan.clone(), context)
.context(error::DatafusionSnafu)
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?;
.optimize_by_extension_rules(plan.clone(), context)?;
// Optimized by datafusion optimizer
let optimized_plan = self
.state
.session_state()
.optimize(&optimized_plan)
.context(error::DatafusionSnafu)
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?;
let optimized_plan = self.state.session_state().optimize(&optimized_plan)?;
Ok(optimized_plan)
}
@@ -516,11 +501,7 @@ impl QueryEngine for DatafusionQueryEngine {
}
fn read_table(&self, table: TableRef) -> Result<DataFrame> {
self.state
.read_table(table)
.context(error::DatafusionSnafu)
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)
self.state.read_table(table).map_err(Into::into)
}
fn engine_context(&self, query_ctx: QueryContextRef) -> QueryEngineContext {
@@ -543,7 +524,8 @@ impl QueryEngine for DatafusionQueryEngine {
}
// configure execution options
state.config_mut().options_mut().execution.time_zone = query_ctx.timezone().to_string();
state.config_mut().options_mut().execution.time_zone =
Some(query_ctx.timezone().to_string());
// usually it's impossible to have both `set variable` set by sql client and
// hint in header by grpc client, so only need to deal with them separately
@@ -619,11 +601,7 @@ impl QueryExecutor for DatafusionQueryEngine {
Ok(Box::pin(EmptyRecordBatchStream::new(schema)))
}
1 => {
let df_stream = plan
.execute(0, task_ctx)
.context(error::DatafusionSnafu)
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?;
let df_stream = plan.execute(0, task_ctx)?;
let mut stream = RecordBatchStreamAdapter::try_new_with_span(df_stream, span)
.context(error::ConvertDfRecordBatchStreamSnafu)
.map_err(BoxedError::new)
@@ -652,11 +630,7 @@ impl QueryExecutor for DatafusionQueryEngine {
.output_partitioning()
.partition_count()
);
let df_stream = merged_plan
.execute(0, task_ctx)
.context(error::DatafusionSnafu)
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?;
let df_stream = merged_plan.execute(0, task_ctx)?;
let mut stream = RecordBatchStreamAdapter::try_new_with_span(df_stream, span)
.context(error::ConvertDfRecordBatchStreamSnafu)
.map_err(BoxedError::new)

View File

@@ -25,7 +25,7 @@ use snafu::{Location, Snafu};
#[snafu(visibility(pub))]
#[stack_trace_debug]
pub enum InnerError {
#[snafu(display("DataFusion error"))]
#[snafu(transparent)]
Datafusion {
#[snafu(source)]
error: DataFusionError,

View File

@@ -1170,7 +1170,7 @@ fn test_simplify_select_now_expression() {
let expected = [
"Projection: now()",
" MergeScan [is_placeholder=false, remote_input=[",
r#"Projection: TimestampNanosecond(<TIME>, Some("+00:00")) AS now()"#,
r#"Projection: TimestampNanosecond(<TIME>, None) AS now()"#,
" TableScan: t",
"]]",
]

View File

@@ -143,7 +143,7 @@ mod tests {
let plan = create_test_plan_with_project(proj);
let result = StringNormalizationRule.analyze(plan, config).unwrap();
let expected = format!(
"Projection: CAST(Utf8(\"2017-07-23 13:10:11\") AS Timestamp({:#?}, None))\n TableScan: t",
"Projection: CAST(Utf8(\"2017-07-23 13:10:11\") AS Timestamp({}))\n TableScan: t",
time_unit
);
assert_eq!(expected, result.to_string());
@@ -162,7 +162,7 @@ mod tests {
.analyze(int_to_timestamp_plan, config)
.unwrap();
let expected = String::from(
"Projection: CAST(Int64(158412331400600000) AS Timestamp(Nanosecond, None))\n TableScan: t",
"Projection: CAST(Int64(158412331400600000) AS Timestamp(ns))\n TableScan: t",
);
assert_eq!(expected, result.to_string());

View File

@@ -4687,11 +4687,11 @@ mod test {
assert_eq!(
plan.display_indent_schema().to_string(),
"PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
\n PromSeriesDivide: tags=[\"tag\"] [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
\n Sort: metrics.tag ASC NULLS FIRST, metrics.timestamp ASC NULLS FIRST [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
\n Filter: metrics.tag = Utf8(\"1\") AND metrics.timestamp >= TimestampMillisecond(-1000, None) AND metrics.timestamp <= TimestampMillisecond(100001000, None) [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
\n Projection: metrics.field, metrics.tag, CAST(metrics.timestamp AS Timestamp(Millisecond, None)) AS timestamp [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
\n TableScan: metrics [tag:Utf8, timestamp:Timestamp(Nanosecond, None), field:Float64;N]"
\n PromSeriesDivide: tags=[\"tag\"] [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
\n Sort: metrics.tag ASC NULLS FIRST, metrics.timestamp ASC NULLS FIRST [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
\n Filter: metrics.tag = Utf8(\"1\") AND metrics.timestamp >= TimestampMillisecond(-1000, None) AND metrics.timestamp <= TimestampMillisecond(100001000, None) [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
\n Projection: metrics.field, metrics.tag, CAST(metrics.timestamp AS Timestamp(ms)) AS timestamp [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
\n TableScan: metrics [tag:Utf8, timestamp:Timestamp(Nanosecond, None), field:Float64;N]"
);
let plan = PromPlanner::stmt_to_plan(
DfTableSourceProvider::new(
@@ -4717,14 +4717,14 @@ mod test {
assert_eq!(
plan.display_indent_schema().to_string(),
"Filter: prom_avg_over_time(timestamp_range,field) IS NOT NULL [timestamp:Timestamp(Millisecond, None), prom_avg_over_time(timestamp_range,field):Float64;N, tag:Utf8]\
\n Projection: metrics.timestamp, prom_avg_over_time(timestamp_range, field) AS prom_avg_over_time(timestamp_range,field), metrics.tag [timestamp:Timestamp(Millisecond, None), prom_avg_over_time(timestamp_range,field):Float64;N, tag:Utf8]\
\n PromRangeManipulate: req range=[0..100000000], interval=[5000], eval range=[5000], time index=[timestamp], values=[\"field\"] [field:Dictionary(Int64, Float64);N, tag:Utf8, timestamp:Timestamp(Millisecond, None), timestamp_range:Dictionary(Int64, Timestamp(Millisecond, None))]\
\n PromSeriesNormalize: offset=[0], time index=[timestamp], filter NaN: [true] [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
\n PromSeriesDivide: tags=[\"tag\"] [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
\n Sort: metrics.tag ASC NULLS FIRST, metrics.timestamp ASC NULLS FIRST [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
\n Filter: metrics.tag = Utf8(\"1\") AND metrics.timestamp >= TimestampMillisecond(-6000, None) AND metrics.timestamp <= TimestampMillisecond(100001000, None) [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
\n Projection: metrics.field, metrics.tag, CAST(metrics.timestamp AS Timestamp(Millisecond, None)) AS timestamp [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
\n TableScan: metrics [tag:Utf8, timestamp:Timestamp(Nanosecond, None), field:Float64;N]"
\n Projection: metrics.timestamp, prom_avg_over_time(timestamp_range, field) AS prom_avg_over_time(timestamp_range,field), metrics.tag [timestamp:Timestamp(Millisecond, None), prom_avg_over_time(timestamp_range,field):Float64;N, tag:Utf8]\
\n PromRangeManipulate: req range=[0..100000000], interval=[5000], eval range=[5000], time index=[timestamp], values=[\"field\"] [field:Dictionary(Int64, Float64);N, tag:Utf8, timestamp:Timestamp(Millisecond, None), timestamp_range:Dictionary(Int64, Timestamp(Millisecond, None))]\
\n PromSeriesNormalize: offset=[0], time index=[timestamp], filter NaN: [true] [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
\n PromSeriesDivide: tags=[\"tag\"] [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
\n Sort: metrics.tag ASC NULLS FIRST, metrics.timestamp ASC NULLS FIRST [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
\n Filter: metrics.tag = Utf8(\"1\") AND metrics.timestamp >= TimestampMillisecond(-6000, None) AND metrics.timestamp <= TimestampMillisecond(100001000, None) [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
\n Projection: metrics.field, metrics.tag, CAST(metrics.timestamp AS Timestamp(ms)) AS timestamp [field:Float64;N, tag:Utf8, timestamp:Timestamp(Millisecond, None)]\
\n TableScan: metrics [tag:Utf8, timestamp:Timestamp(Nanosecond, None), field:Float64;N]"
);
}