1use std::collections::HashMap;
16use std::fmt;
17use std::num::NonZeroUsize;
18use std::sync::{Arc, RwLock};
19
20use async_trait::async_trait;
21use catalog::CatalogManagerRef;
22use common_base::Plugins;
23use common_function::aggrs::aggr_wrapper::fix_order::FixStateUdafOrderingAnalyzer;
24use common_function::function_factory::ScalarFunctionFactory;
25use common_function::function_registry::FUNCTION_REGISTRY;
26use common_function::handlers::{
27 FlowServiceHandlerRef, ProcedureServiceHandlerRef, TableMutationHandlerRef,
28};
29use common_function::state::FunctionState;
30use common_stat::get_total_memory_bytes;
31use common_telemetry::warn;
32use datafusion::catalog::TableFunction;
33use datafusion::dataframe::DataFrame;
34use datafusion::error::Result as DfResult;
35use datafusion::execution::SessionStateBuilder;
36use datafusion::execution::context::{QueryPlanner, SessionConfig, SessionContext, SessionState};
37use datafusion::execution::memory_pool::{
38 GreedyMemoryPool, MemoryConsumer, MemoryLimit, MemoryPool, MemoryReservation,
39 TrackConsumersPool,
40};
41use datafusion::execution::runtime_env::{RuntimeEnv, RuntimeEnvBuilder};
42use datafusion::physical_optimizer::PhysicalOptimizerRule;
43use datafusion::physical_optimizer::optimizer::PhysicalOptimizer;
44use datafusion::physical_optimizer::sanity_checker::SanityCheckPlan;
45use datafusion::physical_plan::ExecutionPlan;
46use datafusion::physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner};
47use datafusion_expr::{AggregateUDF, LogicalPlan as DfLogicalPlan, WindowUDF};
48use datafusion_optimizer::Analyzer;
49use datafusion_optimizer::analyzer::function_rewrite::ApplyFunctionRewrites;
50use datafusion_optimizer::optimizer::Optimizer;
51use partition::manager::PartitionRuleManagerRef;
52use promql::extension_plan::PromExtensionPlanner;
53use table::TableRef;
54use table::table::adapter::DfTableProviderAdapter;
55
56use crate::QueryEngineContext;
57use crate::dist_plan::{
58 DistExtensionPlanner, DistPlannerAnalyzer, DistPlannerOptions, MergeSortExtensionPlanner,
59};
60use crate::metrics::{QUERY_MEMORY_POOL_REJECTED_TOTAL, QUERY_MEMORY_POOL_USAGE_BYTES};
61use crate::optimizer::ExtensionAnalyzerRule;
62use crate::optimizer::const_normalization::ConstNormalizationRule;
63use crate::optimizer::constant_term::MatchesConstantTermOptimizer;
64use crate::optimizer::count_nest_aggr::CountNestAggrRule;
65use crate::optimizer::count_wildcard::CountWildcardToTimeIndexRule;
66use crate::optimizer::json_type_concretize::JsonTypeConcretizeRule;
67use crate::optimizer::parallelize_scan::ParallelizeScan;
68use crate::optimizer::pass_distribution::PassDistribution;
69use crate::optimizer::promql_tsid_narrow_join::PromqlTsidNarrowJoin;
70use crate::optimizer::remove_duplicate::RemoveDuplicate;
71use crate::optimizer::scan_hint::ScanHintRule;
72use crate::optimizer::string_normalization::StringNormalizationRule;
73use crate::optimizer::transcribe_atat::TranscribeAtatRule;
74use crate::optimizer::type_conversion::TypeConversionRule;
75use crate::optimizer::windowed_sort::WindowedSortPhysicalRule;
76use crate::options::QueryOptions as QueryOptionsNew;
77use crate::query_engine::DefaultSerializer;
78use crate::query_engine::options::QueryOptions;
79use crate::range_select::planner::RangeSelectPlanner;
80use crate::region_query::RegionQueryHandlerRef;
81
/// Shared state of the query engine: the DataFusion session plus the function
/// registries, extension analyzer rules and plugins that accompany it.
///
/// The function registries are held behind `Arc<RwLock<..>>`, so cloning this
/// struct yields a handle onto the same registries rather than a copy of them.
#[derive(Clone)]
pub struct QueryEngineState {
    /// DataFusion session context assembled in [`QueryEngineState::new`].
    df_context: SessionContext,
    /// Catalog manager handed to `new`; exposed through `catalog_manager()`.
    catalog_manager: CatalogManagerRef,
    /// Bundle of the optional table-mutation, procedure and flow service handlers.
    function_state: Arc<FunctionState>,
    /// Scalar function factories keyed by function name.
    scalar_functions: Arc<RwLock<HashMap<String, ScalarFunctionFactory>>>,
    /// Aggregate UDFs keyed by function name.
    aggr_functions: Arc<RwLock<HashMap<String, AggregateUDF>>>,
    /// Table functions keyed by function name.
    table_functions: Arc<RwLock<HashMap<String, Arc<TableFunction>>>>,
    /// Analyzer rules applied, in order, by `optimize_by_extension_rules`.
    extension_rules: Vec<Arc<dyn ExtensionAnalyzerRule + Send + Sync>>,
    /// Plugin container; consulted by `disallow_cross_catalog_query`.
    plugins: Plugins,
}
94
95impl fmt::Debug for QueryEngineState {
96 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
97 f.debug_struct("QueryEngineState")
98 .field("state", &self.df_context.state())
99 .finish()
100 }
101}
102
103impl QueryEngineState {
    /// Builds the engine state: runtime environment, session configuration,
    /// analyzer / optimizer / physical-optimizer rule lists, and the DataFusion
    /// `SessionContext` that ties them together.
    ///
    /// * `catalog_list` - stored as the catalog manager and also passed to the
    ///   query planner.
    /// * `partition_rule_manager`, `region_query_handler` - forwarded to
    ///   `DfQueryPlanner::new`.
    /// * `table_mutation_handler`, `procedure_service_handler`,
    ///   `flow_service_handler` - bundled into the `FunctionState`.
    /// * `with_dist_planner` - when `true`, `DistPlannerAnalyzer` is appended to
    ///   the analyzer rules.
    /// * `plugins` - stored as-is.
    /// * `options` - supplies `memory_pool_size`, `parallelism` and
    ///   `allow_query_fallback`.
    ///
    /// # Panics
    ///
    /// Panics if the `RuntimeEnv` cannot be built, or (via `Vec::insert`) if the
    /// rule list returned by `PhysicalOptimizer::new()` is shorter than the
    /// hard-coded insertion indices used below.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        catalog_list: CatalogManagerRef,
        partition_rule_manager: Option<PartitionRuleManagerRef>,
        region_query_handler: Option<RegionQueryHandlerRef>,
        table_mutation_handler: Option<TableMutationHandlerRef>,
        procedure_service_handler: Option<ProcedureServiceHandlerRef>,
        flow_service_handler: Option<FlowServiceHandlerRef>,
        with_dist_planner: bool,
        plugins: Plugins,
        options: QueryOptionsNew,
    ) -> Self {
        // A negative total-memory reading is clamped to zero before resolving
        // the configured pool size against it.
        let total_memory = get_total_memory_bytes().max(0) as u64;
        let memory_pool_size = options.memory_pool_size.resolve(total_memory) as usize;
        // A resolved size of zero means no metrics-tracking pool is installed
        // and the default runtime environment is used instead.
        let runtime_env = if memory_pool_size > 0 {
            Arc::new(
                RuntimeEnvBuilder::new()
                    .with_memory_pool(Arc::new(MetricsMemoryPool::new(memory_pool_size)))
                    .build()
                    .expect("Failed to build RuntimeEnv"),
            )
        } else {
            Arc::new(RuntimeEnv::default())
        };
        let mut session_config = SessionConfig::new().with_create_default_catalog_and_schema(false);
        // `parallelism == 0` leaves DataFusion's own target-partition setting untouched.
        if options.parallelism > 0 {
            session_config = session_config.with_target_partitions(options.parallelism);
        }
        // The extension is only registered when fallback is enabled.
        if options.allow_query_fallback {
            session_config
                .options_mut()
                .extensions
                .insert(DistPlannerOptions {
                    allow_query_fallback: true,
                });
        }

        // NOTE(review): the reason for disabling this schema check is not
        // visible in this file — confirm it is still required.
        session_config
            .options_mut()
            .execution
            .skip_physical_aggregate_schema_check = true;

        // Extension rules run in this order: TypeConversionRule, CountNestAggrRule.
        let mut extension_rules = Vec::new();

        // The vector is empty here, so `insert(0, ..)` is equivalent to a push.
        extension_rules.insert(0, Arc::new(TypeConversionRule) as _);
        extension_rules.push(Arc::new(CountNestAggrRule) as _);

        // Analyzer rules. Every `insert(0, ..)` prepends, so later prepends end
        // up ahead of earlier ones. The final order is:
        //   ApplyFunctionRewrites, CountWildcardToTimeIndexRule,
        //   StringNormalizationRule, TranscribeAtatRule,
        //   <rules from Analyzer::new()>, ConstNormalizationRule,
        //   [DistPlannerAnalyzer], FixStateUdafOrderingAnalyzer.
        let mut analyzer = Analyzer::new();
        analyzer.rules.insert(0, Arc::new(TranscribeAtatRule));
        analyzer.rules.insert(0, Arc::new(StringNormalizationRule));
        analyzer
            .rules
            .insert(0, Arc::new(CountWildcardToTimeIndexRule));
        analyzer.rules.push(Arc::new(ConstNormalizationRule));

        analyzer.rules.insert(
            0,
            Arc::new(ApplyFunctionRewrites::new(
                FUNCTION_REGISTRY.function_rewrites(),
            )),
        );

        if with_dist_planner {
            analyzer.rules.push(Arc::new(DistPlannerAnalyzer));
        }
        analyzer.rules.push(Arc::new(FixStateUdafOrderingAnalyzer));

        // Logical optimizer: the two custom rules go after the rules from Optimizer::new().
        let mut optimizer = Optimizer::new();
        optimizer.rules.push(Arc::new(ScanHintRule));
        optimizer.rules.push(Arc::new(JsonTypeConcretizeRule));

        // Physical optimizer: four rules are spliced in at consecutive
        // positions 5..=8 of the list from PhysicalOptimizer::new().
        // NOTE(review): these indices presumably target specific spots in
        // DataFusion's default rule order — re-check after a DataFusion upgrade.
        let mut physical_optimizer = PhysicalOptimizer::new();
        physical_optimizer
            .rules
            .insert(5, Arc::new(ParallelizeScan));
        physical_optimizer
            .rules
            .insert(6, Arc::new(PassDistribution));
        physical_optimizer
            .rules
            .insert(7, Arc::new(PromqlTsidNarrowJoin));
        physical_optimizer.rules.insert(
            8,
            Arc::new(datafusion::physical_optimizer::enforce_sorting::EnforceSorting {}),
        );
        physical_optimizer
            .rules
            .push(Arc::new(WindowedSortPhysicalRule));
        physical_optimizer
            .rules
            .push(Arc::new(MatchesConstantTermOptimizer));
        physical_optimizer.rules.push(Arc::new(RemoveDuplicate));
        // Drop any existing SanityCheckPlan and re-append it so that it is the
        // very last physical rule, after all custom rules above.
        Self::remove_physical_optimizer_rule(
            &mut physical_optimizer.rules,
            SanityCheckPlan {}.name(),
        );
        physical_optimizer.rules.push(Arc::new(SanityCheckPlan {}));

        let session_state = SessionStateBuilder::new()
            .with_config(session_config)
            .with_runtime_env(runtime_env)
            .with_default_features()
            .with_analyzer_rules(analyzer.rules)
            .with_serializer_registry(Arc::new(DefaultSerializer))
            .with_query_planner(Arc::new(DfQueryPlanner::new(
                catalog_list.clone(),
                partition_rule_manager,
                region_query_handler,
            )))
            .with_optimizer_rules(optimizer.rules)
            .with_physical_optimizer_rules(physical_optimizer.rules)
            .build();

        let df_context = SessionContext::new_with_state(session_state);
        // Adds the alias names listed in SCALAR_FUNCTION_ALIASES /
        // AGGREGATE_FUNCTION_ALIASES to the freshly built context.
        register_function_aliases(&df_context);

        // All three function registries start out empty.
        Self {
            df_context,
            catalog_manager: catalog_list,
            function_state: Arc::new(FunctionState {
                table_mutation_handler,
                procedure_service_handler,
                flow_service_handler,
            }),
            aggr_functions: Arc::new(RwLock::new(HashMap::new())),
            table_functions: Arc::new(RwLock::new(HashMap::new())),
            extension_rules,
            plugins,
            scalar_functions: Arc::new(RwLock::new(HashMap::new())),
        }
    }
255
256 fn remove_physical_optimizer_rule(
257 rules: &mut Vec<Arc<dyn PhysicalOptimizerRule + Send + Sync>>,
258 name: &str,
259 ) {
260 rules.retain(|rule| rule.name() != name);
261 }
262
263 pub fn optimize_by_extension_rules(
265 &self,
266 plan: DfLogicalPlan,
267 context: &QueryEngineContext,
268 ) -> DfResult<DfLogicalPlan> {
269 self.extension_rules
270 .iter()
271 .try_fold(plan, |acc_plan, rule| {
272 rule.analyze(acc_plan, context, self.session_state().config_options())
273 })
274 }
275
276 pub fn optimize_logical_plan(&self, plan: DfLogicalPlan) -> DfResult<DfLogicalPlan> {
278 self.session_state().optimize(&plan)
279 }
280
281 pub fn scalar_function(&self, function_name: &str) -> Option<ScalarFunctionFactory> {
283 self.scalar_functions
284 .read()
285 .unwrap()
286 .get(function_name)
287 .cloned()
288 }
289
290 pub fn scalar_names(&self) -> Vec<String> {
292 self.scalar_functions
293 .read()
294 .unwrap()
295 .keys()
296 .cloned()
297 .collect()
298 }
299
300 pub fn aggr_function(&self, function_name: &str) -> Option<AggregateUDF> {
302 self.aggr_functions
303 .read()
304 .unwrap()
305 .get(function_name)
306 .cloned()
307 }
308
309 pub fn aggr_names(&self) -> Vec<String> {
311 self.aggr_functions
312 .read()
313 .unwrap()
314 .keys()
315 .cloned()
316 .collect()
317 }
318
319 pub fn table_function(&self, function_name: &str) -> Option<Arc<TableFunction>> {
321 self.table_functions
322 .read()
323 .unwrap()
324 .get(function_name)
325 .cloned()
326 }
327
328 pub fn table_function_names(&self) -> Vec<String> {
330 self.table_functions
331 .read()
332 .unwrap()
333 .keys()
334 .cloned()
335 .collect()
336 }
337
338 pub fn register_scalar_function(&self, func: ScalarFunctionFactory) {
341 let name = func.name().to_string();
342 let x = self
343 .scalar_functions
344 .write()
345 .unwrap()
346 .insert(name.clone(), func);
347
348 if x.is_some() {
349 warn!("Already registered scalar function '{name}'");
350 }
351 }
352
353 pub fn register_aggr_function(&self, func: AggregateUDF) {
362 let name = func.name().to_string();
363 let x = self
364 .aggr_functions
365 .write()
366 .unwrap()
367 .insert(name.clone(), func);
368 assert!(
369 x.is_none(),
370 "Already registered aggregate function '{name}'"
371 );
372 }
373
374 pub fn register_table_function(&self, func: Arc<TableFunction>) {
375 let name = func.name();
376 let x = self
377 .table_functions
378 .write()
379 .unwrap()
380 .insert(name.to_string(), func.clone());
381
382 if x.is_some() {
383 warn!("Already registered table function '{name}'");
384 }
385 }
386
    /// Registers a window function.
    ///
    /// Unlike scalar, aggregate and table functions, window functions are not
    /// kept in a registry on this struct; `func` is registered directly on the
    /// underlying DataFusion session context.
    pub fn register_window_function(&self, func: WindowUDF) {
        self.df_context.register_udwf(func);
    }
394
    /// Returns a reference to the catalog manager this state was created with.
    pub fn catalog_manager(&self) -> &CatalogManagerRef {
        &self.catalog_manager
    }
398
399 pub fn function_state(&self) -> Arc<FunctionState> {
400 self.function_state.clone()
401 }
402
403 pub fn table_mutation_handler(&self) -> Option<&TableMutationHandlerRef> {
405 self.function_state.table_mutation_handler.as_ref()
406 }
407
408 pub fn procedure_service_handler(&self) -> Option<&ProcedureServiceHandlerRef> {
410 self.function_state.procedure_service_handler.as_ref()
411 }
412
413 pub(crate) fn disallow_cross_catalog_query(&self) -> bool {
414 self.plugins
415 .map::<QueryOptions, _, _>(|x| x.disallow_cross_catalog_query)
416 .unwrap_or(false)
417 }
418
    /// Returns the session state of the underlying DataFusion context.
    ///
    /// The state is returned by value, as produced by `SessionContext::state()`.
    pub fn session_state(&self) -> SessionState {
        self.df_context.state()
    }
422
423 pub fn read_table(&self, table: TableRef) -> DfResult<DataFrame> {
425 self.df_context
426 .read_table(Arc::new(DfTableProviderAdapter::new(table)))
427 }
428}
429
/// Query planner installed into the DataFusion session state.
///
/// It delegates physical planning to a `DefaultPhysicalPlanner` configured with
/// the extension planners chosen in `DfQueryPlanner::new`.
struct DfQueryPlanner {
    /// The DataFusion physical planner that performs the planning.
    physical_planner: DefaultPhysicalPlanner,
}
433
434impl fmt::Debug for DfQueryPlanner {
435 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
436 f.debug_struct("DfQueryPlanner").finish()
437 }
438}
439
440#[async_trait]
441impl QueryPlanner for DfQueryPlanner {
442 async fn create_physical_plan(
443 &self,
444 logical_plan: &DfLogicalPlan,
445 session_state: &SessionState,
446 ) -> DfResult<Arc<dyn ExecutionPlan>> {
447 self.physical_planner
448 .create_physical_plan(logical_plan, session_state)
449 .await
450 }
451}
452
/// Scalar function aliases as `(target, alias)` pairs.
///
/// `target` is the name looked up among the session's scalar functions and
/// `alias` is the additional name attached to it by `register_function_aliases`.
// NOTE(review): the alias spellings look like MySQL function names — confirm
// the intended dialect before adding entries.
const SCALAR_FUNCTION_ALIASES: &[(&str, &str)] = &[
    ("upper", "ucase"),
    ("lower", "lcase"),
    ("ceil", "ceiling"),
    ("substr", "mid"),
    ("random", "rand"),
];
461
/// Aggregate function aliases as `(target, alias)` pairs.
///
/// `target` is the name looked up among the session's aggregate functions and
/// `alias` is the additional name attached to it by `register_function_aliases`.
const AGGREGATE_FUNCTION_ALIASES: &[(&str, &str)] =
    &[("stddev_pop", "std"), ("var_pop", "variance")];
465
466fn register_function_aliases(ctx: &SessionContext) {
471 let state = ctx.state();
472
473 for (target, alias) in SCALAR_FUNCTION_ALIASES {
474 if let Some(func) = state.scalar_functions().get(*target) {
475 let aliased = func.as_ref().clone().with_aliases([*alias]);
476 ctx.register_udf(aliased);
477 }
478 }
479
480 for (target, alias) in AGGREGATE_FUNCTION_ALIASES {
481 if let Some(func) = state.aggregate_functions().get(*target) {
482 let aliased = func.as_ref().clone().with_aliases([*alias]);
483 ctx.register_udaf(aliased);
484 }
485 }
486}
487
488impl DfQueryPlanner {
489 fn new(
490 catalog_manager: CatalogManagerRef,
491 partition_rule_manager: Option<PartitionRuleManagerRef>,
492 region_query_handler: Option<RegionQueryHandlerRef>,
493 ) -> Self {
494 let mut planners: Vec<Arc<dyn ExtensionPlanner + Send + Sync>> =
495 vec![Arc::new(PromExtensionPlanner), Arc::new(RangeSelectPlanner)];
496 if let (Some(region_query_handler), Some(partition_rule_manager)) =
497 (region_query_handler, partition_rule_manager)
498 {
499 planners.push(Arc::new(DistExtensionPlanner::new(
500 catalog_manager,
501 partition_rule_manager,
502 region_query_handler,
503 )));
504 planners.push(Arc::new(MergeSortExtensionPlanner {}));
505 }
506 Self {
507 physical_planner: DefaultPhysicalPlanner::with_extension_planners(planners),
508 }
509 }
510}
511
/// A `MemoryPool` that forwards every call to an inner consumer-tracking greedy
/// pool and publishes the pool's reserved bytes (and rejected reservations) to
/// the query memory-pool metrics.
#[derive(Debug)]
struct MetricsMemoryPool {
    /// The pool that does the actual accounting.
    inner: Arc<TrackConsumersPool<GreedyMemoryPool>>,
}
520
521impl MetricsMemoryPool {
522 const TOP_CONSUMERS_TO_REPORT: usize = 5;
524
525 fn new(limit: usize) -> Self {
526 Self {
527 inner: Arc::new(TrackConsumersPool::new(
528 GreedyMemoryPool::new(limit),
529 NonZeroUsize::new(Self::TOP_CONSUMERS_TO_REPORT).unwrap(),
530 )),
531 }
532 }
533
534 #[inline]
535 fn update_metrics(&self) {
536 QUERY_MEMORY_POOL_USAGE_BYTES.set(self.inner.reserved() as i64);
537 }
538}
539
540impl MemoryPool for MetricsMemoryPool {
541 fn register(&self, consumer: &MemoryConsumer) {
542 self.inner.register(consumer);
543 }
544
545 fn unregister(&self, consumer: &MemoryConsumer) {
546 self.inner.unregister(consumer);
547 }
548
549 fn grow(&self, reservation: &MemoryReservation, additional: usize) {
550 self.inner.grow(reservation, additional);
551 self.update_metrics();
552 }
553
554 fn shrink(&self, reservation: &MemoryReservation, shrink: usize) {
555 self.inner.shrink(reservation, shrink);
556 self.update_metrics();
557 }
558
559 fn try_grow(
560 &self,
561 reservation: &MemoryReservation,
562 additional: usize,
563 ) -> datafusion_common::Result<()> {
564 let result = self.inner.try_grow(reservation, additional);
565 if result.is_err() {
566 QUERY_MEMORY_POOL_REJECTED_TOTAL.inc();
567 }
568 self.update_metrics();
569 result
570 }
571
572 fn reserved(&self) -> usize {
573 self.inner.reserved()
574 }
575
576 fn memory_limit(&self) -> MemoryLimit {
577 self.inner.memory_limit()
578 }
579}