1use std::collections::HashMap;
16use std::fmt;
17use std::num::NonZeroUsize;
18use std::sync::{Arc, RwLock};
19
20use async_trait::async_trait;
21use catalog::CatalogManagerRef;
22use common_base::Plugins;
23use common_function::aggrs::aggr_wrapper::fix_order::FixStateUdafOrderingAnalyzer;
24use common_function::function_factory::ScalarFunctionFactory;
25use common_function::function_registry::FUNCTION_REGISTRY;
26use common_function::handlers::{
27 FlowServiceHandlerRef, ProcedureServiceHandlerRef, TableMutationHandlerRef,
28};
29use common_function::state::FunctionState;
30use common_stat::get_total_memory_bytes;
31use common_telemetry::warn;
32use datafusion::catalog::TableFunction;
33use datafusion::dataframe::DataFrame;
34use datafusion::error::Result as DfResult;
35use datafusion::execution::SessionStateBuilder;
36use datafusion::execution::context::{QueryPlanner, SessionConfig, SessionContext, SessionState};
37use datafusion::execution::memory_pool::{
38 GreedyMemoryPool, MemoryConsumer, MemoryLimit, MemoryPool, MemoryReservation,
39 TrackConsumersPool,
40};
41use datafusion::execution::runtime_env::{RuntimeEnv, RuntimeEnvBuilder};
42use datafusion::physical_optimizer::PhysicalOptimizerRule;
43use datafusion::physical_optimizer::optimizer::PhysicalOptimizer;
44use datafusion::physical_optimizer::sanity_checker::SanityCheckPlan;
45use datafusion::physical_plan::ExecutionPlan;
46use datafusion::physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner};
47use datafusion_expr::{AggregateUDF, LogicalPlan as DfLogicalPlan, WindowUDF};
48use datafusion_optimizer::Analyzer;
49use datafusion_optimizer::analyzer::function_rewrite::ApplyFunctionRewrites;
50use datafusion_optimizer::optimizer::Optimizer;
51use partition::manager::PartitionRuleManagerRef;
52use promql::extension_plan::PromExtensionPlanner;
53use session::context::QueryContextRef;
54use table::TableRef;
55use table::table::adapter::DfTableProviderAdapter;
56
57use crate::QueryEngineContext;
58use crate::dist_plan::{
59 DistExtensionPlanner, DistPlannerAnalyzer, DistPlannerOptions, DynFilterRegistryManager,
60 MergeSortExtensionPlanner, RemoteDynFilterRegistryLease,
61};
62use crate::metrics::{QUERY_MEMORY_POOL_REJECTED_TOTAL, QUERY_MEMORY_POOL_USAGE_BYTES};
63use crate::optimizer::ExtensionAnalyzerRule;
64use crate::optimizer::const_normalization::ConstNormalizationRule;
65use crate::optimizer::constant_term::MatchesConstantTermOptimizer;
66use crate::optimizer::count_nest_aggr::CountNestAggrRule;
67use crate::optimizer::count_wildcard::CountWildcardToTimeIndexRule;
68use crate::optimizer::json_type_concretize::JsonTypeConcretizeRule;
69use crate::optimizer::parallelize_scan::ParallelizeScan;
70use crate::optimizer::pass_distribution::PassDistribution;
71use crate::optimizer::promql_tsid_narrow_join::PromqlTsidNarrowJoin;
72use crate::optimizer::remove_duplicate::RemoveDuplicate;
73use crate::optimizer::scan_hint::ScanHintRule;
74use crate::optimizer::string_normalization::StringNormalizationRule;
75use crate::optimizer::transcribe_atat::TranscribeAtatRule;
76use crate::optimizer::type_conversion::TypeConversionRule;
77use crate::optimizer::windowed_sort::WindowedSortPhysicalRule;
78use crate::options::QueryOptions as QueryOptionsNew;
79use crate::query_engine::DefaultSerializer;
80use crate::query_engine::options::QueryOptions;
81use crate::range_select::planner::RangeSelectPlanner;
82use crate::region_query::RegionQueryHandlerRef;
83
/// Shared state of the query engine: the DataFusion session plus the function registries,
/// extension analyzer rules and handlers consulted while planning queries.
///
/// The registries and handlers sit behind `Arc`s, so clones of this struct share them.
#[derive(Clone)]
pub struct QueryEngineState {
    /// DataFusion session context built in [`QueryEngineState::new`].
    df_context: SessionContext,
    /// Catalog manager handed to [`QueryEngineState::new`].
    catalog_manager: CatalogManagerRef,
    /// Manager from which remote dyn-filter registry leases are acquired.
    dyn_filter_registry_manager: Arc<DynFilterRegistryManager>,
    /// Bundle of the table-mutation, procedure-service and flow-service handlers.
    function_state: Arc<FunctionState>,
    /// Registered scalar function factories, keyed by function name.
    scalar_functions: Arc<RwLock<HashMap<String, ScalarFunctionFactory>>>,
    /// Registered aggregate functions, keyed by function name.
    aggr_functions: Arc<RwLock<HashMap<String, AggregateUDF>>>,
    /// Registered table functions, keyed by function name.
    table_functions: Arc<RwLock<HashMap<String, Arc<TableFunction>>>>,
    /// Extension analyzer rules, applied in order by `optimize_by_extension_rules`.
    extension_rules: Vec<Arc<dyn ExtensionAnalyzerRule + Send + Sync>>,
    /// Plugin container; queried for `QueryOptions` by `disallow_cross_catalog_query`.
    plugins: Plugins,
}
97
98impl fmt::Debug for QueryEngineState {
99 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
100 f.debug_struct("QueryEngineState")
101 .field("state", &self.df_context.state())
102 .finish()
103 }
104}
105
106impl QueryEngineState {
107 #[allow(clippy::too_many_arguments)]
108 pub fn new(
109 catalog_list: CatalogManagerRef,
110 partition_rule_manager: Option<PartitionRuleManagerRef>,
111 region_query_handler: Option<RegionQueryHandlerRef>,
112 table_mutation_handler: Option<TableMutationHandlerRef>,
113 procedure_service_handler: Option<ProcedureServiceHandlerRef>,
114 flow_service_handler: Option<FlowServiceHandlerRef>,
115 with_dist_planner: bool,
116 plugins: Plugins,
117 options: QueryOptionsNew,
118 ) -> Self {
119 let total_memory = get_total_memory_bytes().max(0) as u64;
120 let memory_pool_size = options.memory_pool_size.resolve(total_memory) as usize;
121 let runtime_env = if memory_pool_size > 0 {
122 Arc::new(
123 RuntimeEnvBuilder::new()
124 .with_memory_pool(Arc::new(MetricsMemoryPool::new(memory_pool_size)))
125 .build()
126 .expect("Failed to build RuntimeEnv"),
127 )
128 } else {
129 Arc::new(RuntimeEnv::default())
130 };
131 let mut session_config = SessionConfig::new().with_create_default_catalog_and_schema(false);
132 if options.parallelism > 0 {
133 session_config = session_config.with_target_partitions(options.parallelism);
134 }
135 if options.allow_query_fallback {
136 session_config
137 .options_mut()
138 .extensions
139 .insert(DistPlannerOptions {
140 allow_query_fallback: true,
141 });
142 }
143
144 session_config
147 .options_mut()
148 .execution
149 .skip_physical_aggregate_schema_check = true;
150
151 let mut extension_rules = Vec::new();
153
154 extension_rules.insert(0, Arc::new(TypeConversionRule) as _);
156 extension_rules.push(Arc::new(CountNestAggrRule) as _);
157
158 let mut analyzer = Analyzer::new();
160 analyzer.rules.insert(0, Arc::new(TranscribeAtatRule));
161 analyzer.rules.insert(0, Arc::new(StringNormalizationRule));
162 analyzer
163 .rules
164 .insert(0, Arc::new(CountWildcardToTimeIndexRule));
165 analyzer.rules.push(Arc::new(ConstNormalizationRule));
166
167 analyzer.rules.insert(
171 0,
172 Arc::new(ApplyFunctionRewrites::new(
173 FUNCTION_REGISTRY.function_rewrites(),
174 )),
175 );
176
177 if with_dist_planner {
178 analyzer.rules.push(Arc::new(DistPlannerAnalyzer));
179 }
180 analyzer.rules.push(Arc::new(FixStateUdafOrderingAnalyzer));
181
182 let mut optimizer = Optimizer::new();
183 optimizer.rules.push(Arc::new(ScanHintRule));
184 optimizer.rules.push(Arc::new(JsonTypeConcretizeRule));
185
186 let mut physical_optimizer = PhysicalOptimizer::new();
188 physical_optimizer
190 .rules
191 .insert(5, Arc::new(ParallelizeScan));
192 physical_optimizer
194 .rules
195 .insert(6, Arc::new(PassDistribution));
196 physical_optimizer
198 .rules
199 .insert(7, Arc::new(PromqlTsidNarrowJoin));
200 physical_optimizer.rules.insert(
202 8,
203 Arc::new(datafusion::physical_optimizer::enforce_sorting::EnforceSorting {}),
204 );
205 physical_optimizer
207 .rules
208 .push(Arc::new(WindowedSortPhysicalRule));
209 physical_optimizer
214 .rules
215 .push(Arc::new(MatchesConstantTermOptimizer));
216 physical_optimizer.rules.push(Arc::new(RemoveDuplicate));
218 Self::remove_physical_optimizer_rule(
220 &mut physical_optimizer.rules,
221 SanityCheckPlan {}.name(),
222 );
223 physical_optimizer.rules.push(Arc::new(SanityCheckPlan {}));
224
225 let session_state = SessionStateBuilder::new()
226 .with_config(session_config)
227 .with_runtime_env(runtime_env)
228 .with_default_features()
229 .with_analyzer_rules(analyzer.rules)
230 .with_serializer_registry(Arc::new(DefaultSerializer))
231 .with_query_planner(Arc::new(DfQueryPlanner::new(
232 catalog_list.clone(),
233 partition_rule_manager,
234 region_query_handler.clone(),
235 )))
236 .with_optimizer_rules(optimizer.rules)
237 .with_physical_optimizer_rules(physical_optimizer.rules)
238 .build();
239
240 let df_context = SessionContext::new_with_state(session_state);
241 register_function_aliases(&df_context);
242
243 Self {
244 df_context,
245 catalog_manager: catalog_list,
246 dyn_filter_registry_manager: Arc::new(DynFilterRegistryManager::default()),
247 function_state: Arc::new(FunctionState {
248 table_mutation_handler,
249 procedure_service_handler,
250 flow_service_handler,
251 }),
252 aggr_functions: Arc::new(RwLock::new(HashMap::new())),
253 table_functions: Arc::new(RwLock::new(HashMap::new())),
254 extension_rules,
255 plugins,
256 scalar_functions: Arc::new(RwLock::new(HashMap::new())),
257 }
258 }
259
260 fn remove_physical_optimizer_rule(
261 rules: &mut Vec<Arc<dyn PhysicalOptimizerRule + Send + Sync>>,
262 name: &str,
263 ) {
264 rules.retain(|rule| rule.name() != name);
265 }
266
267 pub fn optimize_by_extension_rules(
269 &self,
270 plan: DfLogicalPlan,
271 context: &QueryEngineContext,
272 ) -> DfResult<DfLogicalPlan> {
273 self.extension_rules
274 .iter()
275 .try_fold(plan, |acc_plan, rule| {
276 rule.analyze(acc_plan, context, self.session_state().config_options())
277 })
278 }
279
280 pub fn optimize_logical_plan(&self, plan: DfLogicalPlan) -> DfResult<DfLogicalPlan> {
282 self.session_state().optimize(&plan)
283 }
284
285 pub fn scalar_function(&self, function_name: &str) -> Option<ScalarFunctionFactory> {
287 self.scalar_functions
288 .read()
289 .unwrap()
290 .get(function_name)
291 .cloned()
292 }
293
294 pub fn scalar_names(&self) -> Vec<String> {
296 self.scalar_functions
297 .read()
298 .unwrap()
299 .keys()
300 .cloned()
301 .collect()
302 }
303
304 pub fn aggr_function(&self, function_name: &str) -> Option<AggregateUDF> {
306 self.aggr_functions
307 .read()
308 .unwrap()
309 .get(function_name)
310 .cloned()
311 }
312
313 pub fn aggr_names(&self) -> Vec<String> {
315 self.aggr_functions
316 .read()
317 .unwrap()
318 .keys()
319 .cloned()
320 .collect()
321 }
322
323 pub fn table_function(&self, function_name: &str) -> Option<Arc<TableFunction>> {
325 self.table_functions
326 .read()
327 .unwrap()
328 .get(function_name)
329 .cloned()
330 }
331
332 pub fn table_function_names(&self) -> Vec<String> {
334 self.table_functions
335 .read()
336 .unwrap()
337 .keys()
338 .cloned()
339 .collect()
340 }
341
342 pub fn register_scalar_function(&self, func: ScalarFunctionFactory) {
345 let name = func.name().to_string();
346 let x = self
347 .scalar_functions
348 .write()
349 .unwrap()
350 .insert(name.clone(), func);
351
352 if x.is_some() {
353 warn!("Already registered scalar function '{name}'");
354 }
355 }
356
357 pub fn register_aggr_function(&self, func: AggregateUDF) {
366 let name = func.name().to_string();
367 let x = self
368 .aggr_functions
369 .write()
370 .unwrap()
371 .insert(name.clone(), func);
372 assert!(
373 x.is_none(),
374 "Already registered aggregate function '{name}'"
375 );
376 }
377
378 pub fn register_table_function(&self, func: Arc<TableFunction>) {
379 let name = func.name();
380 let x = self
381 .table_functions
382 .write()
383 .unwrap()
384 .insert(name.to_string(), func.clone());
385
386 if x.is_some() {
387 warn!("Already registered table function '{name}'");
388 }
389 }
390
391 pub fn register_window_function(&self, func: WindowUDF) {
396 self.df_context.register_udwf(func);
397 }
398
399 pub fn catalog_manager(&self) -> &CatalogManagerRef {
400 &self.catalog_manager
401 }
402
403 pub fn dyn_filter_registry_manager(&self) -> Arc<DynFilterRegistryManager> {
404 self.dyn_filter_registry_manager.clone()
405 }
406
407 pub fn acquire_remote_dyn_filter_registry_lease(
408 &self,
409 query_ctx: &QueryContextRef,
410 ) -> Option<RemoteDynFilterRegistryLease> {
411 let query_id = query_ctx.remote_query_id_value()?;
412 Some(
413 self.dyn_filter_registry_manager
414 .clone()
415 .acquire_lease(query_id),
416 )
417 }
418
419 pub fn function_state(&self) -> Arc<FunctionState> {
420 self.function_state.clone()
421 }
422
423 pub fn table_mutation_handler(&self) -> Option<&TableMutationHandlerRef> {
425 self.function_state.table_mutation_handler.as_ref()
426 }
427
428 pub fn procedure_service_handler(&self) -> Option<&ProcedureServiceHandlerRef> {
430 self.function_state.procedure_service_handler.as_ref()
431 }
432
433 pub(crate) fn disallow_cross_catalog_query(&self) -> bool {
434 self.plugins
435 .map::<QueryOptions, _, _>(|x| x.disallow_cross_catalog_query)
436 .unwrap_or(false)
437 }
438
439 pub fn session_state(&self) -> SessionState {
440 self.df_context.state()
441 }
442
443 pub fn read_table(&self, table: TableRef) -> DfResult<DataFrame> {
445 self.df_context
446 .read_table(Arc::new(DfTableProviderAdapter::new(table)))
447 }
448}
449
/// Query planner installed on the session; delegates physical planning to a
/// `DefaultPhysicalPlanner` configured with this crate's extension planners.
struct DfQueryPlanner {
    /// The DataFusion physical planner that performs the actual planning.
    physical_planner: DefaultPhysicalPlanner,
}
453
454impl fmt::Debug for DfQueryPlanner {
455 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
456 f.debug_struct("DfQueryPlanner").finish()
457 }
458}
459
460#[async_trait]
461impl QueryPlanner for DfQueryPlanner {
462 async fn create_physical_plan(
463 &self,
464 logical_plan: &DfLogicalPlan,
465 session_state: &SessionState,
466 ) -> DfResult<Arc<dyn ExecutionPlan>> {
467 self.physical_planner
468 .create_physical_plan(logical_plan, session_state)
469 .await
470 }
471}
472
/// Scalar function aliases as `(target, alias)` pairs.
///
/// `register_function_aliases` looks up each `target` among the session's scalar functions
/// and re-registers it with `alias` added as an alternative name.
const SCALAR_FUNCTION_ALIASES: &[(&str, &str)] = &[
    ("upper", "ucase"),
    ("lower", "lcase"),
    ("ceil", "ceiling"),
    ("substr", "mid"),
    ("random", "rand"),
];
481
/// Aggregate function aliases as `(target, alias)` pairs.
///
/// `register_function_aliases` looks up each `target` among the session's aggregate functions
/// and re-registers it with `alias` added as an alternative name.
const AGGREGATE_FUNCTION_ALIASES: &[(&str, &str)] =
    &[("stddev_pop", "std"), ("var_pop", "variance")];
485
486fn register_function_aliases(ctx: &SessionContext) {
491 let state = ctx.state();
492
493 for (target, alias) in SCALAR_FUNCTION_ALIASES {
494 if let Some(func) = state.scalar_functions().get(*target) {
495 let aliased = func.as_ref().clone().with_aliases([*alias]);
496 ctx.register_udf(aliased);
497 }
498 }
499
500 for (target, alias) in AGGREGATE_FUNCTION_ALIASES {
501 if let Some(func) = state.aggregate_functions().get(*target) {
502 let aliased = func.as_ref().clone().with_aliases([*alias]);
503 ctx.register_udaf(aliased);
504 }
505 }
506}
507
508impl DfQueryPlanner {
509 fn new(
510 catalog_manager: CatalogManagerRef,
511 partition_rule_manager: Option<PartitionRuleManagerRef>,
512 region_query_handler: Option<RegionQueryHandlerRef>,
513 ) -> Self {
514 let mut planners: Vec<Arc<dyn ExtensionPlanner + Send + Sync>> =
515 vec![Arc::new(PromExtensionPlanner), Arc::new(RangeSelectPlanner)];
516 if let (Some(region_query_handler), Some(partition_rule_manager)) =
517 (region_query_handler, partition_rule_manager)
518 {
519 planners.push(Arc::new(DistExtensionPlanner::new(
520 catalog_manager,
521 partition_rule_manager,
522 region_query_handler,
523 )));
524 planners.push(Arc::new(MergeSortExtensionPlanner {}));
525 }
526 Self {
527 physical_planner: DefaultPhysicalPlanner::with_extension_planners(planners),
528 }
529 }
530}
531
/// Memory pool wrapper that forwards every call to an inner consumer-tracking greedy pool
/// and publishes usage / rejection metrics alongside.
#[derive(Debug)]
struct MetricsMemoryPool {
    /// The wrapped pool that does the actual accounting.
    inner: Arc<TrackConsumersPool<GreedyMemoryPool>>,
}
540
541impl MetricsMemoryPool {
542 const TOP_CONSUMERS_TO_REPORT: usize = 5;
544
545 fn new(limit: usize) -> Self {
546 Self {
547 inner: Arc::new(TrackConsumersPool::new(
548 GreedyMemoryPool::new(limit),
549 NonZeroUsize::new(Self::TOP_CONSUMERS_TO_REPORT).unwrap(),
550 )),
551 }
552 }
553
554 #[inline]
555 fn update_metrics(&self) {
556 QUERY_MEMORY_POOL_USAGE_BYTES.set(self.inner.reserved() as i64);
557 }
558}
559
560impl MemoryPool for MetricsMemoryPool {
561 fn register(&self, consumer: &MemoryConsumer) {
562 self.inner.register(consumer);
563 }
564
565 fn unregister(&self, consumer: &MemoryConsumer) {
566 self.inner.unregister(consumer);
567 }
568
569 fn grow(&self, reservation: &MemoryReservation, additional: usize) {
570 self.inner.grow(reservation, additional);
571 self.update_metrics();
572 }
573
574 fn shrink(&self, reservation: &MemoryReservation, shrink: usize) {
575 self.inner.shrink(reservation, shrink);
576 self.update_metrics();
577 }
578
579 fn try_grow(
580 &self,
581 reservation: &MemoryReservation,
582 additional: usize,
583 ) -> datafusion_common::Result<()> {
584 let result = self.inner.try_grow(reservation, additional);
585 if result.is_err() {
586 QUERY_MEMORY_POOL_REJECTED_TOTAL.inc();
587 }
588 self.update_metrics();
589 result
590 }
591
592 fn reserved(&self) -> usize {
593 self.inner.reserved()
594 }
595
596 fn memory_limit(&self) -> MemoryLimit {
597 self.inner.memory_limit()
598 }
599}
600
#[cfg(test)]
mod tests {
    use common_base::Plugins;
    use session::context::QueryContext;

    use super::*;
    use crate::options::QueryOptions;

    /// Builds a minimal state: in-memory catalog, no handlers, no dist planner, default options.
    fn new_query_engine_state() -> QueryEngineState {
        let catalog_manager = catalog::memory::new_memory_catalog_manager().unwrap();
        QueryEngineState::new(
            catalog_manager,
            None,
            None,
            None,
            None,
            None,
            false,
            Plugins::default(),
            QueryOptions::default(),
        )
    }

    /// Two acquisitions with the same query context must share one registry.
    #[test]
    fn query_engine_state_reuses_query_scoped_dyn_filter_registry_lease() {
        let engine_state = new_query_engine_state();
        let ctx = QueryContext::arc();

        let lease_a = engine_state
            .acquire_remote_dyn_filter_registry_lease(&ctx)
            .unwrap();
        let lease_b = engine_state
            .acquire_remote_dyn_filter_registry_lease(&ctx)
            .unwrap();

        assert!(lease_a.ptr_eq(&lease_b));
        assert_eq!(
            engine_state.dyn_filter_registry_manager().registry_count(),
            1
        );
        assert_eq!(
            lease_a.registry().query_id(),
            ctx.remote_query_id_value().unwrap()
        );
    }

    /// A fresh query context carries a remote query id, and the lease is keyed by it.
    #[test]
    fn query_engine_state_relies_on_query_context_remote_query_id_contract() {
        let engine_state = new_query_engine_state();
        let ctx = QueryContext::arc();

        assert!(ctx.remote_query_id_value().is_some());

        let lease = engine_state
            .acquire_remote_dyn_filter_registry_lease(&ctx)
            .unwrap();

        assert_eq!(
            lease.registry().query_id(),
            ctx.remote_query_id_value().unwrap()
        );
        assert_eq!(
            engine_state.dyn_filter_registry_manager().registry_count(),
            1
        );
    }

    /// Distinct query contexts must get distinct registries, each keyed by its own id.
    #[test]
    fn query_engine_state_separates_registries_for_different_query_contexts() {
        let engine_state = new_query_engine_state();
        let ctx_a = QueryContext::arc();
        let ctx_b = QueryContext::arc();

        let lease_a = engine_state
            .acquire_remote_dyn_filter_registry_lease(&ctx_a)
            .unwrap();
        let lease_b = engine_state
            .acquire_remote_dyn_filter_registry_lease(&ctx_b)
            .unwrap();

        assert!(!lease_a.ptr_eq(&lease_b));
        assert_eq!(
            engine_state.dyn_filter_registry_manager().registry_count(),
            2
        );
        assert_eq!(
            lease_a.registry().query_id(),
            ctx_a.remote_query_id_value().unwrap()
        );
        assert_eq!(
            lease_b.registry().query_id(),
            ctx_b.remote_query_id_value().unwrap()
        );
    }
}