Skip to main content

query/query_engine/
state.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::collections::HashMap;
16use std::fmt;
17use std::num::NonZeroUsize;
18use std::sync::{Arc, RwLock};
19
20use async_trait::async_trait;
21use catalog::CatalogManagerRef;
22use common_base::Plugins;
23use common_function::aggrs::aggr_wrapper::fix_order::FixStateUdafOrderingAnalyzer;
24use common_function::function_factory::ScalarFunctionFactory;
25use common_function::function_registry::FUNCTION_REGISTRY;
26use common_function::handlers::{
27    FlowServiceHandlerRef, ProcedureServiceHandlerRef, TableMutationHandlerRef,
28};
29use common_function::state::FunctionState;
30use common_stat::get_total_memory_bytes;
31use common_telemetry::warn;
32use datafusion::catalog::TableFunction;
33use datafusion::dataframe::DataFrame;
34use datafusion::error::Result as DfResult;
35use datafusion::execution::SessionStateBuilder;
36use datafusion::execution::context::{QueryPlanner, SessionConfig, SessionContext, SessionState};
37use datafusion::execution::memory_pool::{
38    GreedyMemoryPool, MemoryConsumer, MemoryLimit, MemoryPool, MemoryReservation,
39    TrackConsumersPool,
40};
41use datafusion::execution::runtime_env::{RuntimeEnv, RuntimeEnvBuilder};
42use datafusion::physical_optimizer::PhysicalOptimizerRule;
43use datafusion::physical_optimizer::optimizer::PhysicalOptimizer;
44use datafusion::physical_optimizer::sanity_checker::SanityCheckPlan;
45use datafusion::physical_plan::ExecutionPlan;
46use datafusion::physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner};
47use datafusion_expr::{AggregateUDF, LogicalPlan as DfLogicalPlan, WindowUDF};
48use datafusion_optimizer::Analyzer;
49use datafusion_optimizer::analyzer::function_rewrite::ApplyFunctionRewrites;
50use datafusion_optimizer::optimizer::Optimizer;
51use partition::manager::PartitionRuleManagerRef;
52use promql::extension_plan::PromExtensionPlanner;
53use table::TableRef;
54use table::table::adapter::DfTableProviderAdapter;
55
56use crate::QueryEngineContext;
57use crate::dist_plan::{
58    DistExtensionPlanner, DistPlannerAnalyzer, DistPlannerOptions, MergeSortExtensionPlanner,
59};
60use crate::metrics::{QUERY_MEMORY_POOL_REJECTED_TOTAL, QUERY_MEMORY_POOL_USAGE_BYTES};
61use crate::optimizer::ExtensionAnalyzerRule;
62use crate::optimizer::const_normalization::ConstNormalizationRule;
63use crate::optimizer::constant_term::MatchesConstantTermOptimizer;
64use crate::optimizer::count_nest_aggr::CountNestAggrRule;
65use crate::optimizer::count_wildcard::CountWildcardToTimeIndexRule;
66use crate::optimizer::json_type_concretize::JsonTypeConcretizeRule;
67use crate::optimizer::parallelize_scan::ParallelizeScan;
68use crate::optimizer::pass_distribution::PassDistribution;
69use crate::optimizer::promql_tsid_narrow_join::PromqlTsidNarrowJoin;
70use crate::optimizer::remove_duplicate::RemoveDuplicate;
71use crate::optimizer::scan_hint::ScanHintRule;
72use crate::optimizer::string_normalization::StringNormalizationRule;
73use crate::optimizer::transcribe_atat::TranscribeAtatRule;
74use crate::optimizer::type_conversion::TypeConversionRule;
75use crate::optimizer::windowed_sort::WindowedSortPhysicalRule;
76use crate::options::QueryOptions as QueryOptionsNew;
77use crate::query_engine::DefaultSerializer;
78use crate::query_engine::options::QueryOptions;
79use crate::range_select::planner::RangeSelectPlanner;
80use crate::region_query::RegionQueryHandlerRef;
81
82/// Query engine global state
83#[derive(Clone)]
84pub struct QueryEngineState {
85    df_context: SessionContext,
86    catalog_manager: CatalogManagerRef,
87    function_state: Arc<FunctionState>,
88    scalar_functions: Arc<RwLock<HashMap<String, ScalarFunctionFactory>>>,
89    aggr_functions: Arc<RwLock<HashMap<String, AggregateUDF>>>,
90    table_functions: Arc<RwLock<HashMap<String, Arc<TableFunction>>>>,
91    extension_rules: Vec<Arc<dyn ExtensionAnalyzerRule + Send + Sync>>,
92    plugins: Plugins,
93}
94
95impl fmt::Debug for QueryEngineState {
96    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
97        f.debug_struct("QueryEngineState")
98            .field("state", &self.df_context.state())
99            .finish()
100    }
101}
102
103impl QueryEngineState {
104    #[allow(clippy::too_many_arguments)]
105    pub fn new(
106        catalog_list: CatalogManagerRef,
107        partition_rule_manager: Option<PartitionRuleManagerRef>,
108        region_query_handler: Option<RegionQueryHandlerRef>,
109        table_mutation_handler: Option<TableMutationHandlerRef>,
110        procedure_service_handler: Option<ProcedureServiceHandlerRef>,
111        flow_service_handler: Option<FlowServiceHandlerRef>,
112        with_dist_planner: bool,
113        plugins: Plugins,
114        options: QueryOptionsNew,
115    ) -> Self {
116        let total_memory = get_total_memory_bytes().max(0) as u64;
117        let memory_pool_size = options.memory_pool_size.resolve(total_memory) as usize;
118        let runtime_env = if memory_pool_size > 0 {
119            Arc::new(
120                RuntimeEnvBuilder::new()
121                    .with_memory_pool(Arc::new(MetricsMemoryPool::new(memory_pool_size)))
122                    .build()
123                    .expect("Failed to build RuntimeEnv"),
124            )
125        } else {
126            Arc::new(RuntimeEnv::default())
127        };
128        let mut session_config = SessionConfig::new().with_create_default_catalog_and_schema(false);
129        if options.parallelism > 0 {
130            session_config = session_config.with_target_partitions(options.parallelism);
131        }
132        if options.allow_query_fallback {
133            session_config
134                .options_mut()
135                .extensions
136                .insert(DistPlannerOptions {
137                    allow_query_fallback: true,
138                });
139        }
140
141        // todo(hl): This serves as a workaround for https://github.com/GreptimeTeam/greptimedb/issues/5659
142        // and we can add that check back once we upgrade datafusion.
143        session_config
144            .options_mut()
145            .execution
146            .skip_physical_aggregate_schema_check = true;
147
148        // Apply extension rules
149        let mut extension_rules = Vec::new();
150
151        // The [`TypeConversionRule`] must be at first
152        extension_rules.insert(0, Arc::new(TypeConversionRule) as _);
153        extension_rules.push(Arc::new(CountNestAggrRule) as _);
154
155        // Apply the datafusion rules
156        let mut analyzer = Analyzer::new();
157        analyzer.rules.insert(0, Arc::new(TranscribeAtatRule));
158        analyzer.rules.insert(0, Arc::new(StringNormalizationRule));
159        analyzer
160            .rules
161            .insert(0, Arc::new(CountWildcardToTimeIndexRule));
162        analyzer.rules.push(Arc::new(ConstNormalizationRule));
163
164        // Add ApplyFunctionRewrites rule,
165        // Note we cannot use `analyzer.add_function_rewrite`
166        // because only rules are copied into session_state
167        analyzer.rules.insert(
168            0,
169            Arc::new(ApplyFunctionRewrites::new(
170                FUNCTION_REGISTRY.function_rewrites(),
171            )),
172        );
173
174        if with_dist_planner {
175            analyzer.rules.push(Arc::new(DistPlannerAnalyzer));
176        }
177        analyzer.rules.push(Arc::new(FixStateUdafOrderingAnalyzer));
178
179        let mut optimizer = Optimizer::new();
180        optimizer.rules.push(Arc::new(ScanHintRule));
181        optimizer.rules.push(Arc::new(JsonTypeConcretizeRule));
182
183        // add physical optimizer
184        let mut physical_optimizer = PhysicalOptimizer::new();
185        // Change TableScan's partition right before enforcing distribution
186        physical_optimizer
187            .rules
188            .insert(5, Arc::new(ParallelizeScan));
189        // Pass distribution requirement to MergeScanExec to avoid unnecessary shuffling
190        physical_optimizer
191            .rules
192            .insert(6, Arc::new(PassDistribution));
193        // Prefer collecting narrow PromQL build sides over repartitioning wide label streams.
194        physical_optimizer
195            .rules
196            .insert(7, Arc::new(PromqlTsidNarrowJoin));
197        // Enforce sorting AFTER custom rules that modify the plan structure
198        physical_optimizer.rules.insert(
199            8,
200            Arc::new(datafusion::physical_optimizer::enforce_sorting::EnforceSorting {}),
201        );
202        // Add rule for windowed sort
203        physical_optimizer
204            .rules
205            .push(Arc::new(WindowedSortPhysicalRule));
206        // explicitly not do filter pushdown for windowed sort&part sort
207        // (notice that `PartSortExec` create another new dyn filter that need to be pushdown if want to use dyn filter optimization)
208        // benchmark shows it can cause performance regression due to useless filtering and extra shuffle.
209        // We can add a rule to do filter pushdown for windowed sort in the future if we find a way to avoid the performance regression.
210        physical_optimizer
211            .rules
212            .push(Arc::new(MatchesConstantTermOptimizer));
213        // Add rule to remove duplicate nodes generated by other rules. Run this in the last.
214        physical_optimizer.rules.push(Arc::new(RemoveDuplicate));
215        // Place SanityCheckPlan at the end of the list to ensure that it runs after all other rules.
216        Self::remove_physical_optimizer_rule(
217            &mut physical_optimizer.rules,
218            SanityCheckPlan {}.name(),
219        );
220        physical_optimizer.rules.push(Arc::new(SanityCheckPlan {}));
221
222        let session_state = SessionStateBuilder::new()
223            .with_config(session_config)
224            .with_runtime_env(runtime_env)
225            .with_default_features()
226            .with_analyzer_rules(analyzer.rules)
227            .with_serializer_registry(Arc::new(DefaultSerializer))
228            .with_query_planner(Arc::new(DfQueryPlanner::new(
229                catalog_list.clone(),
230                partition_rule_manager,
231                region_query_handler,
232            )))
233            .with_optimizer_rules(optimizer.rules)
234            .with_physical_optimizer_rules(physical_optimizer.rules)
235            .build();
236
237        let df_context = SessionContext::new_with_state(session_state);
238        register_function_aliases(&df_context);
239
240        Self {
241            df_context,
242            catalog_manager: catalog_list,
243            function_state: Arc::new(FunctionState {
244                table_mutation_handler,
245                procedure_service_handler,
246                flow_service_handler,
247            }),
248            aggr_functions: Arc::new(RwLock::new(HashMap::new())),
249            table_functions: Arc::new(RwLock::new(HashMap::new())),
250            extension_rules,
251            plugins,
252            scalar_functions: Arc::new(RwLock::new(HashMap::new())),
253        }
254    }
255
256    fn remove_physical_optimizer_rule(
257        rules: &mut Vec<Arc<dyn PhysicalOptimizerRule + Send + Sync>>,
258        name: &str,
259    ) {
260        rules.retain(|rule| rule.name() != name);
261    }
262
263    /// Optimize the logical plan by the extension analyzer rules.
264    pub fn optimize_by_extension_rules(
265        &self,
266        plan: DfLogicalPlan,
267        context: &QueryEngineContext,
268    ) -> DfResult<DfLogicalPlan> {
269        self.extension_rules
270            .iter()
271            .try_fold(plan, |acc_plan, rule| {
272                rule.analyze(acc_plan, context, self.session_state().config_options())
273            })
274    }
275
276    /// Run the full logical plan optimize phase for the given plan.
277    pub fn optimize_logical_plan(&self, plan: DfLogicalPlan) -> DfResult<DfLogicalPlan> {
278        self.session_state().optimize(&plan)
279    }
280
281    /// Retrieve the scalar function by name
282    pub fn scalar_function(&self, function_name: &str) -> Option<ScalarFunctionFactory> {
283        self.scalar_functions
284            .read()
285            .unwrap()
286            .get(function_name)
287            .cloned()
288    }
289
290    /// Retrieve scalar function names.
291    pub fn scalar_names(&self) -> Vec<String> {
292        self.scalar_functions
293            .read()
294            .unwrap()
295            .keys()
296            .cloned()
297            .collect()
298    }
299
300    /// Retrieve the aggregate function by name
301    pub fn aggr_function(&self, function_name: &str) -> Option<AggregateUDF> {
302        self.aggr_functions
303            .read()
304            .unwrap()
305            .get(function_name)
306            .cloned()
307    }
308
309    /// Retrieve aggregate function names.
310    pub fn aggr_names(&self) -> Vec<String> {
311        self.aggr_functions
312            .read()
313            .unwrap()
314            .keys()
315            .cloned()
316            .collect()
317    }
318
319    /// Retrieve table function by name
320    pub fn table_function(&self, function_name: &str) -> Option<Arc<TableFunction>> {
321        self.table_functions
322            .read()
323            .unwrap()
324            .get(function_name)
325            .cloned()
326    }
327
328    /// Retrieve table function names.
329    pub fn table_function_names(&self) -> Vec<String> {
330        self.table_functions
331            .read()
332            .unwrap()
333            .keys()
334            .cloned()
335            .collect()
336    }
337
338    /// Register an scalar function.
339    /// Will override if the function with same name is already registered.
340    pub fn register_scalar_function(&self, func: ScalarFunctionFactory) {
341        let name = func.name().to_string();
342        let x = self
343            .scalar_functions
344            .write()
345            .unwrap()
346            .insert(name.clone(), func);
347
348        if x.is_some() {
349            warn!("Already registered scalar function '{name}'");
350        }
351    }
352
353    /// Register an aggregate function.
354    ///
355    /// # Panics
356    /// Will panic if the function with same name is already registered.
357    ///
358    /// Panicking consideration: currently the aggregated functions are all statically registered,
359    /// user cannot define their own aggregate functions on the fly. So we can panic here. If that
360    /// invariant is broken in the future, we should return an error instead of panicking.
361    pub fn register_aggr_function(&self, func: AggregateUDF) {
362        let name = func.name().to_string();
363        let x = self
364            .aggr_functions
365            .write()
366            .unwrap()
367            .insert(name.clone(), func);
368        assert!(
369            x.is_none(),
370            "Already registered aggregate function '{name}'"
371        );
372    }
373
374    pub fn register_table_function(&self, func: Arc<TableFunction>) {
375        let name = func.name();
376        let x = self
377            .table_functions
378            .write()
379            .unwrap()
380            .insert(name.to_string(), func.clone());
381
382        if x.is_some() {
383            warn!("Already registered table function '{name}'");
384        }
385    }
386
387    /// Register a window function (UDWF) directly on the DataFusion SessionContext.
388    ///
389    /// This makes the function visible via `session_state.window_functions()`,
390    /// which is used by `DfContextProviderAdapter::get_window_meta`.
391    pub fn register_window_function(&self, func: WindowUDF) {
392        self.df_context.register_udwf(func);
393    }
394
395    pub fn catalog_manager(&self) -> &CatalogManagerRef {
396        &self.catalog_manager
397    }
398
399    pub fn function_state(&self) -> Arc<FunctionState> {
400        self.function_state.clone()
401    }
402
403    /// Returns the [`TableMutationHandlerRef`] in state.
404    pub fn table_mutation_handler(&self) -> Option<&TableMutationHandlerRef> {
405        self.function_state.table_mutation_handler.as_ref()
406    }
407
408    /// Returns the [`ProcedureServiceHandlerRef`] in state.
409    pub fn procedure_service_handler(&self) -> Option<&ProcedureServiceHandlerRef> {
410        self.function_state.procedure_service_handler.as_ref()
411    }
412
413    pub(crate) fn disallow_cross_catalog_query(&self) -> bool {
414        self.plugins
415            .map::<QueryOptions, _, _>(|x| x.disallow_cross_catalog_query)
416            .unwrap_or(false)
417    }
418
419    pub fn session_state(&self) -> SessionState {
420        self.df_context.state()
421    }
422
423    /// Create a DataFrame for a table
424    pub fn read_table(&self, table: TableRef) -> DfResult<DataFrame> {
425        self.df_context
426            .read_table(Arc::new(DfTableProviderAdapter::new(table)))
427    }
428}
429
430struct DfQueryPlanner {
431    physical_planner: DefaultPhysicalPlanner,
432}
433
434impl fmt::Debug for DfQueryPlanner {
435    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
436        f.debug_struct("DfQueryPlanner").finish()
437    }
438}
439
440#[async_trait]
441impl QueryPlanner for DfQueryPlanner {
442    async fn create_physical_plan(
443        &self,
444        logical_plan: &DfLogicalPlan,
445        session_state: &SessionState,
446    ) -> DfResult<Arc<dyn ExecutionPlan>> {
447        self.physical_planner
448            .create_physical_plan(logical_plan, session_state)
449            .await
450    }
451}
452
/// Scalar function aliases for MySQL compatibility.
///
/// Each entry is `(target_name, alias)`: `alias` becomes an additional name
/// for the already-registered function `target_name`.
const SCALAR_FUNCTION_ALIASES: &[(&str, &str)] = &[
    ("upper", "ucase"),
    ("lower", "lcase"),
    ("ceil", "ceiling"),
    ("substr", "mid"),
    ("random", "rand"),
];

/// Aggregate function aliases for MySQL compatibility.
///
/// Each entry is `(target_name, alias)`, with the same meaning as in the
/// scalar table.
const AGGREGATE_FUNCTION_ALIASES: &[(&str, &str)] = &[
    ("stddev_pop", "std"),
    ("var_pop", "variance"),
];
465
466/// Register function aliases.
467///
468/// This function adds aliases like `ucase` -> `upper`, `lcase` -> `lower`, etc.
469/// to make GreptimeDB more compatible with MySQL syntax.
470fn register_function_aliases(ctx: &SessionContext) {
471    let state = ctx.state();
472
473    for (target, alias) in SCALAR_FUNCTION_ALIASES {
474        if let Some(func) = state.scalar_functions().get(*target) {
475            let aliased = func.as_ref().clone().with_aliases([*alias]);
476            ctx.register_udf(aliased);
477        }
478    }
479
480    for (target, alias) in AGGREGATE_FUNCTION_ALIASES {
481        if let Some(func) = state.aggregate_functions().get(*target) {
482            let aliased = func.as_ref().clone().with_aliases([*alias]);
483            ctx.register_udaf(aliased);
484        }
485    }
486}
487
488impl DfQueryPlanner {
489    fn new(
490        catalog_manager: CatalogManagerRef,
491        partition_rule_manager: Option<PartitionRuleManagerRef>,
492        region_query_handler: Option<RegionQueryHandlerRef>,
493    ) -> Self {
494        let mut planners: Vec<Arc<dyn ExtensionPlanner + Send + Sync>> =
495            vec![Arc::new(PromExtensionPlanner), Arc::new(RangeSelectPlanner)];
496        if let (Some(region_query_handler), Some(partition_rule_manager)) =
497            (region_query_handler, partition_rule_manager)
498        {
499            planners.push(Arc::new(DistExtensionPlanner::new(
500                catalog_manager,
501                partition_rule_manager,
502                region_query_handler,
503            )));
504            planners.push(Arc::new(MergeSortExtensionPlanner {}));
505        }
506        Self {
507            physical_planner: DefaultPhysicalPlanner::with_extension_planners(planners),
508        }
509    }
510}
511
512/// A wrapper around TrackConsumersPool that records metrics.
513///
514/// This wrapper intercepts all memory pool operations and updates
515/// Prometheus metrics for monitoring query memory usage and rejections.
516#[derive(Debug)]
517struct MetricsMemoryPool {
518    inner: Arc<TrackConsumersPool<GreedyMemoryPool>>,
519}
520
521impl MetricsMemoryPool {
522    // Number of top memory consumers to report in OOM error messages
523    const TOP_CONSUMERS_TO_REPORT: usize = 5;
524
525    fn new(limit: usize) -> Self {
526        Self {
527            inner: Arc::new(TrackConsumersPool::new(
528                GreedyMemoryPool::new(limit),
529                NonZeroUsize::new(Self::TOP_CONSUMERS_TO_REPORT).unwrap(),
530            )),
531        }
532    }
533
534    #[inline]
535    fn update_metrics(&self) {
536        QUERY_MEMORY_POOL_USAGE_BYTES.set(self.inner.reserved() as i64);
537    }
538}
539
540impl MemoryPool for MetricsMemoryPool {
541    fn register(&self, consumer: &MemoryConsumer) {
542        self.inner.register(consumer);
543    }
544
545    fn unregister(&self, consumer: &MemoryConsumer) {
546        self.inner.unregister(consumer);
547    }
548
549    fn grow(&self, reservation: &MemoryReservation, additional: usize) {
550        self.inner.grow(reservation, additional);
551        self.update_metrics();
552    }
553
554    fn shrink(&self, reservation: &MemoryReservation, shrink: usize) {
555        self.inner.shrink(reservation, shrink);
556        self.update_metrics();
557    }
558
559    fn try_grow(
560        &self,
561        reservation: &MemoryReservation,
562        additional: usize,
563    ) -> datafusion_common::Result<()> {
564        let result = self.inner.try_grow(reservation, additional);
565        if result.is_err() {
566            QUERY_MEMORY_POOL_REJECTED_TOTAL.inc();
567        }
568        self.update_metrics();
569        result
570    }
571
572    fn reserved(&self) -> usize {
573        self.inner.reserved()
574    }
575
576    fn memory_limit(&self) -> MemoryLimit {
577        self.inner.memory_limit()
578    }
579}